From 0ec10f571c93cd4bd705288f151a5013cfdbccdf Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 12 Aug 2016 13:50:22 +0530 Subject: [PATCH] Add simple hipblas saxpy sample Change-Id: I67ae83e1e5397d5191a3c644aba068f06ff97830 [ROCm/hip commit: d192976b002940f1411282c0642a694017722be1] --- .../samples/2_Advanced/hipblas_saxpy/Makefile | 34 +++++++ .../2_Advanced/hipblas_saxpy/saxpy.cublas.cpp | 94 +++++++++++++++++++ .../hipblas_saxpy/saxpy.hipblasref.cpp | 94 +++++++++++++++++++ 3 files changed, 222 insertions(+) create mode 100644 projects/hip/samples/2_Advanced/hipblas_saxpy/Makefile create mode 100644 projects/hip/samples/2_Advanced/hipblas_saxpy/saxpy.cublas.cpp create mode 100644 projects/hip/samples/2_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp diff --git a/projects/hip/samples/2_Advanced/hipblas_saxpy/Makefile b/projects/hip/samples/2_Advanced/hipblas_saxpy/Makefile new file mode 100644 index 0000000000..ed88be2dd0 --- /dev/null +++ b/projects/hip/samples/2_Advanced/hipblas_saxpy/Makefile @@ -0,0 +1,34 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif +HIPCC=$(HIP_PATH)/bin/hipcc + +HIPCC_FLAGS += -std=c++11 +HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform) +ifeq (${HIP_PLATFORM}, nvcc) + LIBS = -lcublas +endif +ifeq (${HIP_PLATFORM}, hcc) + HCBLAS_ROOT?= $(wildcard /opt/rocm/hcblas) + HIPCC_FLAGS += -stdlib=libc++ -I$(HCBLAS_ROOT)/include + LIBS = -L$(HCBLAS_ROOT)/lib -lhcblas +endif + + +all: saxpy.hipblas.out + +saxpy.cublas.out : saxpy.cublas.cpp + nvcc -std=c++11 -I$(CUDA_HOME)/include saxpy.cublas.cpp -o $@ -L$(CUDA_HOME)/lib64 -lcublas + +# $HIPBLAS_ROOT/bin/hipifyblas ./saxpy.cublas.cpp > ./saxpy.hipblas.cpp +# Then review & finish port in saxpy.hipblas.cpp + +saxpy.hipblasref.o: saxpy.hipblasref.cpp + $(HIPCC) $(HIPCC_FLAGS) -c $< -o $@ + +saxpy.hipblas.out: saxpy.hipblasref.o + $(HIPCC) $< -o $@ $(LIBS) + +clean: + rm -f *.o *.out diff --git a/projects/hip/samples/2_Advanced/hipblas_saxpy/saxpy.cublas.cpp b/projects/hip/samples/2_Advanced/hipblas_saxpy/saxpy.cublas.cpp new file mode 100644 index 0000000000..03a38f3fb1 --- /dev/null +++ b/projects/hip/samples/2_Advanced/hipblas_saxpy/saxpy.cublas.cpp @@ -0,0 +1,94 @@ + +#include +#include +#include +#include + +// header file for the GPU API +#include +#include + +#define N (1024 * 500) + +#define CHECK(cmd) \ +{\ + cudaError_t error = cmd; \ + if (error != cudaSuccess) { \ + fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \ + exit(EXIT_FAILURE);\ + }\ +} + +#define CHECK_BLAS(cmd) \ +{\ + cublasStatus_t error = cmd;\ + if (error != CUBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, "error: (%d) at %s:%d\n", error,__FILE__, __LINE__); \ + exit(EXIT_FAILURE);\ + }\ +} + +int main() { + + const float a = 100.0f; + float x[N]; + float y[N], y_cpu_res[N], y_gpu_res[N]; + + // initialize the input data + std::default_random_engine random_gen; + std::uniform_real_distribution distribution(-N, N); + std::generate_n(x, N, [&]() { return distribution(random_gen); }); + std::generate_n(y, N, [&]() { return distribution(random_gen); }); + std::copy_n(y, N, y_cpu_res); + + // Explicit GPU code: + + size_t Nbytes = N*sizeof(float); + float *x_gpu, *y_gpu; + + cublasHandle_t handle; + + cudaDeviceProp props; + CHECK(cudaGetDeviceProperties(&props, 0/*deviceID*/)); + printf ("info: running on device %s\n", props.name); + + printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); + printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); + CHECK(cudaMalloc(&x_gpu, Nbytes)); + CHECK(cudaMalloc(&y_gpu, Nbytes)); + + // Initialize the blas library + CHECK_BLAS ( cublasCreate(&handle)); + + // copy n elements from a vector in host memory space to a vector in GPU memory space + printf ("info: copy Host2Device\n"); + CHECK_BLAS ( cublasSetVector(N, sizeof(*x), x, 1, x_gpu, 1)); + CHECK_BLAS ( cublasSetVector(N, sizeof(*y), y, 1, y_gpu, 1)); + + printf ("info: launch 'saxpy' kernel\n"); + CHECK_BLAS ( cublasSaxpy(handle, N, &a, x_gpu, 1, y_gpu, 1)); + + cudaDeviceSynchronize(); + + printf ("info: copy Device2Host\n"); + CHECK_BLAS ( cublasGetVector(N, sizeof(*y_gpu_res), y_gpu, 1, y_gpu_res, 1)); + + // CPU implementation of saxpy + for (int i = 0; i < N; i++) { + y_cpu_res[i] = a * x[i] + y[i]; + } + + // verify the results + int errors = 0; + for (int i = 0; i < N; i++) { + if (fabs(y_cpu_res[i] - y_gpu_res[i]) > fabs(y_cpu_res[i] * 0.0001f)) + errors++; + } + std::cout << errors << " errors" << std::endl; + + CHECK( cudaFree(x_gpu)); + CHECK( cudaFree(y_gpu)); + CHECK_BLAS( cublasDestroy(handle)); + + return errors; +} diff --git a/projects/hip/samples/2_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp b/projects/hip/samples/2_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp new file mode 100644 index 0000000000..3f20c8a7cc --- /dev/null +++ b/projects/hip/samples/2_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp @@ -0,0 +1,94 @@ + +#include +#include +#include +#include + +// header file for the GPU API +#include +#include + +#define N (1024 * 500) + +#define CHECK(cmd) \ +{\ + hipError_t error = cmd; \ + if (error != hipSuccess) { \ + fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \ + exit(EXIT_FAILURE);\ + }\ +} + +#define CHECK_BLAS(cmd) \ +{\ + hipblasStatus_t error = cmd;\ + if (error != HIPBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, "error: (%d) at %s:%d\n", error,__FILE__, __LINE__); \ + exit(EXIT_FAILURE);\ + }\ +} + +int main() { + + const float a = 100.0f; + float x[N]; + float y[N], y_cpu_res[N], y_gpu_res[N]; + + // initialize the input data + std::default_random_engine random_gen; + std::uniform_real_distribution distribution(-N, N); + std::generate_n(x, N, [&]() { return distribution(random_gen); }); + std::generate_n(y, N, [&]() { return distribution(random_gen); }); + std::copy_n(y, N, y_cpu_res); + + // Explicit GPU code: + + size_t Nbytes = N*sizeof(float); + float *x_gpu, *y_gpu; + + hipblasHandle_t handle; + + hipDeviceProp_t props; + CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/)); + printf ("info: running on device %s\n", props.name); + + printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); + printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); + CHECK(hipMalloc(&x_gpu, Nbytes)); + CHECK(hipMalloc(&y_gpu, Nbytes)); + + // Initialize the blas library + CHECK_BLAS ( hipblasCreate(&handle)); + + // copy n elements from a vector in host memory space to a vector in GPU memory space + printf ("info: copy Host2Device\n"); + CHECK_BLAS ( hipblasSetVector(N, sizeof(*x), x, 1, x_gpu, 1)); + CHECK_BLAS ( hipblasSetVector(N, sizeof(*y), y, 1, y_gpu, 1)); + + printf ("info: launch 'saxpy' kernel\n"); + CHECK_BLAS ( hipblasSaxpy(handle, N, &a, x_gpu, 1, y_gpu, 1)); + + hipDeviceSynchronize(); + + printf ("info: copy Device2Host\n"); + CHECK_BLAS ( hipblasGetVector(N, sizeof(*y_gpu_res), y_gpu, 1, y_gpu_res, 1)); + + // CPU implementation of saxpy + for (int i = 0; i < N; i++) { + y_cpu_res[i] = a * x[i] + y[i]; + } + + // verify the results + int errors = 0; + for (int i = 0; i < N; i++) { + if (fabs(y_cpu_res[i] - y_gpu_res[i]) > fabs(y_cpu_res[i] * 0.0001f)) + errors++; + } + std::cout << errors << " errors" << std::endl; + + CHECK( hipFree(x_gpu)); + CHECK( hipFree(y_gpu)); + CHECK_BLAS( hipblasDestroy(handle)); + + return errors; +}