Add simple hipblas saxpy sample

Change-Id: I67ae83e1e5397d5191a3c644aba068f06ff97830 [ROCm/hip commit: d192976b00]
2016-08-12 13:50:22 +05:30
@@ -0,0 +1,34 @@
+HIP_PATH?= $(wildcard /opt/rocm/hip)
+ifeq (,$(HIP_PATH))
+	HIP_PATH=../../..
+endif
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+HIPCC_FLAGS += -std=c++11
+HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform)
+ifeq (${HIP_PLATFORM}, nvcc)
+	LIBS = -lcublas
+endif
+ifeq (${HIP_PLATFORM}, hcc)
+	HCBLAS_ROOT?= $(wildcard /opt/rocm/hcblas)
+	HIPCC_FLAGS += -stdlib=libc++ -I$(HCBLAS_ROOT)/include
+	LIBS = -L$(HCBLAS_ROOT)/lib -lhcblas
+endif
+
+
+all: saxpy.hipblas.out
+
+saxpy.cublas.out : saxpy.cublas.cpp
+	nvcc -std=c++11 -I$(CUDA_HOME)/include saxpy.cublas.cpp -o $@ -L$(CUDA_HOME)/lib64 -lcublas
+
+# $HIPBLAS_ROOT/bin/hipifyblas ./saxpy.cublas.cpp > ./saxpy.hipblas.cpp
+# Then review & finish port in saxpy.hipblas.cpp
+
+saxpy.hipblasref.o: saxpy.hipblasref.cpp
+	$(HIPCC) $(HIPCC_FLAGS) -c $< -o $@
+
+saxpy.hipblas.out: saxpy.hipblasref.o
+	$(HIPCC) $< -o $@ $(LIBS)
+
+clean:
+	rm -f *.o *.out	
@@ -0,0 +1,94 @@
+
+#include <random>
+#include <algorithm>
+#include <iostream>
+#include <cmath>
+
+// header file for the GPU API
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+
+#define N  (1024 * 500)
+
+#define CHECK(cmd) \
+{\
+    cudaError_t error  = cmd;   \
+    if (error != cudaSuccess) { \
+      fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \
+      exit(EXIT_FAILURE);\
+    }\
+}
+
+#define CHECK_BLAS(cmd)			\
+{\
+    cublasStatus_t error  = cmd;\
+    if (error != CUBLAS_STATUS_SUCCESS) { \
+      fprintf(stderr, "error: (%d) at %s:%d\n", error,__FILE__, __LINE__); \
+      exit(EXIT_FAILURE);\
+    }\
+}
+
+int main() {
+
+  const float a = 100.0f;
+  float x[N];
+  float y[N], y_cpu_res[N], y_gpu_res[N];
+
+  // initialize the input data
+  std::default_random_engine random_gen;
+  std::uniform_real_distribution<float> distribution(-N, N);
+  std::generate_n(x, N, [&]() { return distribution(random_gen); });
+  std::generate_n(y, N, [&]() { return distribution(random_gen); });
+  std::copy_n(y, N, y_cpu_res);
+
+  // Explicit GPU code:  
+
+  size_t Nbytes = N*sizeof(float);
+  float *x_gpu, *y_gpu;
+
+  cublasHandle_t handle;
+
+  cudaDeviceProp props;
+  CHECK(cudaGetDeviceProperties(&props, 0/*deviceID*/));
+  printf ("info: running on device %s\n", props.name);
+
+  printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+  printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+  CHECK(cudaMalloc(&x_gpu, Nbytes));
+  CHECK(cudaMalloc(&y_gpu, Nbytes));
+
+  // Initialize the blas library
+  CHECK_BLAS ( cublasCreate(&handle));
+
+  // copy n elements from a vector in host memory space to a vector in GPU memory space
+  printf ("info: copy Host2Device\n");
+  CHECK_BLAS ( cublasSetVector(N, sizeof(*x), x, 1, x_gpu, 1));
+  CHECK_BLAS ( cublasSetVector(N, sizeof(*y), y, 1, y_gpu, 1));
+
+  printf ("info: launch 'saxpy' kernel\n");
+  CHECK_BLAS ( cublasSaxpy(handle, N, &a, x_gpu, 1, y_gpu, 1));  
+
+  cudaDeviceSynchronize();
+
+  printf ("info: copy Device2Host\n");
+  CHECK_BLAS ( cublasGetVector(N, sizeof(*y_gpu_res), y_gpu, 1, y_gpu_res, 1)); 
+
+  // CPU implementation of saxpy
+  for (int i = 0; i < N; i++) {
+    y_cpu_res[i] = a * x[i] + y[i];
+  }
+
+  // verify the results
+  int errors = 0;
+  for (int i = 0; i < N; i++) {
+    if (fabs(y_cpu_res[i] - y_gpu_res[i]) > fabs(y_cpu_res[i] * 0.0001f))
+      errors++;
+  }
+  std::cout << errors << " errors" << std::endl;
+
+  CHECK( cudaFree(x_gpu));
+  CHECK( cudaFree(y_gpu));
+  CHECK_BLAS( cublasDestroy(handle));
+
+  return errors;
+}
@@ -0,0 +1,94 @@
+
+#include <random>
+#include <algorithm>
+#include <iostream>
+#include <cmath>
+
+// header file for the GPU API
+#include <hip_runtime.h>
+#include <hipblas.h>
+
+#define N  (1024 * 500)
+
+#define CHECK(cmd) \
+{\
+    hipError_t error  = cmd;   \
+    if (error != hipSuccess) { \
+      fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \
+      exit(EXIT_FAILURE);\
+    }\
+}
+
+#define CHECK_BLAS(cmd)			\
+{\
+    hipblasStatus_t error  = cmd;\
+    if (error != HIPBLAS_STATUS_SUCCESS) { \
+      fprintf(stderr, "error: (%d) at %s:%d\n", error,__FILE__, __LINE__); \
+      exit(EXIT_FAILURE);\
+    }\
+}
+
+int main() {
+
+  const float a = 100.0f;
+  float x[N];
+  float y[N], y_cpu_res[N], y_gpu_res[N];
+
+  // initialize the input data
+  std::default_random_engine random_gen;
+  std::uniform_real_distribution<float> distribution(-N, N);
+  std::generate_n(x, N, [&]() { return distribution(random_gen); });
+  std::generate_n(y, N, [&]() { return distribution(random_gen); });
+  std::copy_n(y, N, y_cpu_res);
+
+  // Explicit GPU code:  
+
+  size_t Nbytes = N*sizeof(float);
+  float *x_gpu, *y_gpu;
+
+  hipblasHandle_t handle;
+
+  hipDeviceProp_t props;
+  CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/));
+  printf ("info: running on device %s\n", props.name);
+
+  printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+  printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+  CHECK(hipMalloc(&x_gpu, Nbytes));
+  CHECK(hipMalloc(&y_gpu, Nbytes));
+
+  // Initialize the blas library
+  CHECK_BLAS ( hipblasCreate(&handle));
+
+  // copy n elements from a vector in host memory space to a vector in GPU memory space
+  printf ("info: copy Host2Device\n");
+  CHECK_BLAS ( hipblasSetVector(N, sizeof(*x), x, 1, x_gpu, 1));
+  CHECK_BLAS ( hipblasSetVector(N, sizeof(*y), y, 1, y_gpu, 1));
+
+  printf ("info: launch 'saxpy' kernel\n");
+  CHECK_BLAS ( hipblasSaxpy(handle, N, &a, x_gpu, 1, y_gpu, 1));  
+
+  hipDeviceSynchronize();
+
+  printf ("info: copy Device2Host\n");
+  CHECK_BLAS ( hipblasGetVector(N, sizeof(*y_gpu_res), y_gpu, 1, y_gpu_res, 1)); 
+
+  // CPU implementation of saxpy
+  for (int i = 0; i < N; i++) {
+    y_cpu_res[i] = a * x[i] + y[i];
+  }
+
+  // verify the results
+  int errors = 0;
+  for (int i = 0; i < N; i++) {
+    if (fabs(y_cpu_res[i] - y_gpu_res[i]) > fabs(y_cpu_res[i] * 0.0001f))
+      errors++;
+  }
+  std::cout << errors << " errors" << std::endl;
+
+  CHECK( hipFree(x_gpu));
+  CHECK( hipFree(y_gpu));
+  CHECK_BLAS( hipblasDestroy(handle));
+
+  return errors;
+}