[HIPIFY][SPARSE] Add 3 more CUDA 10.0 tests

+ lit update + fix typos [ROCm/clr commit: a421aff96d]
2018-12-04 19:24:29 +03:00
@@ -576,7 +576,7 @@
 |`cusparseXcoosortByColumn`                                 |`hipsparseXcoosortByColumn`                      |
 |`cusparseXcsrsort_bufferSizeExt`                           |`hipsparseXcsrsort_bufferSizeExt`                |
 |`cusparseXcsrsort`                                         |`hipsparseXcsrsort`                              |
-|`cusparseScusparseXcscsort_bufferSizeExtnnz`               |                                                 |
+|`cusparseXcscsort_bufferSizeExt`                           |                                                 |
 |`cusparseXcscsort`                                         |                                                 |
 |`cusparseCreateCsru2csrInfo`                               |                                                 |
 |`cusparseDestroyCsru2csrInfo`                              |                                                 |
@@ -554,7 +554,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_SPARSE_FUNCTION_MAP{
  {"cusparseXcsrsort_bufferSizeExt",              {"hipsparseXcsrsort_bufferSizeExt",              CONV_LIB_FUNC, API_SPARSE}},
  {"cusparseXcsrsort",                            {"hipsparseXcsrsort",                            CONV_LIB_FUNC, API_SPARSE}},

-  {"cusparseScusparseXcscsort_bufferSizeExtnnz",  {"hipsparseXcscsort_bufferSizeExt",              CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
+  {"cusparseXcscsort_bufferSizeExt",              {"hipsparseXcscsort_bufferSizeExt",              CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
  {"cusparseXcscsort",                            {"hipsparseXcscsort",                            CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},

  {"cusparseCreateCsru2csrInfo",                  {"hipsparseCreateCsru2csrInfo",                  CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
@@ -277,8 +277,8 @@ int main(int argc, char*argv[])
  assert(CUBLAS_STATUS_SUCCESS == cublasStat);
  /* step 4: prepare workspace */
  // NOTE: CUDA 10.0
-  // CHECK: status = hipsparseSgtsvInterleavedBatch_bufferSizeExt(
-    status = cusparseSgtsvInterleavedBatch_bufferSizeExt(
+  // TODO: status = hipsparseSgtsvInterleavedBatch_bufferSizeExt(
+  status = cusparseSgtsvInterleavedBatch_bufferSizeExt(
    cusparseH,
    algo,
    n,
@@ -299,7 +299,7 @@ int main(int argc, char*argv[])

  /* step 5: solve Aj*xj = bj */
  // NOTE: CUDA 10.0
-  // CHECK: status = hipsparseSgtsvInterleavedBatch(
+  // TODO: status = hipsparseSgtsvInterleavedBatch(
   status = cusparseSgtsvInterleavedBatch(
    cusparseH,
    algo,
@@ -0,0 +1,414 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+// CHECK: #include <hip/hip_runtime.h>
+#include <cuda_runtime.h>
+// CHECK: #include <hipsparse.h>
+#include <cusparse.h>
+// CHECK: #include <hipblas.h>
+#include <cublas_v2.h>
+
+// NOTE: CUDA 10.0
+
+/*
+ * compute | b - A*x|_inf
+ */
+void residaul_eval(
+  int n,
+  const float *dl,
+  const float *d,
+  const float *du,
+  const float *b,
+  const float *x,
+  float *r_nrminf_ptr)
+{
+  float r_nrminf = 0;
+  for (int i = 0; i < n; i++) {
+    float dot = 0;
+    if (i > 0) {
+      dot += dl[i] * x[i - 1];
+    }
+    dot += d[i] * x[i];
+    if (i < (n - 1)) {
+      dot += du[i] * x[i + 1];
+    }
+    float ri = b[i] - dot;
+    r_nrminf = (r_nrminf > fabs(ri)) ? r_nrminf : fabs(ri);
+  }
+
+  *r_nrminf_ptr = r_nrminf;
+}
+
+int main(int argc, char*argv[])
+{
+  // CHECK: hipsparseHandle_t cusparseH = NULL;
+  cusparseHandle_t cusparseH = NULL;
+  //  CHECK: hipblasHandle_t cublasH = NULL;
+  cublasHandle_t cublasH = NULL;
+  // CHECK: hipStream_t stream = NULL;
+  cudaStream_t stream = NULL;
+  // CHECK: hipsparseStatus_t status = HIPSPARSE_STATUS_SUCCESS;
+  cusparseStatus_t status = CUSPARSE_STATUS_SUCCESS;
+  // CHECK: hipblasStatus_t cublasStat = HIPBLAS_STATUS_SUCCESS;
+  cublasStatus_t cublasStat = CUBLAS_STATUS_SUCCESS;
+  // CHECK: hipError_t cudaStat1 = hipSuccess;
+  cudaError_t cudaStat1 = cudaSuccess;
+
+  const int n = 3;
+  const int batchSize = 2;
+  /*
+   *      |    1     6     0  |       | 1 |       | -0.603960 |
+   *  A1 =|    4     2     7  |, b1 = | 2 |, x1 = |  0.267327 |
+   *      |    0     5     3  |       | 3 |       |  0.554455 |
+   *
+   *      |    8    13     0  |       | 4 |       | -0.063291 |
+   *  A2 =|   11     9    14  |, b2 = | 5 |, x2 = |  0.346641 |
+   *      |    0    12    10  |       | 6 |       |  0.184031 |
+   */
+
+   /*
+    * A = (dl, d, du), B and X are in aggregate format
+    */
+  const float dl[n * batchSize] = { 0, 4, 5,  0, 11, 12 };
+  const float  d[n * batchSize] = { 1, 2, 3,  8,  9, 10 };
+  const float du[n * batchSize] = { 6, 7, 0, 13, 14,  0 };
+  const float  B[n * batchSize] = { 1, 2, 3,  4,  5,  6 };
+  float X[n * batchSize]; /* Xj = Aj \ Bj */
+
+/* device memory
+ * (d_dl0, d_d0, d_du0) is aggregate format
+ * (d_dl, d_d, d_du) is interleaved format
+ */
+  float *d_dl0 = NULL;
+  float *d_d0 = NULL;
+  float *d_du0 = NULL;
+  float *d_dl = NULL;
+  float *d_d = NULL;
+  float *d_du = NULL;
+  float *d_B = NULL;
+  float *d_X = NULL;
+
+  size_t lworkInBytes = 0;
+  char *d_work = NULL;
+
+  /*
+   * algo = 0: cuThomas (unstable)
+   * algo = 1: LU with pivoting (stable)
+   * algo = 2: QR (stable)
+   */
+  const int algo = 2;
+
+  const float h_one = 1;
+  const float h_zero = 0;
+
+  printf("example of gtsv (interleaved format) \n");
+  printf("choose algo = 0,1,2 to select different algorithms \n");
+  printf("n = %d, batchSize = %d, algo = %d \n", n, batchSize, algo);
+
+  /* step 1: create cusparse/cublas handle, bind a stream */
+  // CHECK: cudaStat1 = hipStreamCreateWithFlags(&stream, hipStreamNonBlocking);
+  cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: status = hipsparseCreate(&cusparseH);
+  status = cusparseCreate(&cusparseH);
+  // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
+  assert(CUSPARSE_STATUS_SUCCESS == status);
+  // CHECK: status = hipsparseSetStream(cusparseH, stream);
+  status = cusparseSetStream(cusparseH, stream);
+  // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
+  assert(CUSPARSE_STATUS_SUCCESS == status);
+  // CHECK: cublasStat = hipblasCreate(&cublasH);
+  cublasStat = cublasCreate(&cublasH);
+  // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
+  assert(CUBLAS_STATUS_SUCCESS == cublasStat);
+  // CHECK: cublasStat = hipblasSetStream(cublasH, stream);
+  cublasStat = cublasSetStream(cublasH, stream);
+  // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
+  assert(CUBLAS_STATUS_SUCCESS == cublasStat);
+
+  /* step 2: allocate device memory */
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_dl0, sizeof(float)*n*batchSize);
+  cudaStat1 = cudaMalloc((void**)&d_dl0, sizeof(float)*n*batchSize);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_d0, sizeof(float)*n*batchSize);
+  cudaStat1 = cudaMalloc((void**)&d_d0, sizeof(float)*n*batchSize);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_du0, sizeof(float)*n*batchSize);
+  cudaStat1 = cudaMalloc((void**)&d_du0, sizeof(float)*n*batchSize);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_dl, sizeof(float)*n*batchSize);
+  cudaStat1 = cudaMalloc((void**)&d_dl, sizeof(float)*n*batchSize);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_d, sizeof(float)*n*batchSize);
+  cudaStat1 = cudaMalloc((void**)&d_d, sizeof(float)*n*batchSize);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_du, sizeof(float)*n*batchSize);
+  cudaStat1 = cudaMalloc((void**)&d_du, sizeof(float)*n*batchSize);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_B, sizeof(float)*n*batchSize);
+  cudaStat1 = cudaMalloc((void**)&d_B, sizeof(float)*n*batchSize);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_X, sizeof(float)*n*batchSize);
+  cudaStat1 = cudaMalloc((void**)&d_X, sizeof(float)*n*batchSize);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+
+  /* step 3: prepare data in device, interleaved format */
+  // CHECK: cudaStat1 = hipMemcpy(d_dl0, dl, sizeof(float)*n*batchSize, hipMemcpyHostToDevice);
+  cudaStat1 = cudaMemcpy(d_dl0, dl, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMemcpy(d_d0, d, sizeof(float)*n*batchSize, hipMemcpyHostToDevice);
+  cudaStat1 = cudaMemcpy(d_d0, d, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMemcpy(d_du0, du, sizeof(float)*n*batchSize, hipMemcpyHostToDevice);
+  cudaStat1 = cudaMemcpy(d_du0, du, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMemcpy(d_B, B, sizeof(float)*n*batchSize, hipMemcpyHostToDevice);
+  cudaStat1 = cudaMemcpy(d_B, B, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: hipDeviceSynchronize();
+  cudaDeviceSynchronize();
+  /* convert dl to interleaved format
+   *  dl = transpose(dl0)
+   */
+  // CHECK: cublasStat = hipblasSgeam(
+  // CHECK: HIPBLAS_OP_T,
+  // CHECK: HIPBLAS_OP_T,
+  cublasStat = cublasSgeam(
+    cublasH,
+    CUBLAS_OP_T, /* transa */
+    CUBLAS_OP_T, /* transb, don't care */
+    batchSize, /* number of rows of dl */
+    n,         /* number of columns of dl */
+    &h_one,
+    d_dl0,  /* dl0 is n-by-batchSize */
+    n, /* leading dimension of dl0 */
+    &h_zero,
+    NULL,
+    n,         /* don't care */
+    d_dl,      /* dl is batchSize-by-n */
+    batchSize  /* leading dimension of dl */
+  );
+  // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
+  assert(CUBLAS_STATUS_SUCCESS == cublasStat);
+  /* convert d to interleaved format
+       *  d = transpose(d0)
+       */
+  // CHECK: cublasStat = hipblasSgeam(
+  // CHECK: HIPBLAS_OP_T
+  // CHECK: HIPBLAS_OP_T
+  cublasStat = cublasSgeam(
+    cublasH,
+    CUBLAS_OP_T, /* transa */
+    CUBLAS_OP_T, /* transb, don't care */
+    batchSize, /* number of rows of d */
+    n,         /* number of columns of d */
+    &h_one,
+    d_d0, /* d0 is n-by-batchSize */
+    n, /* leading dimension of d0 */
+    &h_zero,
+    NULL,
+    n,         /* don't cae */
+    d_d,       /* d is batchSize-by-n */
+    batchSize  /* leading dimension of d */
+  );
+  // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
+  assert(CUBLAS_STATUS_SUCCESS == cublasStat);
+
+  /* convert du to interleaved format
+   *  du = transpose(du0)
+   */
+  // CHECK: cublasStat = hipblasSgeam(
+  // CHECK: HIPBLAS_OP_T
+  // CHECK: HIPBLAS_OP_T
+  cublasStat = cublasSgeam(
+    cublasH,
+    CUBLAS_OP_T, /* transa */
+    CUBLAS_OP_T, /* transb, don't care */
+    batchSize, /* number of rows of du */
+    n,         /* number of columns of du */
+    &h_one,
+    d_du0, /* du0 is n-by-batchSize */
+    n, /* leading dimension of du0 */
+    &h_zero,
+    NULL,
+    n,         /* don't cae */
+    d_du,      /* du is batchSize-by-n */
+    batchSize  /* leading dimension of du */
+  );
+  // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
+  assert(CUBLAS_STATUS_SUCCESS == cublasStat);
+
+  /* convert B to interleaved format
+   *  X = transpose(B)
+   */
+  // CHECK: cublasStat = hipblasSgeam(
+  // CHECK: HIPBLAS_OP_T
+  // CHECK: HIPBLAS_OP_T
+  cublasStat = cublasSgeam(
+    cublasH,
+    CUBLAS_OP_T, /* transa */
+    CUBLAS_OP_T, /* transb, don't care */
+    batchSize, /* number of rows of X */
+    n,         /* number of columns of X */
+    &h_one,
+    d_B, /* B is n-by-batchSize */
+    n, /* leading dimension of B */
+    &h_zero,
+    NULL,
+    n,         /* don't cae */
+    d_X,       /* X is batchSize-by-n */
+    batchSize  /* leading dimension of X */
+  );
+  // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
+  assert(CUBLAS_STATUS_SUCCESS == cublasStat);
+  /* step 4: prepare workspace */
+  // NOTE: CUDA 10.0
+  // TODO: status = hipsparseSgtsvInterleavedBatch_bufferSizeExt(
+  status = cusparseSgtsvInterleavedBatch_bufferSizeExt(
+    cusparseH,
+    algo,
+    n,
+    d_dl,
+    d_d,
+    d_du,
+    d_X,
+    batchSize,
+    &lworkInBytes);
+  // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
+  assert(CUSPARSE_STATUS_SUCCESS == status);
+
+  printf("lworkInBytes = %lld \n", (long long)lworkInBytes);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_work, lworkInBytes);
+  cudaStat1 = cudaMalloc((void**)&d_work, lworkInBytes);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+
+  /* step 5: solve Aj*xj = bj */
+  // NOTE: CUDA 10.0
+  // TODO: status = hipsparseSgtsvInterleavedBatch(
+  status = cusparseSgtsvInterleavedBatch(
+    cusparseH,
+    algo,
+    n,
+    d_dl,
+    d_d,
+    d_du,
+    d_X,
+    batchSize,
+    d_work);
+  // CHECK: cudaStat1 = hipDeviceSynchronize();
+  cudaStat1 = cudaDeviceSynchronize();
+  // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
+  assert(CUSPARSE_STATUS_SUCCESS == status);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+
+  /* step 6: convert X back to aggregate format  */
+      /* B = transpose(X) */
+  // CHECK: cublasStat = hipblasSgeam(
+  // CHECK: HIPBLAS_OP_T
+  // CHECK: HIPBLAS_OP_T
+  cublasStat = cublasSgeam(
+    cublasH,
+    CUBLAS_OP_T, /* transa */
+    CUBLAS_OP_T, /* transb, don't care */
+    n,         /* number of rows of B */
+    batchSize, /* number of columns of B */
+    &h_one,
+    d_X,       /* X is batchSize-by-n */
+    batchSize, /* leading dimension of X */
+    &h_zero,
+    NULL,
+    n, /* don't cae */
+    d_B, /* B is n-by-batchSize */
+    n  /* leading dimension of B */
+  );
+  // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
+  assert(CUBLAS_STATUS_SUCCESS == cublasStat);
+  // CHECK: hipDeviceSynchronize();
+  cudaDeviceSynchronize();
+
+  /* step 7: residual evaluation */
+  // CHECK: cudaStat1 = hipMemcpy(X, d_B, sizeof(float)*n*batchSize, hipMemcpyDeviceToHost);
+  cudaStat1 = cudaMemcpy(X, d_B, sizeof(float)*n*batchSize, cudaMemcpyDeviceToHost);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: hipDeviceSynchronize();
+  cudaDeviceSynchronize();
+
+  printf("==== x1 = inv(A1)*b1 \n");
+  for (int j = 0; j < n; j++) {
+    printf("x1[%d] = %f\n", j, X[j]);
+  }
+
+  float r1_nrminf;
+  residaul_eval(
+    n,
+    dl,
+    d,
+    du,
+    B,
+    X,
+    &r1_nrminf
+  );
+  printf("|b1 - A1*x1| = %E\n", r1_nrminf);
+
+  printf("\n==== x2 = inv(A2)*b2 \n");
+  for (int j = 0; j < n; j++) {
+    printf("x2[%d] = %f\n", j, X[n + j]);
+  }
+
+  float r2_nrminf;
+  residaul_eval(
+    n,
+    dl + n,
+    d + n,
+    du + n,
+    B + n,
+    X + n,
+    &r2_nrminf
+  );
+  printf("|b2 - A2*x2| = %E\n", r2_nrminf);
+
+  /* free resources */
+  // CHECK: if (d_dl0) hipFree(d_dl0);
+  if (d_dl0) cudaFree(d_dl0);
+  // CHECK: if (d_d0) hipFree(d_d0);
+  if (d_d0) cudaFree(d_d0);
+  // CHECK: if (d_du0) hipFree(d_du0);
+  if (d_du0) cudaFree(d_du0);
+  // CHECK: if (d_dl) hipFree(d_dl);
+  if (d_dl) cudaFree(d_dl);
+  // CHECK: if (d_d) hipFree(d_d);
+  if (d_d) cudaFree(d_d);
+  // CHECK: if (d_du) hipFree(d_du);
+  if (d_du) cudaFree(d_du);
+  // CHECK: if (d_B) hipFree(d_B);
+  if (d_B) cudaFree(d_B);
+  // CHECK: if (d_X) hipFree(d_X);
+  if (d_X) cudaFree(d_X);
+  // CHECK: if (cusparseH) hipsparseDestroy(cusparseH);
+  if (cusparseH) cusparseDestroy(cusparseH);
+  // CHECK: if (cublasH) hipblasDestroy(cublasH);
+  if (cublasH) cublasDestroy(cublasH);
+  // CHECK: if (stream) hipStreamDestroy(stream);
+  if (stream) cudaStreamDestroy(stream);
+  // CHECK: hipDeviceReset();
+  cudaDeviceReset();
+
+  return 0;
+}
@@ -0,0 +1,507 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+// CHECK: #include <hip/hip_runtime.h>
+#include <cuda_runtime.h>
+// CHECK: #include <hipsparse.h>
+#include <cusparse.h>
+// CHECK: #include <hipblas.h>
+#include <cublas_v2.h>
+
+// NOTE: CUDA 10.0
+
+/*
+ * compute | b - A*x|_inf
+ */
+void residaul_eval(
+  int n,
+  const float *ds,
+  const float *dl,
+  const float *d,
+  const float *du,
+  const float *dw,
+  const float *b,
+  const float *x,
+  float *r_nrminf_ptr)
+{
+  float r_nrminf = 0;
+  for (int i = 0; i < n; i++) {
+    float dot = 0;
+    if (i > 1) {
+      dot += ds[i] * x[i - 2];
+    }
+    if (i > 0) {
+      dot += dl[i] * x[i - 1];
+    }
+    dot += d[i] * x[i];
+    if (i < (n - 1)) {
+      dot += du[i] * x[i + 1];
+    }
+    if (i < (n - 2)) {
+      dot += dw[i] * x[i + 2];
+    }
+    float ri = b[i] - dot;
+    r_nrminf = (r_nrminf > fabs(ri)) ? r_nrminf : fabs(ri);
+  }
+
+  *r_nrminf_ptr = r_nrminf;
+}
+
+int main(int argc, char*argv[])
+{
+  // CHECK: hipsparseHandle_t cusparseH = NULL;
+  cusparseHandle_t cusparseH = NULL;
+  // CHECK: hipblasHandle_t cublasH = NULL;
+  cublasHandle_t cublasH = NULL;
+  // CHECK: hipStream_t stream = NULL;
+  cudaStream_t stream = NULL;
+  // CHECK: hipsparseStatus_t status = HIPSPARSE_STATUS_SUCCESS;
+  cusparseStatus_t status = CUSPARSE_STATUS_SUCCESS;
+  // CHECK: hipblasStatus_t cublasStat = HIPBLAS_STATUS_SUCCESS;
+  cublasStatus_t cublasStat = CUBLAS_STATUS_SUCCESS;
+  // CHECK: hipError_t cudaStat1 = hipSuccess;
+  cudaError_t cudaStat1 = cudaSuccess;
+
+  const int n = 4;
+  const int batchSize = 2;
+
+  /*
+   *      |  1    8   13   0  |       | 1 |       | -0.0592 |
+   *  A1 =|  5    2    9  14  |, b1 = | 2 |, x1 = |  0.3428 |
+   *      | 11    6    3  10  |       | 3 |       | -0.1295 |
+   *      |  0   12    7   4  |       | 4 |       |  0.1982 |
+   *
+   *      | 15   22   27   0  |       | 5 |       | -0.0012 |
+   *  A2 =| 19   16   23  28  |, b2 = | 6 |, x2 = |  0.2792 |
+   *      | 25   20   17  24  |       | 7 |       | -0.0416 |
+   *      |  0   26   21  18  |       | 8 |       |  0.0898 |
+   */
+
+   /*
+    * A = (ds, dl, d, du, dw), B and X are in aggregate format
+    */
+  const float ds[n * batchSize] = { 0, 0, 11, 12,  0,  0, 25, 26 };
+  const float dl[n * batchSize] = { 0, 5,  6,  7,  0, 19, 20, 21 };
+  const float  d[n * batchSize] = { 1, 2,  3,  4, 15, 16, 17, 18 };
+  const float du[n * batchSize] = { 8, 9, 10,  0, 22, 23, 24,  0 };
+  const float dw[n * batchSize] = { 13,14,  0,  0, 27, 28,  0,  0 };
+  const float  B[n * batchSize] = { 1, 2,  3,  4,  5,  6,  7,  8 };
+  float X[n * batchSize]; /* Xj = Aj \ Bj */
+
+/* device memory
+ * (d_ds0, d_dl0, d_d0, d_du0, d_dw0) is aggregate format
+ * (d_ds, d_dl, d_d, d_du, d_dw) is interleaved format
+ */
+  float *d_ds0 = NULL;
+  float *d_dl0 = NULL;
+  float *d_d0 = NULL;
+  float *d_du0 = NULL;
+  float *d_dw0 = NULL;
+  float *d_ds = NULL;
+  float *d_dl = NULL;
+  float *d_d = NULL;
+  float *d_du = NULL;
+  float *d_dw = NULL;
+  float *d_B = NULL;
+  float *d_X = NULL;
+
+  size_t lworkInBytes = 0;
+  char *d_work = NULL;
+
+  const float h_one = 1;
+  const float h_zero = 0;
+
+  int algo = 0; /* QR factorization */
+
+  printf("example of gpsv (interleaved format) \n");
+  printf("n = %d, batchSize = %d\n", n, batchSize);
+
+  /* step 1: create cusparse/cublas handle, bind a stream */
+  // CHECK: cudaStat1 = hipStreamCreateWithFlags(&stream, hipStreamNonBlocking);
+  cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: status = hipsparseCreate(&cusparseH);
+  status = cusparseCreate(&cusparseH);
+  // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
+  assert(CUSPARSE_STATUS_SUCCESS == status);
+  // CHECK: status = hipsparseSetStream(cusparseH, stream);
+  status = cusparseSetStream(cusparseH, stream);
+  // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
+  assert(CUSPARSE_STATUS_SUCCESS == status);
+  // CHECK: cublasStat = hipblasCreate(&cublasH);
+  cublasStat = cublasCreate(cublasH);
+  // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
+  assert(CUBLAS_STATUS_SUCCESS == cublasStat);
+  // CHECK: cublasStat = hipblasSetStream(cublasH, stream);
+  cublasStat = cublasSetStream(cublasH, stream);
+  // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
+  assert(CUBLAS_STATUS_SUCCESS == cublasStat);
+  /* step 2: allocate device memory */
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_ds0, sizeof(float)*n*batchSize);
+  cudaStat1 = cudaMalloc((void**)&d_ds0, sizeof(float)*n*batchSize);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_dl0, sizeof(float)*n*batchSize);
+  cudaStat1 = cudaMalloc((void**)&d_dl0, sizeof(float)*n*batchSize);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_d0, sizeof(float)*n*batchSize);
+  cudaStat1 = cudaMalloc((void**)&d_d0, sizeof(float)*n*batchSize);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_du0, sizeof(float)*n*batchSize);
+  cudaStat1 = cudaMalloc((void**)&d_du0, sizeof(float)*n*batchSize);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_dw0, sizeof(float)*n*batchSize);
+  cudaStat1 = cudaMalloc((void**)&d_dw0, sizeof(float)*n*batchSize);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_ds, sizeof(float)*n*batchSize);
+  cudaStat1 = cudaMalloc((void**)&d_ds, sizeof(float)*n*batchSize);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_dl, sizeof(float)*n*batchSize);
+  cudaStat1 = cudaMalloc((void**)&d_dl, sizeof(float)*n*batchSize);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_d, sizeof(float)*n*batchSize);
+  cudaStat1 = cudaMalloc((void**)&d_d, sizeof(float)*n*batchSize);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_du, sizeof(float)*n*batchSize);
+  cudaStat1 = cudaMalloc((void**)&d_du, sizeof(float)*n*batchSize);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_dw, sizeof(float)*n*batchSize);
+  cudaStat1 = cudaMalloc((void**)&d_dw, sizeof(float)*n*batchSize);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_B, sizeof(float)*n*batchSize);
+  cudaStat1 = cudaMalloc((void**)&d_B, sizeof(float)*n*batchSize);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_X, sizeof(float)*n*batchSize);
+  cudaStat1 = cudaMalloc((void**)&d_X, sizeof(float)*n*batchSize);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  /* step 3: prepare data in device, interleaved format */
+  // CHECK: cudaStat1 = hipMemcpy(d_ds0, ds, sizeof(float)*n*batchSize, hipMemcpyHostToDevice);
+  cudaStat1 = cudaMemcpy(d_ds0, ds, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMemcpy(d_dl0, dl, sizeof(float)*n*batchSize, hipMemcpyHostToDevice);
+  cudaStat1 = cudaMemcpy(d_dl0, dl, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMemcpy(d_d0, d, sizeof(float)*n*batchSize, hipMemcpyHostToDevice);
+  cudaStat1 = cudaMemcpy(d_d0, d, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMemcpy(d_du0, du, sizeof(float)*n*batchSize, hipMemcpyHostToDevice);
+  cudaStat1 = cudaMemcpy(d_du0, du, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMemcpy(d_dw0, dw, sizeof(float)*n*batchSize, hipMemcpyHostToDevice);
+  cudaStat1 = cudaMemcpy(d_dw0, dw, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMemcpy(d_B, B, sizeof(float)*n*batchSize, hipMemcpyHostToDevice);
+  cudaStat1 = cudaMemcpy(d_B, B, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: hipDeviceSynchronize();
+  cudaDeviceSynchronize();
+  /* convert ds to interleaved format
+   *  ds = transpose(ds0) */
+  // CHECK: cublasStat = hipblasSgeam(
+  // CHECK: HIPBLAS_OP_T,
+  // CHECK: HIPBLAS_OP_T,
+  cublasStat = cublasSgeam(
+    cublasH,
+    CUBLAS_OP_T, /* transa */
+    CUBLAS_OP_T, /* transb, don't care */
+    batchSize, /* number of rows of ds */
+    n,         /* number of columns of ds */
+    &h_one,
+    d_ds0,  /* ds0 is n-by-batchSize */
+    n, /* leading dimension of ds0 */
+    &h_zero,
+    NULL,
+    n,         /* don't cae */
+    d_ds,      /* ds is batchSize-by-n */
+    batchSize);  /* leading dimension of ds */
+  // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
+  assert(CUBLAS_STATUS_SUCCESS == cublasStat);
+  /* convert dl to interleaved format
+     *  dl = transpose(dl0)
+     */
+  // CHECK: cublasStat = hipblasSgeam(
+  // CHECK: HIPBLAS_OP_T,
+  // CHECK: HIPBLAS_OP_T,
+  cublasStat = cublasSgeam(
+    cublasH,
+    CUBLAS_OP_T, /* transa */
+    CUBLAS_OP_T, /* transb, don't care */
+    batchSize, /* number of rows of dl */
+    n,         /* number of columns of dl */
+    &h_one,
+    d_dl0,  /* dl0 is n-by-batchSize */
+    n, /* leading dimension of dl0 */
+    &h_zero,
+    NULL,
+    n,         /* don't cae */
+    d_dl,      /* dl is batchSize-by-n */
+    batchSize  /* leading dimension of dl */
+  );
+  // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
+  assert(CUBLAS_STATUS_SUCCESS == cublasStat);
+
+  /* convert d to interleaved format
+   *  d = transpose(d0)
+   */
+  // CHECK: cublasStat = hipblasSgeam(
+  // CHECK: HIPBLAS_OP_T,
+  // CHECK: HIPBLAS_OP_T,
+  cublasStat = cublasSgeam(
+    cublasH,
+    CUBLAS_OP_T, /* transa */
+    CUBLAS_OP_T, /* transb, don't care */
+    batchSize, /* number of rows of d */
+    n,         /* number of columns of d */
+    &h_one,
+    d_d0, /* d0 is n-by-batchSize */
+    n, /* leading dimension of d0 */
+    &h_zero,
+    NULL,
+    n,         /* don't cae */
+    d_d,       /* d is batchSize-by-n */
+    batchSize  /* leading dimension of d */
+  );
+  // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
+  assert(CUBLAS_STATUS_SUCCESS == cublasStat);
+
+  /* convert du to interleaved format
+    *  du = transpose(du0)
+    */
+  // CHECK: cublasStat = hipblasSgeam(
+  // CHECK: HIPBLAS_OP_T,
+  // CHECK: HIPBLAS_OP_T,
+  cublasStat = cublasSgeam(
+    cublasH,
+    CUBLAS_OP_T, /* transa */
+    CUBLAS_OP_T, /* transb, don't care */
+    batchSize, /* number of rows of du */
+    n,         /* number of columns of du */
+    &h_one,
+    d_du0, /* du0 is n-by-batchSize */
+    n, /* leading dimension of du0 */
+    &h_zero,
+    NULL,
+    n,         /* don't cae */
+    d_du,      /* du is batchSize-by-n */
+    batchSize  /* leading dimension of du */
+  );
+  // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
+  assert(CUBLAS_STATUS_SUCCESS == cublasStat);
+  /* convert dw to interleaved format
+    *  dw = transpose(dw0)
+    */
+  // CHECK: cublasStat = hipblasSgeam(
+  // CHECK: HIPBLAS_OP_T,
+  // CHECK: HIPBLAS_OP_T,
+  cublasStat = cublasSgeam(
+    cublasH,
+    CUBLAS_OP_T, /* transa */
+    CUBLAS_OP_T, /* transb, don't care */
+    batchSize, /* number of rows of dw */
+    n,         /* number of columns of dw */
+    &h_one,
+    d_dw0, /* dw0 is n-by-batchSize */
+    n, /* leading dimension of dw0 */
+    &h_zero,
+    NULL,
+    n,         /* don't cae */
+    d_dw,      /* dw is batchSize-by-n */
+    batchSize  /* leading dimension of dw */
+  );
+  // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
+  assert(CUBLAS_STATUS_SUCCESS == cublasStat);
+
+  /* convert B to interleaved format
+   *  X = transpose(B)
+   */
+  // CHECK: cublasStat = hipblasSgeam(
+  // CHECK: HIPBLAS_OP_T,
+  // CHECK: HIPBLAS_OP_T,
+  cublasStat = cublasSgeam(
+    cublasH,
+    CUBLAS_OP_T, /* transa */
+    CUBLAS_OP_T, /* transb, don't care */
+    batchSize, /* number of rows of X */
+    n,         /* number of columns of X */
+    &h_one,
+    d_B, /* B is n-by-batchSize */
+    n, /* leading dimension of B */
+    &h_zero,
+    NULL,
+    n,         /* don't cae */
+    d_X,       /* X is batchSize-by-n */
+    batchSize  /* leading dimension of X */
+  );
+  // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
+  assert(CUBLAS_STATUS_SUCCESS == cublasStat);
+
+  /* step 4: prepare workspace */
+  // NOTE: CUDA 10.0
+  // TODO: status = hipsparseSgpsvInterleavedBatch_bufferSizeExt(
+  status = cusparseSgpsvInterleavedBatch_bufferSizeExt(
+    cusparseH,
+    algo,
+    n,
+    d_ds,
+    d_dl,
+    d_d,
+    d_du,
+    d_dw,
+    d_X,
+    batchSize,
+    &lworkInBytes);
+  // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
+  assert(CUSPARSE_STATUS_SUCCESS == status);
+
+  printf("lworkInBytes = %lld \n", (long long)lworkInBytes);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_work, lworkInBytes);
+  cudaStat1 = cudaMalloc((void**)&d_work, lworkInBytes);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  /* step 5: solve Aj*xj = bj */
+  // NOTE: CUDA 10.0
+  // TODO: status = hipsparseSgpsvInterleavedBatch(
+  status = cusparseSgpsvInterleavedBatch(
+    cusparseH,
+    algo,
+    n,
+    d_ds,
+    d_dl,
+    d_d,
+    d_du,
+    d_dw,
+    d_X,
+    batchSize,
+    d_work);
+  // CHECK: cudaStat1 = hipDeviceSynchronize();
+  cudaStat1 = cudaDeviceSynchronize();
+  // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
+  assert(CUSPARSE_STATUS_SUCCESS == status);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+
+  /* step 6: convert X back to aggregate format  */
+      /* B = transpose(X) */
+  // CHECK: cublasStat = hipblasSgeam(
+  // CHECK: HIPBLAS_OP_T,
+  // CHECK: HIPBLAS_OP_T,
+  cublasStat = cublasSgeam(
+    cublasH,
+    CUBLAS_OP_T, /* transa */
+    CUBLAS_OP_T, /* transb, don't care */
+    n,         /* number of rows of B */
+    batchSize, /* number of columns of B */
+    &h_one,
+    d_X,       /* X is batchSize-by-n */
+    batchSize, /* leading dimension of X */
+    &h_zero,
+    NULL,
+    n, /* don't cae */
+    d_B, /* B is n-by-batchSize */
+    n  /* leading dimension of B */
+  );
+  // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
+  assert(CUBLAS_STATUS_SUCCESS == cublasStat);
+  // CHECK: hipDeviceSynchronize();
+  cudaDeviceSynchronize();
+
+  /* step 7: residual evaluation */
+  // CHECK: cudaStat1 = hipMemcpy(X, d_B, sizeof(float)*n*batchSize, hipMemcpyDeviceToHost);
+  cudaStat1 = cudaMemcpy(X, d_B, sizeof(float)*n*batchSize, cudaMemcpyDeviceToHost);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: hipDeviceSynchronize();
+  cudaDeviceSynchronize();
+
+  printf("==== x1 = inv(A1)*b1 \n");
+  for (int j = 0; j < n; j++) {
+    printf("x1[%d] = %f\n", j, X[j]);
+  }
+
+  float r1_nrminf;
+  residaul_eval(
+    n,
+    ds,
+    dl,
+    d,
+    du,
+    dw,
+    B,
+    X,
+    &r1_nrminf
+  );
+  printf("|b1 - A1*x1| = %E\n", r1_nrminf);
+  printf("\n==== x2 = inv(A2)*b2 \n");
+  for (int j = 0; j < n; j++) {
+    printf("x2[%d] = %f\n", j, X[n + j]);
+  }
+
+  float r2_nrminf;
+  residaul_eval(
+    n,
+    ds + n,
+    dl + n,
+    d + n,
+    du + n,
+    dw + n,
+    B + n,
+    X + n,
+    &r2_nrminf
+  );
+  printf("|b2 - A2*x2| = %E\n", r2_nrminf);
+
+  /* free resources */
+  // CHECK: if (d_ds0) hipFree(d_ds0);
+  if (d_ds0) cudaFree(d_ds0);
+  // CHECK: if (d_dl0) hipFree(d_dl0);
+  if (d_dl0) cudaFree(d_dl0);
+  // CHECK: if (d_d0) hipFree(d_d0);
+  if (d_d0) cudaFree(d_d0);
+  // CHECK: if (d_du0) hipFree(d_du0);
+  if (d_du0) cudaFree(d_du0);
+  // CHECK: if (d_dw0) hipFree(d_dw0);
+  if (d_dw0) cudaFree(d_dw0);
+  // CHECK: if (d_ds) hipFree(d_ds);
+  if (d_ds) cudaFree(d_ds);
+  // CHECK: if (d_dl) hipFree(d_dl);
+  if (d_dl) cudaFree(d_dl);
+  // CHECK: if (d_d) hipFree(d_d);
+  if (d_d) cudaFree(d_d);
+  // CHECK: if (d_du) hipFree(d_du);
+  if (d_du) cudaFree(d_du);
+  // CHECK: if (d_dw) hipFree(d_dw);
+  if (d_dw) cudaFree(d_dw);
+  // CHECK: if (d_B) hipFree(d_B);
+  if (d_B) cudaFree(d_B);
+  // CHECK: if (d_X) hipFree(d_X);
+  if (d_X) cudaFree(d_X);
+  // CHECK: if (cusparseH) hipsparseDestroy(cusparseH);
+  if (cusparseH) cusparseDestroy(cusparseH);
+  // CHECK: if (cublasH) hipblasDestroy(cublasH);
+  if (cublasH) cublasDestroy(cublasH);
+  // CHECK: if (stream) hipStreamDestroy(stream);
+  if (stream) cudaStreamDestroy(stream);
+  // CHECK: hipDeviceReset();
+  cudaDeviceReset();
+
+  return 0;
+}
@@ -0,0 +1,327 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+// CHECK: #include <hip/hip_runtime.h>
+#include <cuda_runtime.h>
+// CHECK: #include <hipsparse.h>
+#include <cusparse.h>
+
+// NOTE: CUDA 10.0
+
+/* compute | b - A*x|_inf */
+void residaul_eval(
+  int n,
+  // CHECK: const hipsparseMatDescr_t descrA,
+  const cusparseMatDescr_t descrA,
+  const float *csrVal,
+  const int *csrRowPtr,
+  const int *csrColInd,
+  const float *b,
+  const float *x,
+  float *r_nrminf_ptr)
+{
+  // CHECK: const int base = (hipsparseGetMatIndexBase(descrA) != HIPSPARSE_INDEX_BASE_ONE) ? 0 : 1;
+  const int base = (cusparseGetMatIndexBase(descrA) != CUSPARSE_INDEX_BASE_ONE) ? 0 : 1;
+  // CHECK: const int lower = (HIPSPARSE_FILL_MODE_LOWER == hipsparseGetMatFillMode(descrA)) ? 1 : 0;
+  const int lower = (CUSPARSE_FILL_MODE_LOWER == cusparseGetMatFillMode(descrA)) ? 1 : 0;
+  // CHECK: const int unit = (HIPSPARSE_DIAG_TYPE_UNIT == hipsparseGetMatDiagType(descrA)) ? 1 : 0;
+  const int unit = (CUSPARSE_DIAG_TYPE_UNIT == cusparseGetMatDiagType(descrA)) ? 1 : 0;
+
+  float r_nrminf = 0;
+  for (int row = 0; row < n; row++) {
+    const int start = csrRowPtr[row] - base;
+    const int end = csrRowPtr[row + 1] - base;
+    float dot = 0;
+    for (int colidx = start; colidx < end; colidx++) {
+      const int col = csrColInd[colidx] - base;
+      float Aij = csrVal[colidx];
+      float xj = x[col];
+      if ((row == col) && unit) {
+        Aij = 1.0;
+      }
+      int valid = (row >= col) && lower ||
+        (row <= col) && !lower;
+      if (valid) {
+        dot += Aij * xj;
+      }
+    }
+    float ri = b[row] - dot;
+    r_nrminf = (r_nrminf > fabs(ri)) ? r_nrminf : fabs(ri);
+  }
+  *r_nrminf_ptr = r_nrminf;
+}
+
+int main(int argc, char*argv[])
+{
+  // CHECK: hipsparseHandle_t handle = NULL;
+  cusparseHandle_t handle = NULL;
+  // CHECK: hipStream_t stream = NULL;
+  cudaStream_t stream = NULL;
+  // CHECK: hipsparseMatDescr_t descrA = NULL;
+  cusparseMatDescr_t descrA = NULL;
+  // NOTE: CUDA 10.0
+  // TODO: csrsm2Info_t info = NULL;
+  csrsm2Info_t info = NULL;
+  // CHECK: hipsparseStatus_t status = HIPSPARSE_STATUS_SUCCESS;
+  cusparseStatus_t status = CUSPARSE_STATUS_SUCCESS;
+  // CHECK: hipError_t cudaStat1 = hipSuccess;
+  cudaError_t cudaStat1 = cudaSuccess;
+  const int nrhs = 2;
+  const int n = 4;
+  const int nnzA = 9;
+  // CHECK: const hipsparseSolvePolicy_t policy = HIPSPARSE_SOLVE_POLICY_NO_LEVEL;
+  const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_NO_LEVEL;
+  const float h_one = 1.0;
+  /*
+   *      |    1     0     2     -3  |
+   *      |    0     4     0     0   |
+   *  A = |    5     0     6     7   |
+   *      |    0     8     0     9   |
+   *
+   *  Regard A as a lower triangle matrix L with non-unit diagonal.
+   *             | 1  5 |              | 1          5      |
+   *  Given  B = | 2  6 |, X = L \ B = | 0.5       1.5     |
+   *             | 3  7 |              | -0.3333   -3      |
+   *             | 4  8 |              |  0        -0.4444 |
+   */
+  const int csrRowPtrA[n + 1] = { 1, 4, 5, 8, 10 };
+  const int csrColIndA[nnzA] = { 1, 3, 4, 2, 1, 3, 4, 2, 4 };
+  const float csrValA[nnzA] = { 1, 2, -3, 4, 5, 6, 7, 8, 9 };
+  const float B[n*nrhs] = { 1,2,3,4,5,6,7,8 };
+  float X[n*nrhs];
+
+  int *d_csrRowPtrA = NULL;
+  int *d_csrColIndA = NULL;
+  float *d_csrValA = NULL;
+  float *d_B = NULL;
+
+  size_t lworkInBytes = 0;
+  char *d_work = NULL;
+
+  const int algo = 0; /* non-block version */
+
+  printf("example of csrsm2 \n");
+
+  /* step 1: create cusparse handle, bind a stream */
+  // CHECK: cudaStat1 = hipStreamCreateWithFlags(&stream, hipStreamNonBlocking);
+  cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: status = hipsparseCreate(&handle);
+  status = cusparseCreate(&handle);
+  // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
+  assert(CUSPARSE_STATUS_SUCCESS == status);
+
+  status = cusparseSetStream(handle, stream);
+  // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
+  assert(CUSPARSE_STATUS_SUCCESS == status);
+
+  // NOTE: CUDA 10.0
+  // TODO: status = hipsparseCreateCsrsm2Info(&info);
+  status = cusparseCreateCsrsm2Info(&info);
+  // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
+  assert(CUSPARSE_STATUS_SUCCESS == status);
+
+  /* step 2: configuration of matrix A */
+  status = cusparseCreateMatDescr(&descrA);
+  // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
+  assert(CUSPARSE_STATUS_SUCCESS == status);
+  /* A is base-1*/
+  // CHECK: hipsparseSetMatIndexBase(descrA, HIPSPARSE_INDEX_BASE_ONE);
+  cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE);
+  // CHECK: hipsparseSetMatType(descrA, HIPSPARSE_MATRIX_TYPE_GENERAL);
+  cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
+  /* A is lower triangle */
+  // CHECK: hipsparseSetMatFillMode(descrA, HIPSPARSE_FILL_MODE_LOWER);
+  cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_LOWER);
+  /* A has non unit diagonal */
+  // CHECK: hipsparseSetMatDiagType(descrA, HIPSPARSE_DIAG_TYPE_NON_UNIT);
+  cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_NON_UNIT);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_csrRowPtrA, sizeof(int)*(n + 1));
+  cudaStat1 = cudaMalloc((void**)&d_csrRowPtrA, sizeof(int)*(n + 1));
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_csrColIndA, sizeof(int)*nnzA);
+  cudaStat1 = cudaMalloc((void**)&d_csrColIndA, sizeof(int)*nnzA);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_csrValA, sizeof(float)*nnzA);
+  cudaStat1 = cudaMalloc((void**)&d_csrValA, sizeof(float)*nnzA);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_B, sizeof(float)*n*nrhs);
+  cudaStat1 = cudaMalloc((void**)&d_B, sizeof(float)*n*nrhs);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMemcpy(d_csrRowPtrA, csrRowPtrA, sizeof(int)*(n + 1), hipMemcpyHostToDevice);
+  cudaStat1 = cudaMemcpy(d_csrRowPtrA, csrRowPtrA, sizeof(int)*(n + 1), cudaMemcpyHostToDevice);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMemcpy(d_csrColIndA, csrColIndA, sizeof(int)*nnzA, hipMemcpyHostToDevice);
+  cudaStat1 = cudaMemcpy(d_csrColIndA, csrColIndA, sizeof(int)*nnzA, cudaMemcpyHostToDevice);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMemcpy(d_csrValA, csrValA, sizeof(float)*nnzA, hipMemcpyHostToDevice);
+  cudaStat1 = cudaMemcpy(d_csrValA, csrValA, sizeof(float)*nnzA, cudaMemcpyHostToDevice);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: cudaStat1 = hipMemcpy(d_B, B, sizeof(float)*n*nrhs, hipMemcpyHostToDevice);
+  cudaStat1 = cudaMemcpy(d_B, B, sizeof(float)*n*nrhs, cudaMemcpyHostToDevice);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+
+  /* step 3: query workspace */
+  // NOTE: CUDA 10.0
+  // TODO: status = hipsparseScsrsm2_bufferSizeExt(
+  // CHECK: HIPSPARSE_OPERATION_NON_TRANSPOSE,
+  // CHECK: HIPSPARSE_OPERATION_NON_TRANSPOSE,
+  status = cusparseScsrsm2_bufferSizeExt(
+    handle,
+    algo,
+    CUSPARSE_OPERATION_NON_TRANSPOSE, /* transA */
+    CUSPARSE_OPERATION_NON_TRANSPOSE, /* transB */
+    n,
+    nrhs,
+    nnzA,
+    &h_one,
+    descrA,
+    d_csrValA,
+    d_csrRowPtrA,
+    d_csrColIndA,
+    d_B,
+    n,   /* ldb */
+    info,
+    policy,
+    &lworkInBytes);
+  // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
+  assert(CUSPARSE_STATUS_SUCCESS == status);
+
+  printf("lworkInBytes  = %lld \n", (long long)lworkInBytes);
+  // CHECK: if (NULL != d_work) { hipFree(d_work); }
+  if (NULL != d_work) { cudaFree(d_work); }
+  // CHECK: cudaStat1 = hipMalloc((void**)&d_work, lworkInBytes);
+  cudaStat1 = cudaMalloc((void**)&d_work, lworkInBytes);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+
+  /* step 4: analysis */
+  // NOTE: CUDA 10.0
+  // TODO: status = hipsparseScsrsm2_analysis(
+  // CHECK: HIPSPARSE_OPERATION_NON_TRANSPOSE,
+  // CHECK: HIPSPARSE_OPERATION_NON_TRANSPOSE,
+  status = cusparseScsrsm2_analysis(
+    handle,
+    algo,
+    CUSPARSE_OPERATION_NON_TRANSPOSE, /* transA */
+    CUSPARSE_OPERATION_NON_TRANSPOSE, /* transB */
+    n,
+    nrhs,
+    nnzA,
+    &h_one,
+    descrA,
+    d_csrValA,
+    d_csrRowPtrA,
+    d_csrColIndA,
+    d_B,
+    n,   /* ldb */
+    info,
+    policy,
+    d_work);
+  // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
+  assert(CUSPARSE_STATUS_SUCCESS == status);
+  /* step 5: solve L * X = B */
+  // NOTE: CUDA 10.0
+  // TODO: status = hipsparseScsrsm2_solve(
+  // CHECK: HIPSPARSE_OPERATION_NON_TRANSPOSE,
+  // CHECK: HIPSPARSE_OPERATION_NON_TRANSPOSE,
+  status = cusparseScsrsm2_solve(
+    handle,
+    algo,
+    CUSPARSE_OPERATION_NON_TRANSPOSE, /* transA */
+    CUSPARSE_OPERATION_NON_TRANSPOSE, /* transB */
+    n,
+    nrhs,
+    nnzA,
+    &h_one,
+    descrA,
+    d_csrValA,
+    d_csrRowPtrA,
+    d_csrColIndA,
+    d_B,
+    n,   /* ldb */
+    info,
+    policy,
+    d_work);
+  // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
+  assert(CUSPARSE_STATUS_SUCCESS == status);
+  // CHECK: cudaStat1 = hipDeviceSynchronize();
+  cudaStat1 = cudaDeviceSynchronize();
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+
+  /* step 6:measure residual B - A*X */
+  // CHECK: cudaStat1 = hipMemcpy(X, d_B, sizeof(float)*n*nrhs, hipMemcpyDeviceToHost);
+  cudaStat1 = cudaMemcpy(X, d_B, sizeof(float)*n*nrhs, cudaMemcpyDeviceToHost);
+  // CHECK: assert(hipSuccess == cudaStat1);
+  assert(cudaSuccess == cudaStat1);
+  // CHECK: hipDeviceSynchronize();
+  cudaDeviceSynchronize();
+
+  printf("==== x1 = inv(A)*b1 \n");
+  for (int j = 0; j < n; j++) {
+    printf("x1[%d] = %f\n", j, X[j]);
+  }
+  float r1_nrminf;
+  residaul_eval(
+    n,
+    descrA,
+    csrValA,
+    csrRowPtrA,
+    csrColIndA,
+    B,
+    X,
+    &r1_nrminf
+  );
+  printf("|b1 - A*x1| = %E\n", r1_nrminf);
+
+  printf("==== x2 = inv(A)*b2 \n");
+  for (int j = 0; j < n; j++) {
+    printf("x2[%d] = %f\n", j, X[n + j]);
+  }
+  float r2_nrminf;
+  residaul_eval(
+    n,
+    descrA,
+    csrValA,
+    csrRowPtrA,
+    csrColIndA,
+    B + n,
+    X + n,
+    &r2_nrminf
+  );
+  printf("|b2 - A*x2| = %E\n", r2_nrminf);
+
+  /* free resources */
+  // CHECK: if (d_csrRowPtrA) hipFree(d_csrRowPtrA);
+  if (d_csrRowPtrA) cudaFree(d_csrRowPtrA);
+  // CHECK: if (d_csrColIndA) hipFree(d_csrColIndA);
+  if (d_csrColIndA) cudaFree(d_csrColIndA);
+  // CHECK: if (d_csrValA) hipFree(d_csrValA);
+  if (d_csrValA) cudaFree(d_csrValA);
+  // CHECK: if (d_B) hipFree(d_B);
+  if (d_B) cudaFree(d_B);
+  // CHECK: if (handle) hipsparseDestroy(handle);
+  if (handle) cusparseDestroy(handle);
+  // CHECK: if (stream) hipStreamDestroy(stream);
+  if (stream) cudaStreamDestroy(stream);
+  // CHECK: if (descrA) hipsparseDestroyMatDescr(descrA);
+  if (descrA) cusparseDestroyMatDescr(descrA);
+  // NOTE: CUDA 10.0
+  // TODO: if (info) hipsparseDestroyCsrsm2Info(info);
+  if (info) cusparseDestroyCsrsm2Info(info);
+  // CHECK: hipDeviceReset();
+  cudaDeviceReset();
+
+  return 0;
+}
@@ -26,6 +26,9 @@ config.excludes = ['cmdparser.hpp']
 config.cuda_version = "@CUDA_VERSION@"
 if config.cuda_version not in ['10.0']:
    config.excludes.append('cuSPARSE_08.cu')
+    config.excludes.append('cuSPARSE_09.cu')
+    config.excludes.append('cuSPARSE_10.cu')
+    config.excludes.append('cuSPARSE_11.cu')

 # test_exec_root: The path where tests are located (default is the test suite root).
 #config.test_exec_root = config.test_source_root