[HIPIFY][SPARSE] Add 3 more CUDA 10.0 tests

+ lit update
+ fix typos
This commit is contained in:
Evgeny Mankov
2018-12-04 19:24:29 +03:00
parent f6ec2236cd
commit a421aff96d
7 changed files with 1256 additions and 5 deletions
@@ -576,7 +576,7 @@
|`cusparseXcoosortByColumn` |`hipsparseXcoosortByColumn` |
|`cusparseXcsrsort_bufferSizeExt` |`hipsparseXcsrsort_bufferSizeExt` |
|`cusparseXcsrsort` |`hipsparseXcsrsort` |
|`cusparseScusparseXcscsort_bufferSizeExtnnz` | |
|`cusparseXcscsort_bufferSizeExt` | |
|`cusparseXcscsort` | |
|`cusparseCreateCsru2csrInfo` | |
|`cusparseDestroyCsru2csrInfo` | |
@@ -554,7 +554,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_SPARSE_FUNCTION_MAP{
{"cusparseXcsrsort_bufferSizeExt", {"hipsparseXcsrsort_bufferSizeExt", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseXcsrsort", {"hipsparseXcsrsort", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseScusparseXcscsort_bufferSizeExtnnz", {"hipsparseXcscsort_bufferSizeExt", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseXcscsort_bufferSizeExt", {"hipsparseXcscsort_bufferSizeExt", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseXcscsort", {"hipsparseXcscsort", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseCreateCsru2csrInfo", {"hipsparseCreateCsru2csrInfo", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
@@ -277,8 +277,8 @@ int main(int argc, char*argv[])
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
/* step 4: prepare workspace */
// NOTE: CUDA 10.0
// CHECK: status = hipsparseSgtsvInterleavedBatch_bufferSizeExt(
status = cusparseSgtsvInterleavedBatch_bufferSizeExt(
// TODO: status = hipsparseSgtsvInterleavedBatch_bufferSizeExt(
status = cusparseSgtsvInterleavedBatch_bufferSizeExt(
cusparseH,
algo,
n,
@@ -299,7 +299,7 @@ int main(int argc, char*argv[])
/* step 5: solve Aj*xj = bj */
// NOTE: CUDA 10.0
// CHECK: status = hipsparseSgtsvInterleavedBatch(
// TODO: status = hipsparseSgtsvInterleavedBatch(
status = cusparseSgtsvInterleavedBatch(
cusparseH,
algo,
@@ -0,0 +1,414 @@
// RUN: %run_test hipify "%s" "%t" %cuda_args
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
// CHECK: #include <hip/hip_runtime.h>
#include <cuda_runtime.h>
// CHECK: #include <hipsparse.h>
#include <cusparse.h>
// CHECK: #include <hipblas.h>
#include <cublas_v2.h>
// NOTE: CUDA 10.0
/*
* compute | b - A*x|_inf
*/
void residaul_eval(
int n,
const float *dl,
const float *d,
const float *du,
const float *b,
const float *x,
float *r_nrminf_ptr)
{
float r_nrminf = 0;
for (int i = 0; i < n; i++) {
float dot = 0;
if (i > 0) {
dot += dl[i] * x[i - 1];
}
dot += d[i] * x[i];
if (i < (n - 1)) {
dot += du[i] * x[i + 1];
}
float ri = b[i] - dot;
r_nrminf = (r_nrminf > fabs(ri)) ? r_nrminf : fabs(ri);
}
*r_nrminf_ptr = r_nrminf;
}
int main(int argc, char*argv[])
{
// CHECK: hipsparseHandle_t cusparseH = NULL;
cusparseHandle_t cusparseH = NULL;
// CHECK: hipblasHandle_t cublasH = NULL;
cublasHandle_t cublasH = NULL;
// CHECK: hipStream_t stream = NULL;
cudaStream_t stream = NULL;
// CHECK: hipsparseStatus_t status = HIPSPARSE_STATUS_SUCCESS;
cusparseStatus_t status = CUSPARSE_STATUS_SUCCESS;
// CHECK: hipblasStatus_t cublasStat = HIPBLAS_STATUS_SUCCESS;
cublasStatus_t cublasStat = CUBLAS_STATUS_SUCCESS;
// CHECK: hipError_t cudaStat1 = hipSuccess;
cudaError_t cudaStat1 = cudaSuccess;
const int n = 3;
const int batchSize = 2;
/*
* | 1 6 0 | | 1 | | -0.603960 |
* A1 =| 4 2 7 |, b1 = | 2 |, x1 = | 0.267327 |
* | 0 5 3 | | 3 | | 0.554455 |
*
* | 8 13 0 | | 4 | | -0.063291 |
* A2 =| 11 9 14 |, b2 = | 5 |, x2 = | 0.346641 |
* | 0 12 10 | | 6 | | 0.184031 |
*/
/*
* A = (dl, d, du), B and X are in aggregate format
*/
const float dl[n * batchSize] = { 0, 4, 5, 0, 11, 12 };
const float d[n * batchSize] = { 1, 2, 3, 8, 9, 10 };
const float du[n * batchSize] = { 6, 7, 0, 13, 14, 0 };
const float B[n * batchSize] = { 1, 2, 3, 4, 5, 6 };
float X[n * batchSize]; /* Xj = Aj \ Bj */
/* device memory
* (d_dl0, d_d0, d_du0) is aggregate format
* (d_dl, d_d, d_du) is interleaved format
*/
float *d_dl0 = NULL;
float *d_d0 = NULL;
float *d_du0 = NULL;
float *d_dl = NULL;
float *d_d = NULL;
float *d_du = NULL;
float *d_B = NULL;
float *d_X = NULL;
size_t lworkInBytes = 0;
char *d_work = NULL;
/*
* algo = 0: cuThomas (unstable)
* algo = 1: LU with pivoting (stable)
* algo = 2: QR (stable)
*/
const int algo = 2;
const float h_one = 1;
const float h_zero = 0;
printf("example of gtsv (interleaved format) \n");
printf("choose algo = 0,1,2 to select different algorithms \n");
printf("n = %d, batchSize = %d, algo = %d \n", n, batchSize, algo);
/* step 1: create cusparse/cublas handle, bind a stream */
// CHECK: cudaStat1 = hipStreamCreateWithFlags(&stream, hipStreamNonBlocking);
cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: status = hipsparseCreate(&cusparseH);
status = cusparseCreate(&cusparseH);
// CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
assert(CUSPARSE_STATUS_SUCCESS == status);
// CHECK: status = hipsparseSetStream(cusparseH, stream);
status = cusparseSetStream(cusparseH, stream);
// CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
assert(CUSPARSE_STATUS_SUCCESS == status);
// CHECK: cublasStat = hipblasCreate(&cublasH);
cublasStat = cublasCreate(&cublasH);
// CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
// CHECK: cublasStat = hipblasSetStream(cublasH, stream);
cublasStat = cublasSetStream(cublasH, stream);
// CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
/* step 2: allocate device memory */
// CHECK: cudaStat1 = hipMalloc((void**)&d_dl0, sizeof(float)*n*batchSize);
cudaStat1 = cudaMalloc((void**)&d_dl0, sizeof(float)*n*batchSize);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_d0, sizeof(float)*n*batchSize);
cudaStat1 = cudaMalloc((void**)&d_d0, sizeof(float)*n*batchSize);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_du0, sizeof(float)*n*batchSize);
cudaStat1 = cudaMalloc((void**)&d_du0, sizeof(float)*n*batchSize);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_dl, sizeof(float)*n*batchSize);
cudaStat1 = cudaMalloc((void**)&d_dl, sizeof(float)*n*batchSize);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_d, sizeof(float)*n*batchSize);
cudaStat1 = cudaMalloc((void**)&d_d, sizeof(float)*n*batchSize);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_du, sizeof(float)*n*batchSize);
cudaStat1 = cudaMalloc((void**)&d_du, sizeof(float)*n*batchSize);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_B, sizeof(float)*n*batchSize);
cudaStat1 = cudaMalloc((void**)&d_B, sizeof(float)*n*batchSize);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_X, sizeof(float)*n*batchSize);
cudaStat1 = cudaMalloc((void**)&d_X, sizeof(float)*n*batchSize);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
/* step 3: prepare data in device, interleaved format */
// CHECK: cudaStat1 = hipMemcpy(d_dl0, dl, sizeof(float)*n*batchSize, hipMemcpyHostToDevice);
cudaStat1 = cudaMemcpy(d_dl0, dl, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMemcpy(d_d0, d, sizeof(float)*n*batchSize, hipMemcpyHostToDevice);
cudaStat1 = cudaMemcpy(d_d0, d, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMemcpy(d_du0, du, sizeof(float)*n*batchSize, hipMemcpyHostToDevice);
cudaStat1 = cudaMemcpy(d_du0, du, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMemcpy(d_B, B, sizeof(float)*n*batchSize, hipMemcpyHostToDevice);
cudaStat1 = cudaMemcpy(d_B, B, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: hipDeviceSynchronize();
cudaDeviceSynchronize();
/* convert dl to interleaved format
* dl = transpose(dl0)
*/
// CHECK: cublasStat = hipblasSgeam(
// CHECK: HIPBLAS_OP_T,
// CHECK: HIPBLAS_OP_T,
cublasStat = cublasSgeam(
cublasH,
CUBLAS_OP_T, /* transa */
CUBLAS_OP_T, /* transb, don't care */
batchSize, /* number of rows of dl */
n, /* number of columns of dl */
&h_one,
d_dl0, /* dl0 is n-by-batchSize */
n, /* leading dimension of dl0 */
&h_zero,
NULL,
n, /* don't care */
d_dl, /* dl is batchSize-by-n */
batchSize /* leading dimension of dl */
);
// CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
/* convert d to interleaved format
* d = transpose(d0)
*/
// CHECK: cublasStat = hipblasSgeam(
// CHECK: HIPBLAS_OP_T
// CHECK: HIPBLAS_OP_T
cublasStat = cublasSgeam(
cublasH,
CUBLAS_OP_T, /* transa */
CUBLAS_OP_T, /* transb, don't care */
batchSize, /* number of rows of d */
n, /* number of columns of d */
&h_one,
d_d0, /* d0 is n-by-batchSize */
n, /* leading dimension of d0 */
&h_zero,
NULL,
n, /* don't cae */
d_d, /* d is batchSize-by-n */
batchSize /* leading dimension of d */
);
// CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
/* convert du to interleaved format
* du = transpose(du0)
*/
// CHECK: cublasStat = hipblasSgeam(
// CHECK: HIPBLAS_OP_T
// CHECK: HIPBLAS_OP_T
cublasStat = cublasSgeam(
cublasH,
CUBLAS_OP_T, /* transa */
CUBLAS_OP_T, /* transb, don't care */
batchSize, /* number of rows of du */
n, /* number of columns of du */
&h_one,
d_du0, /* du0 is n-by-batchSize */
n, /* leading dimension of du0 */
&h_zero,
NULL,
n, /* don't cae */
d_du, /* du is batchSize-by-n */
batchSize /* leading dimension of du */
);
// CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
/* convert B to interleaved format
* X = transpose(B)
*/
// CHECK: cublasStat = hipblasSgeam(
// CHECK: HIPBLAS_OP_T
// CHECK: HIPBLAS_OP_T
cublasStat = cublasSgeam(
cublasH,
CUBLAS_OP_T, /* transa */
CUBLAS_OP_T, /* transb, don't care */
batchSize, /* number of rows of X */
n, /* number of columns of X */
&h_one,
d_B, /* B is n-by-batchSize */
n, /* leading dimension of B */
&h_zero,
NULL,
n, /* don't cae */
d_X, /* X is batchSize-by-n */
batchSize /* leading dimension of X */
);
// CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
/* step 4: prepare workspace */
// NOTE: CUDA 10.0
// TODO: status = hipsparseSgtsvInterleavedBatch_bufferSizeExt(
status = cusparseSgtsvInterleavedBatch_bufferSizeExt(
cusparseH,
algo,
n,
d_dl,
d_d,
d_du,
d_X,
batchSize,
&lworkInBytes);
// CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
assert(CUSPARSE_STATUS_SUCCESS == status);
printf("lworkInBytes = %lld \n", (long long)lworkInBytes);
// CHECK: cudaStat1 = hipMalloc((void**)&d_work, lworkInBytes);
cudaStat1 = cudaMalloc((void**)&d_work, lworkInBytes);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
/* step 5: solve Aj*xj = bj */
// NOTE: CUDA 10.0
// TODO: status = hipsparseSgtsvInterleavedBatch(
status = cusparseSgtsvInterleavedBatch(
cusparseH,
algo,
n,
d_dl,
d_d,
d_du,
d_X,
batchSize,
d_work);
// CHECK: cudaStat1 = hipDeviceSynchronize();
cudaStat1 = cudaDeviceSynchronize();
// CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
assert(CUSPARSE_STATUS_SUCCESS == status);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
/* step 6: convert X back to aggregate format */
/* B = transpose(X) */
// CHECK: cublasStat = hipblasSgeam(
// CHECK: HIPBLAS_OP_T
// CHECK: HIPBLAS_OP_T
cublasStat = cublasSgeam(
cublasH,
CUBLAS_OP_T, /* transa */
CUBLAS_OP_T, /* transb, don't care */
n, /* number of rows of B */
batchSize, /* number of columns of B */
&h_one,
d_X, /* X is batchSize-by-n */
batchSize, /* leading dimension of X */
&h_zero,
NULL,
n, /* don't cae */
d_B, /* B is n-by-batchSize */
n /* leading dimension of B */
);
// CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
// CHECK: hipDeviceSynchronize();
cudaDeviceSynchronize();
/* step 7: residual evaluation */
// CHECK: cudaStat1 = hipMemcpy(X, d_B, sizeof(float)*n*batchSize, hipMemcpyDeviceToHost);
cudaStat1 = cudaMemcpy(X, d_B, sizeof(float)*n*batchSize, cudaMemcpyDeviceToHost);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: hipDeviceSynchronize();
cudaDeviceSynchronize();
printf("==== x1 = inv(A1)*b1 \n");
for (int j = 0; j < n; j++) {
printf("x1[%d] = %f\n", j, X[j]);
}
float r1_nrminf;
residaul_eval(
n,
dl,
d,
du,
B,
X,
&r1_nrminf
);
printf("|b1 - A1*x1| = %E\n", r1_nrminf);
printf("\n==== x2 = inv(A2)*b2 \n");
for (int j = 0; j < n; j++) {
printf("x2[%d] = %f\n", j, X[n + j]);
}
float r2_nrminf;
residaul_eval(
n,
dl + n,
d + n,
du + n,
B + n,
X + n,
&r2_nrminf
);
printf("|b2 - A2*x2| = %E\n", r2_nrminf);
/* free resources */
// CHECK: if (d_dl0) hipFree(d_dl0);
if (d_dl0) cudaFree(d_dl0);
// CHECK: if (d_d0) hipFree(d_d0);
if (d_d0) cudaFree(d_d0);
// CHECK: if (d_du0) hipFree(d_du0);
if (d_du0) cudaFree(d_du0);
// CHECK: if (d_dl) hipFree(d_dl);
if (d_dl) cudaFree(d_dl);
// CHECK: if (d_d) hipFree(d_d);
if (d_d) cudaFree(d_d);
// CHECK: if (d_du) hipFree(d_du);
if (d_du) cudaFree(d_du);
// CHECK: if (d_B) hipFree(d_B);
if (d_B) cudaFree(d_B);
// CHECK: if (d_X) hipFree(d_X);
if (d_X) cudaFree(d_X);
// CHECK: if (cusparseH) hipsparseDestroy(cusparseH);
if (cusparseH) cusparseDestroy(cusparseH);
// CHECK: if (cublasH) hipblasDestroy(cublasH);
if (cublasH) cublasDestroy(cublasH);
// CHECK: if (stream) hipStreamDestroy(stream);
if (stream) cudaStreamDestroy(stream);
// CHECK: hipDeviceReset();
cudaDeviceReset();
return 0;
}
@@ -0,0 +1,507 @@
// RUN: %run_test hipify "%s" "%t" %cuda_args
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
// CHECK: #include <hip/hip_runtime.h>
#include <cuda_runtime.h>
// CHECK: #include <hipsparse.h>
#include <cusparse.h>
// CHECK: #include <hipblas.h>
#include <cublas_v2.h>
// NOTE: CUDA 10.0
/*
* compute | b - A*x|_inf
*/
void residaul_eval(
int n,
const float *ds,
const float *dl,
const float *d,
const float *du,
const float *dw,
const float *b,
const float *x,
float *r_nrminf_ptr)
{
float r_nrminf = 0;
for (int i = 0; i < n; i++) {
float dot = 0;
if (i > 1) {
dot += ds[i] * x[i - 2];
}
if (i > 0) {
dot += dl[i] * x[i - 1];
}
dot += d[i] * x[i];
if (i < (n - 1)) {
dot += du[i] * x[i + 1];
}
if (i < (n - 2)) {
dot += dw[i] * x[i + 2];
}
float ri = b[i] - dot;
r_nrminf = (r_nrminf > fabs(ri)) ? r_nrminf : fabs(ri);
}
*r_nrminf_ptr = r_nrminf;
}
int main(int argc, char*argv[])
{
// CHECK: hipsparseHandle_t cusparseH = NULL;
cusparseHandle_t cusparseH = NULL;
// CHECK: hipblasHandle_t cublasH = NULL;
cublasHandle_t cublasH = NULL;
// CHECK: hipStream_t stream = NULL;
cudaStream_t stream = NULL;
// CHECK: hipsparseStatus_t status = HIPSPARSE_STATUS_SUCCESS;
cusparseStatus_t status = CUSPARSE_STATUS_SUCCESS;
// CHECK: hipblasStatus_t cublasStat = HIPBLAS_STATUS_SUCCESS;
cublasStatus_t cublasStat = CUBLAS_STATUS_SUCCESS;
// CHECK: hipError_t cudaStat1 = hipSuccess;
cudaError_t cudaStat1 = cudaSuccess;
const int n = 4;
const int batchSize = 2;
/*
* | 1 8 13 0 | | 1 | | -0.0592 |
* A1 =| 5 2 9 14 |, b1 = | 2 |, x1 = | 0.3428 |
* | 11 6 3 10 | | 3 | | -0.1295 |
* | 0 12 7 4 | | 4 | | 0.1982 |
*
* | 15 22 27 0 | | 5 | | -0.0012 |
* A2 =| 19 16 23 28 |, b2 = | 6 |, x2 = | 0.2792 |
* | 25 20 17 24 | | 7 | | -0.0416 |
* | 0 26 21 18 | | 8 | | 0.0898 |
*/
/*
* A = (ds, dl, d, du, dw), B and X are in aggregate format
*/
const float ds[n * batchSize] = { 0, 0, 11, 12, 0, 0, 25, 26 };
const float dl[n * batchSize] = { 0, 5, 6, 7, 0, 19, 20, 21 };
const float d[n * batchSize] = { 1, 2, 3, 4, 15, 16, 17, 18 };
const float du[n * batchSize] = { 8, 9, 10, 0, 22, 23, 24, 0 };
const float dw[n * batchSize] = { 13,14, 0, 0, 27, 28, 0, 0 };
const float B[n * batchSize] = { 1, 2, 3, 4, 5, 6, 7, 8 };
float X[n * batchSize]; /* Xj = Aj \ Bj */
/* device memory
* (d_ds0, d_dl0, d_d0, d_du0, d_dw0) is aggregate format
* (d_ds, d_dl, d_d, d_du, d_dw) is interleaved format
*/
float *d_ds0 = NULL;
float *d_dl0 = NULL;
float *d_d0 = NULL;
float *d_du0 = NULL;
float *d_dw0 = NULL;
float *d_ds = NULL;
float *d_dl = NULL;
float *d_d = NULL;
float *d_du = NULL;
float *d_dw = NULL;
float *d_B = NULL;
float *d_X = NULL;
size_t lworkInBytes = 0;
char *d_work = NULL;
const float h_one = 1;
const float h_zero = 0;
int algo = 0; /* QR factorization */
printf("example of gpsv (interleaved format) \n");
printf("n = %d, batchSize = %d\n", n, batchSize);
/* step 1: create cusparse/cublas handle, bind a stream */
// CHECK: cudaStat1 = hipStreamCreateWithFlags(&stream, hipStreamNonBlocking);
cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: status = hipsparseCreate(&cusparseH);
status = cusparseCreate(&cusparseH);
// CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
assert(CUSPARSE_STATUS_SUCCESS == status);
// CHECK: status = hipsparseSetStream(cusparseH, stream);
status = cusparseSetStream(cusparseH, stream);
// CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
assert(CUSPARSE_STATUS_SUCCESS == status);
// CHECK: cublasStat = hipblasCreate(&cublasH);
cublasStat = cublasCreate(cublasH);
// CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
// CHECK: cublasStat = hipblasSetStream(cublasH, stream);
cublasStat = cublasSetStream(cublasH, stream);
// CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
/* step 2: allocate device memory */
// CHECK: cudaStat1 = hipMalloc((void**)&d_ds0, sizeof(float)*n*batchSize);
cudaStat1 = cudaMalloc((void**)&d_ds0, sizeof(float)*n*batchSize);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_dl0, sizeof(float)*n*batchSize);
cudaStat1 = cudaMalloc((void**)&d_dl0, sizeof(float)*n*batchSize);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_d0, sizeof(float)*n*batchSize);
cudaStat1 = cudaMalloc((void**)&d_d0, sizeof(float)*n*batchSize);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_du0, sizeof(float)*n*batchSize);
cudaStat1 = cudaMalloc((void**)&d_du0, sizeof(float)*n*batchSize);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_dw0, sizeof(float)*n*batchSize);
cudaStat1 = cudaMalloc((void**)&d_dw0, sizeof(float)*n*batchSize);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_ds, sizeof(float)*n*batchSize);
cudaStat1 = cudaMalloc((void**)&d_ds, sizeof(float)*n*batchSize);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_dl, sizeof(float)*n*batchSize);
cudaStat1 = cudaMalloc((void**)&d_dl, sizeof(float)*n*batchSize);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_d, sizeof(float)*n*batchSize);
cudaStat1 = cudaMalloc((void**)&d_d, sizeof(float)*n*batchSize);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_du, sizeof(float)*n*batchSize);
cudaStat1 = cudaMalloc((void**)&d_du, sizeof(float)*n*batchSize);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_dw, sizeof(float)*n*batchSize);
cudaStat1 = cudaMalloc((void**)&d_dw, sizeof(float)*n*batchSize);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_B, sizeof(float)*n*batchSize);
cudaStat1 = cudaMalloc((void**)&d_B, sizeof(float)*n*batchSize);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_X, sizeof(float)*n*batchSize);
cudaStat1 = cudaMalloc((void**)&d_X, sizeof(float)*n*batchSize);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
/* step 3: prepare data in device, interleaved format */
// CHECK: cudaStat1 = hipMemcpy(d_ds0, ds, sizeof(float)*n*batchSize, hipMemcpyHostToDevice);
cudaStat1 = cudaMemcpy(d_ds0, ds, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMemcpy(d_dl0, dl, sizeof(float)*n*batchSize, hipMemcpyHostToDevice);
cudaStat1 = cudaMemcpy(d_dl0, dl, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMemcpy(d_d0, d, sizeof(float)*n*batchSize, hipMemcpyHostToDevice);
cudaStat1 = cudaMemcpy(d_d0, d, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMemcpy(d_du0, du, sizeof(float)*n*batchSize, hipMemcpyHostToDevice);
cudaStat1 = cudaMemcpy(d_du0, du, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMemcpy(d_dw0, dw, sizeof(float)*n*batchSize, hipMemcpyHostToDevice);
cudaStat1 = cudaMemcpy(d_dw0, dw, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMemcpy(d_B, B, sizeof(float)*n*batchSize, hipMemcpyHostToDevice);
cudaStat1 = cudaMemcpy(d_B, B, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: hipDeviceSynchronize();
cudaDeviceSynchronize();
/* convert ds to interleaved format
* ds = transpose(ds0) */
// CHECK: cublasStat = hipblasSgeam(
// CHECK: HIPBLAS_OP_T,
// CHECK: HIPBLAS_OP_T,
cublasStat = cublasSgeam(
cublasH,
CUBLAS_OP_T, /* transa */
CUBLAS_OP_T, /* transb, don't care */
batchSize, /* number of rows of ds */
n, /* number of columns of ds */
&h_one,
d_ds0, /* ds0 is n-by-batchSize */
n, /* leading dimension of ds0 */
&h_zero,
NULL,
n, /* don't cae */
d_ds, /* ds is batchSize-by-n */
batchSize); /* leading dimension of ds */
// CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
/* convert dl to interleaved format
* dl = transpose(dl0)
*/
// CHECK: cublasStat = hipblasSgeam(
// CHECK: HIPBLAS_OP_T,
// CHECK: HIPBLAS_OP_T,
cublasStat = cublasSgeam(
cublasH,
CUBLAS_OP_T, /* transa */
CUBLAS_OP_T, /* transb, don't care */
batchSize, /* number of rows of dl */
n, /* number of columns of dl */
&h_one,
d_dl0, /* dl0 is n-by-batchSize */
n, /* leading dimension of dl0 */
&h_zero,
NULL,
n, /* don't cae */
d_dl, /* dl is batchSize-by-n */
batchSize /* leading dimension of dl */
);
// CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
/* convert d to interleaved format
* d = transpose(d0)
*/
// CHECK: cublasStat = hipblasSgeam(
// CHECK: HIPBLAS_OP_T,
// CHECK: HIPBLAS_OP_T,
cublasStat = cublasSgeam(
cublasH,
CUBLAS_OP_T, /* transa */
CUBLAS_OP_T, /* transb, don't care */
batchSize, /* number of rows of d */
n, /* number of columns of d */
&h_one,
d_d0, /* d0 is n-by-batchSize */
n, /* leading dimension of d0 */
&h_zero,
NULL,
n, /* don't cae */
d_d, /* d is batchSize-by-n */
batchSize /* leading dimension of d */
);
// CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
/* convert du to interleaved format
* du = transpose(du0)
*/
// CHECK: cublasStat = hipblasSgeam(
// CHECK: HIPBLAS_OP_T,
// CHECK: HIPBLAS_OP_T,
cublasStat = cublasSgeam(
cublasH,
CUBLAS_OP_T, /* transa */
CUBLAS_OP_T, /* transb, don't care */
batchSize, /* number of rows of du */
n, /* number of columns of du */
&h_one,
d_du0, /* du0 is n-by-batchSize */
n, /* leading dimension of du0 */
&h_zero,
NULL,
n, /* don't cae */
d_du, /* du is batchSize-by-n */
batchSize /* leading dimension of du */
);
// CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
/* convert dw to interleaved format
* dw = transpose(dw0)
*/
// CHECK: cublasStat = hipblasSgeam(
// CHECK: HIPBLAS_OP_T,
// CHECK: HIPBLAS_OP_T,
cublasStat = cublasSgeam(
cublasH,
CUBLAS_OP_T, /* transa */
CUBLAS_OP_T, /* transb, don't care */
batchSize, /* number of rows of dw */
n, /* number of columns of dw */
&h_one,
d_dw0, /* dw0 is n-by-batchSize */
n, /* leading dimension of dw0 */
&h_zero,
NULL,
n, /* don't cae */
d_dw, /* dw is batchSize-by-n */
batchSize /* leading dimension of dw */
);
// CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
/* convert B to interleaved format
* X = transpose(B)
*/
// CHECK: cublasStat = hipblasSgeam(
// CHECK: HIPBLAS_OP_T,
// CHECK: HIPBLAS_OP_T,
cublasStat = cublasSgeam(
cublasH,
CUBLAS_OP_T, /* transa */
CUBLAS_OP_T, /* transb, don't care */
batchSize, /* number of rows of X */
n, /* number of columns of X */
&h_one,
d_B, /* B is n-by-batchSize */
n, /* leading dimension of B */
&h_zero,
NULL,
n, /* don't cae */
d_X, /* X is batchSize-by-n */
batchSize /* leading dimension of X */
);
// CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
/* step 4: prepare workspace */
// NOTE: CUDA 10.0
// TODO: status = hipsparseSgpsvInterleavedBatch_bufferSizeExt(
status = cusparseSgpsvInterleavedBatch_bufferSizeExt(
cusparseH,
algo,
n,
d_ds,
d_dl,
d_d,
d_du,
d_dw,
d_X,
batchSize,
&lworkInBytes);
// CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
assert(CUSPARSE_STATUS_SUCCESS == status);
printf("lworkInBytes = %lld \n", (long long)lworkInBytes);
// CHECK: cudaStat1 = hipMalloc((void**)&d_work, lworkInBytes);
cudaStat1 = cudaMalloc((void**)&d_work, lworkInBytes);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
/* step 5: solve Aj*xj = bj */
// NOTE: CUDA 10.0
// TODO: status = hipsparseSgpsvInterleavedBatch(
status = cusparseSgpsvInterleavedBatch(
cusparseH,
algo,
n,
d_ds,
d_dl,
d_d,
d_du,
d_dw,
d_X,
batchSize,
d_work);
// CHECK: cudaStat1 = hipDeviceSynchronize();
cudaStat1 = cudaDeviceSynchronize();
// CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
assert(CUSPARSE_STATUS_SUCCESS == status);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
/* step 6: convert X back to aggregate format */
/* B = transpose(X) */
// CHECK: cublasStat = hipblasSgeam(
// CHECK: HIPBLAS_OP_T,
// CHECK: HIPBLAS_OP_T,
cublasStat = cublasSgeam(
cublasH,
CUBLAS_OP_T, /* transa */
CUBLAS_OP_T, /* transb, don't care */
n, /* number of rows of B */
batchSize, /* number of columns of B */
&h_one,
d_X, /* X is batchSize-by-n */
batchSize, /* leading dimension of X */
&h_zero,
NULL,
n, /* don't cae */
d_B, /* B is n-by-batchSize */
n /* leading dimension of B */
);
// CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
// CHECK: hipDeviceSynchronize();
cudaDeviceSynchronize();
/* step 7: residual evaluation */
// CHECK: cudaStat1 = hipMemcpy(X, d_B, sizeof(float)*n*batchSize, hipMemcpyDeviceToHost);
cudaStat1 = cudaMemcpy(X, d_B, sizeof(float)*n*batchSize, cudaMemcpyDeviceToHost);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: hipDeviceSynchronize();
cudaDeviceSynchronize();
printf("==== x1 = inv(A1)*b1 \n");
for (int j = 0; j < n; j++) {
printf("x1[%d] = %f\n", j, X[j]);
}
float r1_nrminf;
residaul_eval(
n,
ds,
dl,
d,
du,
dw,
B,
X,
&r1_nrminf
);
printf("|b1 - A1*x1| = %E\n", r1_nrminf);
printf("\n==== x2 = inv(A2)*b2 \n");
for (int j = 0; j < n; j++) {
printf("x2[%d] = %f\n", j, X[n + j]);
}
float r2_nrminf;
residaul_eval(
n,
ds + n,
dl + n,
d + n,
du + n,
dw + n,
B + n,
X + n,
&r2_nrminf
);
printf("|b2 - A2*x2| = %E\n", r2_nrminf);
/* free resources */
// CHECK: if (d_ds0) hipFree(d_ds0);
if (d_ds0) cudaFree(d_ds0);
// CHECK: if (d_dl0) hipFree(d_dl0);
if (d_dl0) cudaFree(d_dl0);
// CHECK: if (d_d0) hipFree(d_d0);
if (d_d0) cudaFree(d_d0);
// CHECK: if (d_du0) hipFree(d_du0);
if (d_du0) cudaFree(d_du0);
// CHECK: if (d_dw0) hipFree(d_dw0);
if (d_dw0) cudaFree(d_dw0);
// CHECK: if (d_ds) hipFree(d_ds);
if (d_ds) cudaFree(d_ds);
// CHECK: if (d_dl) hipFree(d_dl);
if (d_dl) cudaFree(d_dl);
// CHECK: if (d_d) hipFree(d_d);
if (d_d) cudaFree(d_d);
// CHECK: if (d_du) hipFree(d_du);
if (d_du) cudaFree(d_du);
// CHECK: if (d_dw) hipFree(d_dw);
if (d_dw) cudaFree(d_dw);
// CHECK: if (d_B) hipFree(d_B);
if (d_B) cudaFree(d_B);
// CHECK: if (d_X) hipFree(d_X);
if (d_X) cudaFree(d_X);
// CHECK: if (cusparseH) hipsparseDestroy(cusparseH);
if (cusparseH) cusparseDestroy(cusparseH);
// CHECK: if (cublasH) hipblasDestroy(cublasH);
if (cublasH) cublasDestroy(cublasH);
// CHECK: if (stream) hipStreamDestroy(stream);
if (stream) cudaStreamDestroy(stream);
// CHECK: hipDeviceReset();
cudaDeviceReset();
return 0;
}
@@ -0,0 +1,327 @@
// RUN: %run_test hipify "%s" "%t" %cuda_args
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
// CHECK: #include <hip/hip_runtime.h>
#include <cuda_runtime.h>
// CHECK: #include <hipsparse.h>
#include <cusparse.h>
// NOTE: CUDA 10.0
/* compute | b - A*x|_inf */
void residaul_eval(
int n,
// CHECK: const hipsparseMatDescr_t descrA,
const cusparseMatDescr_t descrA,
const float *csrVal,
const int *csrRowPtr,
const int *csrColInd,
const float *b,
const float *x,
float *r_nrminf_ptr)
{
// CHECK: const int base = (hipsparseGetMatIndexBase(descrA) != HIPSPARSE_INDEX_BASE_ONE) ? 0 : 1;
const int base = (cusparseGetMatIndexBase(descrA) != CUSPARSE_INDEX_BASE_ONE) ? 0 : 1;
// CHECK: const int lower = (HIPSPARSE_FILL_MODE_LOWER == hipsparseGetMatFillMode(descrA)) ? 1 : 0;
const int lower = (CUSPARSE_FILL_MODE_LOWER == cusparseGetMatFillMode(descrA)) ? 1 : 0;
// CHECK: const int unit = (HIPSPARSE_DIAG_TYPE_UNIT == hipsparseGetMatDiagType(descrA)) ? 1 : 0;
const int unit = (CUSPARSE_DIAG_TYPE_UNIT == cusparseGetMatDiagType(descrA)) ? 1 : 0;
float r_nrminf = 0;
for (int row = 0; row < n; row++) {
const int start = csrRowPtr[row] - base;
const int end = csrRowPtr[row + 1] - base;
float dot = 0;
for (int colidx = start; colidx < end; colidx++) {
const int col = csrColInd[colidx] - base;
float Aij = csrVal[colidx];
float xj = x[col];
if ((row == col) && unit) {
Aij = 1.0;
}
int valid = (row >= col) && lower ||
(row <= col) && !lower;
if (valid) {
dot += Aij * xj;
}
}
float ri = b[row] - dot;
r_nrminf = (r_nrminf > fabs(ri)) ? r_nrminf : fabs(ri);
}
*r_nrminf_ptr = r_nrminf;
}
int main(int argc, char*argv[])
{
// CHECK: hipsparseHandle_t handle = NULL;
cusparseHandle_t handle = NULL;
// CHECK: hipStream_t stream = NULL;
cudaStream_t stream = NULL;
// CHECK: hipsparseMatDescr_t descrA = NULL;
cusparseMatDescr_t descrA = NULL;
// NOTE: CUDA 10.0
// TODO: csrsm2Info_t info = NULL;
csrsm2Info_t info = NULL;
// CHECK: hipsparseStatus_t status = HIPSPARSE_STATUS_SUCCESS;
cusparseStatus_t status = CUSPARSE_STATUS_SUCCESS;
// CHECK: hipError_t cudaStat1 = hipSuccess;
cudaError_t cudaStat1 = cudaSuccess;
const int nrhs = 2;
const int n = 4;
const int nnzA = 9;
// CHECK: const hipsparseSolvePolicy_t policy = HIPSPARSE_SOLVE_POLICY_NO_LEVEL;
const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_NO_LEVEL;
const float h_one = 1.0;
/*
* | 1 0 2 -3 |
* | 0 4 0 0 |
* A = | 5 0 6 7 |
* | 0 8 0 9 |
*
* Regard A as a lower triangle matrix L with non-unit diagonal.
* | 1 5 | | 1 5 |
* Given B = | 2 6 |, X = L \ B = | 0.5 1.5 |
* | 3 7 | | -0.3333 -3 |
* | 4 8 | | 0 -0.4444 |
*/
const int csrRowPtrA[n + 1] = { 1, 4, 5, 8, 10 };
const int csrColIndA[nnzA] = { 1, 3, 4, 2, 1, 3, 4, 2, 4 };
const float csrValA[nnzA] = { 1, 2, -3, 4, 5, 6, 7, 8, 9 };
const float B[n*nrhs] = { 1,2,3,4,5,6,7,8 };
float X[n*nrhs];
int *d_csrRowPtrA = NULL;
int *d_csrColIndA = NULL;
float *d_csrValA = NULL;
float *d_B = NULL;
size_t lworkInBytes = 0;
char *d_work = NULL;
const int algo = 0; /* non-block version */
printf("example of csrsm2 \n");
/* step 1: create cusparse handle, bind a stream */
// CHECK: cudaStat1 = hipStreamCreateWithFlags(&stream, hipStreamNonBlocking);
cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: status = hipsparseCreate(&handle);
status = cusparseCreate(&handle);
// CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
assert(CUSPARSE_STATUS_SUCCESS == status);
status = cusparseSetStream(handle, stream);
// CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
assert(CUSPARSE_STATUS_SUCCESS == status);
// NOTE: CUDA 10.0
// TODO: status = hipsparseCreateCsrsm2Info(&info);
status = cusparseCreateCsrsm2Info(&info);
// CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
assert(CUSPARSE_STATUS_SUCCESS == status);
/* step 2: configuration of matrix A */
status = cusparseCreateMatDescr(&descrA);
// CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
assert(CUSPARSE_STATUS_SUCCESS == status);
/* A is base-1*/
// CHECK: hipsparseSetMatIndexBase(descrA, HIPSPARSE_INDEX_BASE_ONE);
cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE);
// CHECK: hipsparseSetMatType(descrA, HIPSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
/* A is lower triangle */
// CHECK: hipsparseSetMatFillMode(descrA, HIPSPARSE_FILL_MODE_LOWER);
cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_LOWER);
/* A has non unit diagonal */
// CHECK: hipsparseSetMatDiagType(descrA, HIPSPARSE_DIAG_TYPE_NON_UNIT);
cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_NON_UNIT);
// CHECK: cudaStat1 = hipMalloc((void**)&d_csrRowPtrA, sizeof(int)*(n + 1));
cudaStat1 = cudaMalloc((void**)&d_csrRowPtrA, sizeof(int)*(n + 1));
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_csrColIndA, sizeof(int)*nnzA);
cudaStat1 = cudaMalloc((void**)&d_csrColIndA, sizeof(int)*nnzA);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_csrValA, sizeof(float)*nnzA);
cudaStat1 = cudaMalloc((void**)&d_csrValA, sizeof(float)*nnzA);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMalloc((void**)&d_B, sizeof(float)*n*nrhs);
cudaStat1 = cudaMalloc((void**)&d_B, sizeof(float)*n*nrhs);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMemcpy(d_csrRowPtrA, csrRowPtrA, sizeof(int)*(n + 1), hipMemcpyHostToDevice);
cudaStat1 = cudaMemcpy(d_csrRowPtrA, csrRowPtrA, sizeof(int)*(n + 1), cudaMemcpyHostToDevice);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMemcpy(d_csrColIndA, csrColIndA, sizeof(int)*nnzA, hipMemcpyHostToDevice);
cudaStat1 = cudaMemcpy(d_csrColIndA, csrColIndA, sizeof(int)*nnzA, cudaMemcpyHostToDevice);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMemcpy(d_csrValA, csrValA, sizeof(float)*nnzA, hipMemcpyHostToDevice);
cudaStat1 = cudaMemcpy(d_csrValA, csrValA, sizeof(float)*nnzA, cudaMemcpyHostToDevice);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: cudaStat1 = hipMemcpy(d_B, B, sizeof(float)*n*nrhs, hipMemcpyHostToDevice);
cudaStat1 = cudaMemcpy(d_B, B, sizeof(float)*n*nrhs, cudaMemcpyHostToDevice);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
/* step 3: query workspace */
// NOTE: CUDA 10.0
// TODO: status = hipsparseScsrsm2_bufferSizeExt(
// CHECK: HIPSPARSE_OPERATION_NON_TRANSPOSE,
// CHECK: HIPSPARSE_OPERATION_NON_TRANSPOSE,
status = cusparseScsrsm2_bufferSizeExt(
handle,
algo,
CUSPARSE_OPERATION_NON_TRANSPOSE, /* transA */
CUSPARSE_OPERATION_NON_TRANSPOSE, /* transB */
n,
nrhs,
nnzA,
&h_one,
descrA,
d_csrValA,
d_csrRowPtrA,
d_csrColIndA,
d_B,
n, /* ldb */
info,
policy,
&lworkInBytes);
// CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
assert(CUSPARSE_STATUS_SUCCESS == status);
printf("lworkInBytes = %lld \n", (long long)lworkInBytes);
// CHECK: if (NULL != d_work) { hipFree(d_work); }
if (NULL != d_work) { cudaFree(d_work); }
// CHECK: cudaStat1 = hipMalloc((void**)&d_work, lworkInBytes);
cudaStat1 = cudaMalloc((void**)&d_work, lworkInBytes);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
/* step 4: analysis */
// NOTE: CUDA 10.0
// TODO: status = hipsparseScsrsm2_analysis(
// CHECK: HIPSPARSE_OPERATION_NON_TRANSPOSE,
// CHECK: HIPSPARSE_OPERATION_NON_TRANSPOSE,
status = cusparseScsrsm2_analysis(
handle,
algo,
CUSPARSE_OPERATION_NON_TRANSPOSE, /* transA */
CUSPARSE_OPERATION_NON_TRANSPOSE, /* transB */
n,
nrhs,
nnzA,
&h_one,
descrA,
d_csrValA,
d_csrRowPtrA,
d_csrColIndA,
d_B,
n, /* ldb */
info,
policy,
d_work);
// CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
assert(CUSPARSE_STATUS_SUCCESS == status);
/* step 5: solve L * X = B */
// NOTE: CUDA 10.0
// TODO: status = hipsparseScsrsm2_solve(
// CHECK: HIPSPARSE_OPERATION_NON_TRANSPOSE,
// CHECK: HIPSPARSE_OPERATION_NON_TRANSPOSE,
status = cusparseScsrsm2_solve(
handle,
algo,
CUSPARSE_OPERATION_NON_TRANSPOSE, /* transA */
CUSPARSE_OPERATION_NON_TRANSPOSE, /* transB */
n,
nrhs,
nnzA,
&h_one,
descrA,
d_csrValA,
d_csrRowPtrA,
d_csrColIndA,
d_B,
n, /* ldb */
info,
policy,
d_work);
// CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status);
assert(CUSPARSE_STATUS_SUCCESS == status);
// CHECK: cudaStat1 = hipDeviceSynchronize();
cudaStat1 = cudaDeviceSynchronize();
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
/* step 6:measure residual B - A*X */
// CHECK: cudaStat1 = hipMemcpy(X, d_B, sizeof(float)*n*nrhs, hipMemcpyDeviceToHost);
cudaStat1 = cudaMemcpy(X, d_B, sizeof(float)*n*nrhs, cudaMemcpyDeviceToHost);
// CHECK: assert(hipSuccess == cudaStat1);
assert(cudaSuccess == cudaStat1);
// CHECK: hipDeviceSynchronize();
cudaDeviceSynchronize();
printf("==== x1 = inv(A)*b1 \n");
for (int j = 0; j < n; j++) {
printf("x1[%d] = %f\n", j, X[j]);
}
float r1_nrminf;
residaul_eval(
n,
descrA,
csrValA,
csrRowPtrA,
csrColIndA,
B,
X,
&r1_nrminf
);
printf("|b1 - A*x1| = %E\n", r1_nrminf);
printf("==== x2 = inv(A)*b2 \n");
for (int j = 0; j < n; j++) {
printf("x2[%d] = %f\n", j, X[n + j]);
}
float r2_nrminf;
residaul_eval(
n,
descrA,
csrValA,
csrRowPtrA,
csrColIndA,
B + n,
X + n,
&r2_nrminf
);
printf("|b2 - A*x2| = %E\n", r2_nrminf);
/* free resources */
// CHECK: if (d_csrRowPtrA) hipFree(d_csrRowPtrA);
if (d_csrRowPtrA) cudaFree(d_csrRowPtrA);
// CHECK: if (d_csrColIndA) hipFree(d_csrColIndA);
if (d_csrColIndA) cudaFree(d_csrColIndA);
// CHECK: if (d_csrValA) hipFree(d_csrValA);
if (d_csrValA) cudaFree(d_csrValA);
// CHECK: if (d_B) hipFree(d_B);
if (d_B) cudaFree(d_B);
// CHECK: if (handle) hipsparseDestroy(handle);
if (handle) cusparseDestroy(handle);
// CHECK: if (stream) hipStreamDestroy(stream);
if (stream) cudaStreamDestroy(stream);
// CHECK: if (descrA) hipsparseDestroyMatDescr(descrA);
if (descrA) cusparseDestroyMatDescr(descrA);
// NOTE: CUDA 10.0
// TODO: if (info) hipsparseDestroyCsrsm2Info(info);
if (info) cusparseDestroyCsrsm2Info(info);
// CHECK: hipDeviceReset();
cudaDeviceReset();
return 0;
}
+3
View File
@@ -26,6 +26,9 @@ config.excludes = ['cmdparser.hpp']
config.cuda_version = "@CUDA_VERSION@"
if config.cuda_version not in ['10.0']:
config.excludes.append('cuSPARSE_08.cu')
config.excludes.append('cuSPARSE_09.cu')
config.excludes.append('cuSPARSE_10.cu')
config.excludes.append('cuSPARSE_11.cu')
# test_exec_root: The path where tests are located (default is the test suite root).
#config.test_exec_root = config.test_source_root