diff --git a/projects/clr/hipamd/docs/markdown/CUSPARSE_API_supported_by_HIP.md b/projects/clr/hipamd/docs/markdown/CUSPARSE_API_supported_by_HIP.md index f552246f11..4dadea2c93 100644 --- a/projects/clr/hipamd/docs/markdown/CUSPARSE_API_supported_by_HIP.md +++ b/projects/clr/hipamd/docs/markdown/CUSPARSE_API_supported_by_HIP.md @@ -576,7 +576,7 @@ |`cusparseXcoosortByColumn` |`hipsparseXcoosortByColumn` | |`cusparseXcsrsort_bufferSizeExt` |`hipsparseXcsrsort_bufferSizeExt` | |`cusparseXcsrsort` |`hipsparseXcsrsort` | -|`cusparseScusparseXcscsort_bufferSizeExtnnz` | | +|`cusparseXcscsort_bufferSizeExt` | | |`cusparseXcscsort` | | |`cusparseCreateCsru2csrInfo` | | |`cusparseDestroyCsru2csrInfo` | | diff --git a/projects/clr/hipamd/hipify-clang/src/CUDA2HIP_SPARSE_API_functions.cpp b/projects/clr/hipamd/hipify-clang/src/CUDA2HIP_SPARSE_API_functions.cpp index f3b0f0eb99..29726a5048 100644 --- a/projects/clr/hipamd/hipify-clang/src/CUDA2HIP_SPARSE_API_functions.cpp +++ b/projects/clr/hipamd/hipify-clang/src/CUDA2HIP_SPARSE_API_functions.cpp @@ -554,7 +554,7 @@ const std::map CUDA_SPARSE_FUNCTION_MAP{ {"cusparseXcsrsort_bufferSizeExt", {"hipsparseXcsrsort_bufferSizeExt", CONV_LIB_FUNC, API_SPARSE}}, {"cusparseXcsrsort", {"hipsparseXcsrsort", CONV_LIB_FUNC, API_SPARSE}}, - {"cusparseScusparseXcscsort_bufferSizeExtnnz", {"hipsparseXcscsort_bufferSizeExt", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, + {"cusparseXcscsort_bufferSizeExt", {"hipsparseXcscsort_bufferSizeExt", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, {"cusparseXcscsort", {"hipsparseXcscsort", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, {"cusparseCreateCsru2csrInfo", {"hipsparseCreateCsru2csrInfo", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, diff --git a/projects/clr/hipamd/tests/hipify-clang/cuSPARSE/cuSPARSE_08.cu b/projects/clr/hipamd/tests/hipify-clang/cuSPARSE/cuSPARSE_08.cu index fcfde8d3b2..cf788c8f33 100644 --- a/projects/clr/hipamd/tests/hipify-clang/cuSPARSE/cuSPARSE_08.cu +++ b/projects/clr/hipamd/tests/hipify-clang/cuSPARSE/cuSPARSE_08.cu @@ -277,8 +277,8 @@ int main(int argc, char*argv[]) assert(CUBLAS_STATUS_SUCCESS == cublasStat); /* step 4: prepare workspace */ // NOTE: CUDA 10.0 - // CHECK: status = hipsparseSgtsvInterleavedBatch_bufferSizeExt( - status = cusparseSgtsvInterleavedBatch_bufferSizeExt( + // TODO: status = hipsparseSgtsvInterleavedBatch_bufferSizeExt( + status = cusparseSgtsvInterleavedBatch_bufferSizeExt( cusparseH, algo, n, @@ -299,7 +299,7 @@ int main(int argc, char*argv[]) /* step 5: solve Aj*xj = bj */ // NOTE: CUDA 10.0 - // CHECK: status = hipsparseSgtsvInterleavedBatch( + // TODO: status = hipsparseSgtsvInterleavedBatch( status = cusparseSgtsvInterleavedBatch( cusparseH, algo, diff --git a/projects/clr/hipamd/tests/hipify-clang/cuSPARSE/cuSPARSE_09.cu b/projects/clr/hipamd/tests/hipify-clang/cuSPARSE/cuSPARSE_09.cu new file mode 100644 index 0000000000..9b1aaf623f --- /dev/null +++ b/projects/clr/hipamd/tests/hipify-clang/cuSPARSE/cuSPARSE_09.cu @@ -0,0 +1,414 @@ +// RUN: %run_test hipify "%s" "%t" %cuda_args +#include +#include +#include +// CHECK: #include +#include +// CHECK: #include +#include +// CHECK: #include +#include + +// NOTE: CUDA 10.0 + +/* + * compute | b - A*x|_inf + */ +void residaul_eval( + int n, + const float *dl, + const float *d, + const float *du, + const float *b, + const float *x, + float *r_nrminf_ptr) +{ + float r_nrminf = 0; + for (int i = 0; i < n; i++) { + float dot = 0; + if (i > 0) { + dot += dl[i] * x[i - 1]; + } + dot += d[i] * x[i]; + if (i < (n - 1)) { + dot += du[i] * x[i + 1]; + } + float ri = b[i] - dot; + r_nrminf = (r_nrminf > fabs(ri)) ? r_nrminf : fabs(ri); + } + + *r_nrminf_ptr = r_nrminf; +} + +int main(int argc, char*argv[]) +{ + // CHECK: hipsparseHandle_t cusparseH = NULL; + cusparseHandle_t cusparseH = NULL; + // CHECK: hipblasHandle_t cublasH = NULL; + cublasHandle_t cublasH = NULL; + // CHECK: hipStream_t stream = NULL; + cudaStream_t stream = NULL; + // CHECK: hipsparseStatus_t status = HIPSPARSE_STATUS_SUCCESS; + cusparseStatus_t status = CUSPARSE_STATUS_SUCCESS; + // CHECK: hipblasStatus_t cublasStat = HIPBLAS_STATUS_SUCCESS; + cublasStatus_t cublasStat = CUBLAS_STATUS_SUCCESS; + // CHECK: hipError_t cudaStat1 = hipSuccess; + cudaError_t cudaStat1 = cudaSuccess; + + const int n = 3; + const int batchSize = 2; + /* + * | 1 6 0 | | 1 | | -0.603960 | + * A1 =| 4 2 7 |, b1 = | 2 |, x1 = | 0.267327 | + * | 0 5 3 | | 3 | | 0.554455 | + * + * | 8 13 0 | | 4 | | -0.063291 | + * A2 =| 11 9 14 |, b2 = | 5 |, x2 = | 0.346641 | + * | 0 12 10 | | 6 | | 0.184031 | + */ + + /* + * A = (dl, d, du), B and X are in aggregate format + */ + const float dl[n * batchSize] = { 0, 4, 5, 0, 11, 12 }; + const float d[n * batchSize] = { 1, 2, 3, 8, 9, 10 }; + const float du[n * batchSize] = { 6, 7, 0, 13, 14, 0 }; + const float B[n * batchSize] = { 1, 2, 3, 4, 5, 6 }; + float X[n * batchSize]; /* Xj = Aj \ Bj */ + +/* device memory + * (d_dl0, d_d0, d_du0) is aggregate format + * (d_dl, d_d, d_du) is interleaved format + */ + float *d_dl0 = NULL; + float *d_d0 = NULL; + float *d_du0 = NULL; + float *d_dl = NULL; + float *d_d = NULL; + float *d_du = NULL; + float *d_B = NULL; + float *d_X = NULL; + + size_t lworkInBytes = 0; + char *d_work = NULL; + + /* + * algo = 0: cuThomas (unstable) + * algo = 1: LU with pivoting (stable) + * algo = 2: QR (stable) + */ + const int algo = 2; + + const float h_one = 1; + const float h_zero = 0; + + printf("example of gtsv (interleaved format) \n"); + printf("choose algo = 0,1,2 to select different algorithms \n"); + printf("n = %d, batchSize = %d, algo = %d \n", n, batchSize, algo); + + /* step 1: create cusparse/cublas handle, bind a stream */ + // CHECK: cudaStat1 = hipStreamCreateWithFlags(&stream, hipStreamNonBlocking); + cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: status = hipsparseCreate(&cusparseH); + status = cusparseCreate(&cusparseH); + // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status); + assert(CUSPARSE_STATUS_SUCCESS == status); + // CHECK: status = hipsparseSetStream(cusparseH, stream); + status = cusparseSetStream(cusparseH, stream); + // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status); + assert(CUSPARSE_STATUS_SUCCESS == status); + // CHECK: cublasStat = hipblasCreate(&cublasH); + cublasStat = cublasCreate(&cublasH); + // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat); + assert(CUBLAS_STATUS_SUCCESS == cublasStat); + // CHECK: cublasStat = hipblasSetStream(cublasH, stream); + cublasStat = cublasSetStream(cublasH, stream); + // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat); + assert(CUBLAS_STATUS_SUCCESS == cublasStat); + + /* step 2: allocate device memory */ + // CHECK: cudaStat1 = hipMalloc((void**)&d_dl0, sizeof(float)*n*batchSize); + cudaStat1 = cudaMalloc((void**)&d_dl0, sizeof(float)*n*batchSize); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_d0, sizeof(float)*n*batchSize); + cudaStat1 = cudaMalloc((void**)&d_d0, sizeof(float)*n*batchSize); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_du0, sizeof(float)*n*batchSize); + cudaStat1 = cudaMalloc((void**)&d_du0, sizeof(float)*n*batchSize); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_dl, sizeof(float)*n*batchSize); + cudaStat1 = cudaMalloc((void**)&d_dl, sizeof(float)*n*batchSize); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_d, sizeof(float)*n*batchSize); + cudaStat1 = cudaMalloc((void**)&d_d, sizeof(float)*n*batchSize); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_du, sizeof(float)*n*batchSize); + cudaStat1 = cudaMalloc((void**)&d_du, sizeof(float)*n*batchSize); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_B, sizeof(float)*n*batchSize); + cudaStat1 = cudaMalloc((void**)&d_B, sizeof(float)*n*batchSize); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_X, sizeof(float)*n*batchSize); + cudaStat1 = cudaMalloc((void**)&d_X, sizeof(float)*n*batchSize); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + + /* step 3: prepare data in device, interleaved format */ + // CHECK: cudaStat1 = hipMemcpy(d_dl0, dl, sizeof(float)*n*batchSize, hipMemcpyHostToDevice); + cudaStat1 = cudaMemcpy(d_dl0, dl, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMemcpy(d_d0, d, sizeof(float)*n*batchSize, hipMemcpyHostToDevice); + cudaStat1 = cudaMemcpy(d_d0, d, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMemcpy(d_du0, du, sizeof(float)*n*batchSize, hipMemcpyHostToDevice); + cudaStat1 = cudaMemcpy(d_du0, du, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMemcpy(d_B, B, sizeof(float)*n*batchSize, hipMemcpyHostToDevice); + cudaStat1 = cudaMemcpy(d_B, B, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: hipDeviceSynchronize(); + cudaDeviceSynchronize(); + /* convert dl to interleaved format + * dl = transpose(dl0) + */ + // CHECK: cublasStat = hipblasSgeam( + // CHECK: HIPBLAS_OP_T, + // CHECK: HIPBLAS_OP_T, + cublasStat = cublasSgeam( + cublasH, + CUBLAS_OP_T, /* transa */ + CUBLAS_OP_T, /* transb, don't care */ + batchSize, /* number of rows of dl */ + n, /* number of columns of dl */ + &h_one, + d_dl0, /* dl0 is n-by-batchSize */ + n, /* leading dimension of dl0 */ + &h_zero, + NULL, + n, /* don't care */ + d_dl, /* dl is batchSize-by-n */ + batchSize /* leading dimension of dl */ + ); + // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat); + assert(CUBLAS_STATUS_SUCCESS == cublasStat); + /* convert d to interleaved format + * d = transpose(d0) + */ + // CHECK: cublasStat = hipblasSgeam( + // CHECK: HIPBLAS_OP_T + // CHECK: HIPBLAS_OP_T + cublasStat = cublasSgeam( + cublasH, + CUBLAS_OP_T, /* transa */ + CUBLAS_OP_T, /* transb, don't care */ + batchSize, /* number of rows of d */ + n, /* number of columns of d */ + &h_one, + d_d0, /* d0 is n-by-batchSize */ + n, /* leading dimension of d0 */ + &h_zero, + NULL, + n, /* don't cae */ + d_d, /* d is batchSize-by-n */ + batchSize /* leading dimension of d */ + ); + // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat); + assert(CUBLAS_STATUS_SUCCESS == cublasStat); + + /* convert du to interleaved format + * du = transpose(du0) + */ + // CHECK: cublasStat = hipblasSgeam( + // CHECK: HIPBLAS_OP_T + // CHECK: HIPBLAS_OP_T + cublasStat = cublasSgeam( + cublasH, + CUBLAS_OP_T, /* transa */ + CUBLAS_OP_T, /* transb, don't care */ + batchSize, /* number of rows of du */ + n, /* number of columns of du */ + &h_one, + d_du0, /* du0 is n-by-batchSize */ + n, /* leading dimension of du0 */ + &h_zero, + NULL, + n, /* don't cae */ + d_du, /* du is batchSize-by-n */ + batchSize /* leading dimension of du */ + ); + // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat); + assert(CUBLAS_STATUS_SUCCESS == cublasStat); + + /* convert B to interleaved format + * X = transpose(B) + */ + // CHECK: cublasStat = hipblasSgeam( + // CHECK: HIPBLAS_OP_T + // CHECK: HIPBLAS_OP_T + cublasStat = cublasSgeam( + cublasH, + CUBLAS_OP_T, /* transa */ + CUBLAS_OP_T, /* transb, don't care */ + batchSize, /* number of rows of X */ + n, /* number of columns of X */ + &h_one, + d_B, /* B is n-by-batchSize */ + n, /* leading dimension of B */ + &h_zero, + NULL, + n, /* don't cae */ + d_X, /* X is batchSize-by-n */ + batchSize /* leading dimension of X */ + ); + // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat); + assert(CUBLAS_STATUS_SUCCESS == cublasStat); + /* step 4: prepare workspace */ + // NOTE: CUDA 10.0 + // TODO: status = hipsparseSgtsvInterleavedBatch_bufferSizeExt( + status = cusparseSgtsvInterleavedBatch_bufferSizeExt( + cusparseH, + algo, + n, + d_dl, + d_d, + d_du, + d_X, + batchSize, + &lworkInBytes); + // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status); + assert(CUSPARSE_STATUS_SUCCESS == status); + + printf("lworkInBytes = %lld \n", (long long)lworkInBytes); + // CHECK: cudaStat1 = hipMalloc((void**)&d_work, lworkInBytes); + cudaStat1 = cudaMalloc((void**)&d_work, lworkInBytes); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + + /* step 5: solve Aj*xj = bj */ + // NOTE: CUDA 10.0 + // TODO: status = hipsparseSgtsvInterleavedBatch( + status = cusparseSgtsvInterleavedBatch( + cusparseH, + algo, + n, + d_dl, + d_d, + d_du, + d_X, + batchSize, + d_work); + // CHECK: cudaStat1 = hipDeviceSynchronize(); + cudaStat1 = cudaDeviceSynchronize(); + // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status); + assert(CUSPARSE_STATUS_SUCCESS == status); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + + /* step 6: convert X back to aggregate format */ + /* B = transpose(X) */ + // CHECK: cublasStat = hipblasSgeam( + // CHECK: HIPBLAS_OP_T + // CHECK: HIPBLAS_OP_T + cublasStat = cublasSgeam( + cublasH, + CUBLAS_OP_T, /* transa */ + CUBLAS_OP_T, /* transb, don't care */ + n, /* number of rows of B */ + batchSize, /* number of columns of B */ + &h_one, + d_X, /* X is batchSize-by-n */ + batchSize, /* leading dimension of X */ + &h_zero, + NULL, + n, /* don't cae */ + d_B, /* B is n-by-batchSize */ + n /* leading dimension of B */ + ); + // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat); + assert(CUBLAS_STATUS_SUCCESS == cublasStat); + // CHECK: hipDeviceSynchronize(); + cudaDeviceSynchronize(); + + /* step 7: residual evaluation */ + // CHECK: cudaStat1 = hipMemcpy(X, d_B, sizeof(float)*n*batchSize, hipMemcpyDeviceToHost); + cudaStat1 = cudaMemcpy(X, d_B, sizeof(float)*n*batchSize, cudaMemcpyDeviceToHost); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: hipDeviceSynchronize(); + cudaDeviceSynchronize(); + + printf("==== x1 = inv(A1)*b1 \n"); + for (int j = 0; j < n; j++) { + printf("x1[%d] = %f\n", j, X[j]); + } + + float r1_nrminf; + residaul_eval( + n, + dl, + d, + du, + B, + X, + &r1_nrminf + ); + printf("|b1 - A1*x1| = %E\n", r1_nrminf); + + printf("\n==== x2 = inv(A2)*b2 \n"); + for (int j = 0; j < n; j++) { + printf("x2[%d] = %f\n", j, X[n + j]); + } + + float r2_nrminf; + residaul_eval( + n, + dl + n, + d + n, + du + n, + B + n, + X + n, + &r2_nrminf + ); + printf("|b2 - A2*x2| = %E\n", r2_nrminf); + + /* free resources */ + // CHECK: if (d_dl0) hipFree(d_dl0); + if (d_dl0) cudaFree(d_dl0); + // CHECK: if (d_d0) hipFree(d_d0); + if (d_d0) cudaFree(d_d0); + // CHECK: if (d_du0) hipFree(d_du0); + if (d_du0) cudaFree(d_du0); + // CHECK: if (d_dl) hipFree(d_dl); + if (d_dl) cudaFree(d_dl); + // CHECK: if (d_d) hipFree(d_d); + if (d_d) cudaFree(d_d); + // CHECK: if (d_du) hipFree(d_du); + if (d_du) cudaFree(d_du); + // CHECK: if (d_B) hipFree(d_B); + if (d_B) cudaFree(d_B); + // CHECK: if (d_X) hipFree(d_X); + if (d_X) cudaFree(d_X); + // CHECK: if (cusparseH) hipsparseDestroy(cusparseH); + if (cusparseH) cusparseDestroy(cusparseH); + // CHECK: if (cublasH) hipblasDestroy(cublasH); + if (cublasH) cublasDestroy(cublasH); + // CHECK: if (stream) hipStreamDestroy(stream); + if (stream) cudaStreamDestroy(stream); + // CHECK: hipDeviceReset(); + cudaDeviceReset(); + + return 0; +} diff --git a/projects/clr/hipamd/tests/hipify-clang/cuSPARSE/cuSPARSE_10.cu b/projects/clr/hipamd/tests/hipify-clang/cuSPARSE/cuSPARSE_10.cu new file mode 100644 index 0000000000..326c231de4 --- /dev/null +++ b/projects/clr/hipamd/tests/hipify-clang/cuSPARSE/cuSPARSE_10.cu @@ -0,0 +1,507 @@ +// RUN: %run_test hipify "%s" "%t" %cuda_args +#include +#include +#include +// CHECK: #include +#include +// CHECK: #include +#include +// CHECK: #include +#include + +// NOTE: CUDA 10.0 + +/* + * compute | b - A*x|_inf + */ +void residaul_eval( + int n, + const float *ds, + const float *dl, + const float *d, + const float *du, + const float *dw, + const float *b, + const float *x, + float *r_nrminf_ptr) +{ + float r_nrminf = 0; + for (int i = 0; i < n; i++) { + float dot = 0; + if (i > 1) { + dot += ds[i] * x[i - 2]; + } + if (i > 0) { + dot += dl[i] * x[i - 1]; + } + dot += d[i] * x[i]; + if (i < (n - 1)) { + dot += du[i] * x[i + 1]; + } + if (i < (n - 2)) { + dot += dw[i] * x[i + 2]; + } + float ri = b[i] - dot; + r_nrminf = (r_nrminf > fabs(ri)) ? r_nrminf : fabs(ri); + } + + *r_nrminf_ptr = r_nrminf; +} + +int main(int argc, char*argv[]) +{ + // CHECK: hipsparseHandle_t cusparseH = NULL; + cusparseHandle_t cusparseH = NULL; + // CHECK: hipblasHandle_t cublasH = NULL; + cublasHandle_t cublasH = NULL; + // CHECK: hipStream_t stream = NULL; + cudaStream_t stream = NULL; + // CHECK: hipsparseStatus_t status = HIPSPARSE_STATUS_SUCCESS; + cusparseStatus_t status = CUSPARSE_STATUS_SUCCESS; + // CHECK: hipblasStatus_t cublasStat = HIPBLAS_STATUS_SUCCESS; + cublasStatus_t cublasStat = CUBLAS_STATUS_SUCCESS; + // CHECK: hipError_t cudaStat1 = hipSuccess; + cudaError_t cudaStat1 = cudaSuccess; + + const int n = 4; + const int batchSize = 2; + + /* + * | 1 8 13 0 | | 1 | | -0.0592 | + * A1 =| 5 2 9 14 |, b1 = | 2 |, x1 = | 0.3428 | + * | 11 6 3 10 | | 3 | | -0.1295 | + * | 0 12 7 4 | | 4 | | 0.1982 | + * + * | 15 22 27 0 | | 5 | | -0.0012 | + * A2 =| 19 16 23 28 |, b2 = | 6 |, x2 = | 0.2792 | + * | 25 20 17 24 | | 7 | | -0.0416 | + * | 0 26 21 18 | | 8 | | 0.0898 | + */ + + /* + * A = (ds, dl, d, du, dw), B and X are in aggregate format + */ + const float ds[n * batchSize] = { 0, 0, 11, 12, 0, 0, 25, 26 }; + const float dl[n * batchSize] = { 0, 5, 6, 7, 0, 19, 20, 21 }; + const float d[n * batchSize] = { 1, 2, 3, 4, 15, 16, 17, 18 }; + const float du[n * batchSize] = { 8, 9, 10, 0, 22, 23, 24, 0 }; + const float dw[n * batchSize] = { 13,14, 0, 0, 27, 28, 0, 0 }; + const float B[n * batchSize] = { 1, 2, 3, 4, 5, 6, 7, 8 }; + float X[n * batchSize]; /* Xj = Aj \ Bj */ + +/* device memory + * (d_ds0, d_dl0, d_d0, d_du0, d_dw0) is aggregate format + * (d_ds, d_dl, d_d, d_du, d_dw) is interleaved format + */ + float *d_ds0 = NULL; + float *d_dl0 = NULL; + float *d_d0 = NULL; + float *d_du0 = NULL; + float *d_dw0 = NULL; + float *d_ds = NULL; + float *d_dl = NULL; + float *d_d = NULL; + float *d_du = NULL; + float *d_dw = NULL; + float *d_B = NULL; + float *d_X = NULL; + + size_t lworkInBytes = 0; + char *d_work = NULL; + + const float h_one = 1; + const float h_zero = 0; + + int algo = 0; /* QR factorization */ + + printf("example of gpsv (interleaved format) \n"); + printf("n = %d, batchSize = %d\n", n, batchSize); + + /* step 1: create cusparse/cublas handle, bind a stream */ + // CHECK: cudaStat1 = hipStreamCreateWithFlags(&stream, hipStreamNonBlocking); + cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: status = hipsparseCreate(&cusparseH); + status = cusparseCreate(&cusparseH); + // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status); + assert(CUSPARSE_STATUS_SUCCESS == status); + // CHECK: status = hipsparseSetStream(cusparseH, stream); + status = cusparseSetStream(cusparseH, stream); + // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status); + assert(CUSPARSE_STATUS_SUCCESS == status); + // CHECK: cublasStat = hipblasCreate(&cublasH); + cublasStat = cublasCreate(cublasH); + // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat); + assert(CUBLAS_STATUS_SUCCESS == cublasStat); + // CHECK: cublasStat = hipblasSetStream(cublasH, stream); + cublasStat = cublasSetStream(cublasH, stream); + // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat); + assert(CUBLAS_STATUS_SUCCESS == cublasStat); + /* step 2: allocate device memory */ + // CHECK: cudaStat1 = hipMalloc((void**)&d_ds0, sizeof(float)*n*batchSize); + cudaStat1 = cudaMalloc((void**)&d_ds0, sizeof(float)*n*batchSize); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_dl0, sizeof(float)*n*batchSize); + cudaStat1 = cudaMalloc((void**)&d_dl0, sizeof(float)*n*batchSize); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_d0, sizeof(float)*n*batchSize); + cudaStat1 = cudaMalloc((void**)&d_d0, sizeof(float)*n*batchSize); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_du0, sizeof(float)*n*batchSize); + cudaStat1 = cudaMalloc((void**)&d_du0, sizeof(float)*n*batchSize); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_dw0, sizeof(float)*n*batchSize); + cudaStat1 = cudaMalloc((void**)&d_dw0, sizeof(float)*n*batchSize); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_ds, sizeof(float)*n*batchSize); + cudaStat1 = cudaMalloc((void**)&d_ds, sizeof(float)*n*batchSize); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_dl, sizeof(float)*n*batchSize); + cudaStat1 = cudaMalloc((void**)&d_dl, sizeof(float)*n*batchSize); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_d, sizeof(float)*n*batchSize); + cudaStat1 = cudaMalloc((void**)&d_d, sizeof(float)*n*batchSize); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_du, sizeof(float)*n*batchSize); + cudaStat1 = cudaMalloc((void**)&d_du, sizeof(float)*n*batchSize); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_dw, sizeof(float)*n*batchSize); + cudaStat1 = cudaMalloc((void**)&d_dw, sizeof(float)*n*batchSize); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_B, sizeof(float)*n*batchSize); + cudaStat1 = cudaMalloc((void**)&d_B, sizeof(float)*n*batchSize); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_X, sizeof(float)*n*batchSize); + cudaStat1 = cudaMalloc((void**)&d_X, sizeof(float)*n*batchSize); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + /* step 3: prepare data in device, interleaved format */ + // CHECK: cudaStat1 = hipMemcpy(d_ds0, ds, sizeof(float)*n*batchSize, hipMemcpyHostToDevice); + cudaStat1 = cudaMemcpy(d_ds0, ds, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMemcpy(d_dl0, dl, sizeof(float)*n*batchSize, hipMemcpyHostToDevice); + cudaStat1 = cudaMemcpy(d_dl0, dl, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMemcpy(d_d0, d, sizeof(float)*n*batchSize, hipMemcpyHostToDevice); + cudaStat1 = cudaMemcpy(d_d0, d, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMemcpy(d_du0, du, sizeof(float)*n*batchSize, hipMemcpyHostToDevice); + cudaStat1 = cudaMemcpy(d_du0, du, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMemcpy(d_dw0, dw, sizeof(float)*n*batchSize, hipMemcpyHostToDevice); + cudaStat1 = cudaMemcpy(d_dw0, dw, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMemcpy(d_B, B, sizeof(float)*n*batchSize, hipMemcpyHostToDevice); + cudaStat1 = cudaMemcpy(d_B, B, sizeof(float)*n*batchSize, cudaMemcpyHostToDevice); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: hipDeviceSynchronize(); + cudaDeviceSynchronize(); + /* convert ds to interleaved format + * ds = transpose(ds0) */ + // CHECK: cublasStat = hipblasSgeam( + // CHECK: HIPBLAS_OP_T, + // CHECK: HIPBLAS_OP_T, + cublasStat = cublasSgeam( + cublasH, + CUBLAS_OP_T, /* transa */ + CUBLAS_OP_T, /* transb, don't care */ + batchSize, /* number of rows of ds */ + n, /* number of columns of ds */ + &h_one, + d_ds0, /* ds0 is n-by-batchSize */ + n, /* leading dimension of ds0 */ + &h_zero, + NULL, + n, /* don't cae */ + d_ds, /* ds is batchSize-by-n */ + batchSize); /* leading dimension of ds */ + // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat); + assert(CUBLAS_STATUS_SUCCESS == cublasStat); + /* convert dl to interleaved format + * dl = transpose(dl0) + */ + // CHECK: cublasStat = hipblasSgeam( + // CHECK: HIPBLAS_OP_T, + // CHECK: HIPBLAS_OP_T, + cublasStat = cublasSgeam( + cublasH, + CUBLAS_OP_T, /* transa */ + CUBLAS_OP_T, /* transb, don't care */ + batchSize, /* number of rows of dl */ + n, /* number of columns of dl */ + &h_one, + d_dl0, /* dl0 is n-by-batchSize */ + n, /* leading dimension of dl0 */ + &h_zero, + NULL, + n, /* don't cae */ + d_dl, /* dl is batchSize-by-n */ + batchSize /* leading dimension of dl */ + ); + // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat); + assert(CUBLAS_STATUS_SUCCESS == cublasStat); + + /* convert d to interleaved format + * d = transpose(d0) + */ + // CHECK: cublasStat = hipblasSgeam( + // CHECK: HIPBLAS_OP_T, + // CHECK: HIPBLAS_OP_T, + cublasStat = cublasSgeam( + cublasH, + CUBLAS_OP_T, /* transa */ + CUBLAS_OP_T, /* transb, don't care */ + batchSize, /* number of rows of d */ + n, /* number of columns of d */ + &h_one, + d_d0, /* d0 is n-by-batchSize */ + n, /* leading dimension of d0 */ + &h_zero, + NULL, + n, /* don't cae */ + d_d, /* d is batchSize-by-n */ + batchSize /* leading dimension of d */ + ); + // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat); + assert(CUBLAS_STATUS_SUCCESS == cublasStat); + + /* convert du to interleaved format + * du = transpose(du0) + */ + // CHECK: cublasStat = hipblasSgeam( + // CHECK: HIPBLAS_OP_T, + // CHECK: HIPBLAS_OP_T, + cublasStat = cublasSgeam( + cublasH, + CUBLAS_OP_T, /* transa */ + CUBLAS_OP_T, /* transb, don't care */ + batchSize, /* number of rows of du */ + n, /* number of columns of du */ + &h_one, + d_du0, /* du0 is n-by-batchSize */ + n, /* leading dimension of du0 */ + &h_zero, + NULL, + n, /* don't cae */ + d_du, /* du is batchSize-by-n */ + batchSize /* leading dimension of du */ + ); + // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat); + assert(CUBLAS_STATUS_SUCCESS == cublasStat); + /* convert dw to interleaved format + * dw = transpose(dw0) + */ + // CHECK: cublasStat = hipblasSgeam( + // CHECK: HIPBLAS_OP_T, + // CHECK: HIPBLAS_OP_T, + cublasStat = cublasSgeam( + cublasH, + CUBLAS_OP_T, /* transa */ + CUBLAS_OP_T, /* transb, don't care */ + batchSize, /* number of rows of dw */ + n, /* number of columns of dw */ + &h_one, + d_dw0, /* dw0 is n-by-batchSize */ + n, /* leading dimension of dw0 */ + &h_zero, + NULL, + n, /* don't cae */ + d_dw, /* dw is batchSize-by-n */ + batchSize /* leading dimension of dw */ + ); + // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat); + assert(CUBLAS_STATUS_SUCCESS == cublasStat); + + /* convert B to interleaved format + * X = transpose(B) + */ + // CHECK: cublasStat = hipblasSgeam( + // CHECK: HIPBLAS_OP_T, + // CHECK: HIPBLAS_OP_T, + cublasStat = cublasSgeam( + cublasH, + CUBLAS_OP_T, /* transa */ + CUBLAS_OP_T, /* transb, don't care */ + batchSize, /* number of rows of X */ + n, /* number of columns of X */ + &h_one, + d_B, /* B is n-by-batchSize */ + n, /* leading dimension of B */ + &h_zero, + NULL, + n, /* don't cae */ + d_X, /* X is batchSize-by-n */ + batchSize /* leading dimension of X */ + ); + // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat); + assert(CUBLAS_STATUS_SUCCESS == cublasStat); + + /* step 4: prepare workspace */ + // NOTE: CUDA 10.0 + // TODO: status = hipsparseSgpsvInterleavedBatch_bufferSizeExt( + status = cusparseSgpsvInterleavedBatch_bufferSizeExt( + cusparseH, + algo, + n, + d_ds, + d_dl, + d_d, + d_du, + d_dw, + d_X, + batchSize, + &lworkInBytes); + // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status); + assert(CUSPARSE_STATUS_SUCCESS == status); + + printf("lworkInBytes = %lld \n", (long long)lworkInBytes); + // CHECK: cudaStat1 = hipMalloc((void**)&d_work, lworkInBytes); + cudaStat1 = cudaMalloc((void**)&d_work, lworkInBytes); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + /* step 5: solve Aj*xj = bj */ + // NOTE: CUDA 10.0 + // TODO: status = hipsparseSgpsvInterleavedBatch( + status = cusparseSgpsvInterleavedBatch( + cusparseH, + algo, + n, + d_ds, + d_dl, + d_d, + d_du, + d_dw, + d_X, + batchSize, + d_work); + // CHECK: cudaStat1 = hipDeviceSynchronize(); + cudaStat1 = cudaDeviceSynchronize(); + // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status); + assert(CUSPARSE_STATUS_SUCCESS == status); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + + /* step 6: convert X back to aggregate format */ + /* B = transpose(X) */ + // CHECK: cublasStat = hipblasSgeam( + // CHECK: HIPBLAS_OP_T, + // CHECK: HIPBLAS_OP_T, + cublasStat = cublasSgeam( + cublasH, + CUBLAS_OP_T, /* transa */ + CUBLAS_OP_T, /* transb, don't care */ + n, /* number of rows of B */ + batchSize, /* number of columns of B */ + &h_one, + d_X, /* X is batchSize-by-n */ + batchSize, /* leading dimension of X */ + &h_zero, + NULL, + n, /* don't cae */ + d_B, /* B is n-by-batchSize */ + n /* leading dimension of B */ + ); + // CHECK: assert(HIPBLAS_STATUS_SUCCESS == cublasStat); + assert(CUBLAS_STATUS_SUCCESS == cublasStat); + // CHECK: hipDeviceSynchronize(); + cudaDeviceSynchronize(); + + /* step 7: residual evaluation */ + // CHECK: cudaStat1 = hipMemcpy(X, d_B, sizeof(float)*n*batchSize, hipMemcpyDeviceToHost); + cudaStat1 = cudaMemcpy(X, d_B, sizeof(float)*n*batchSize, cudaMemcpyDeviceToHost); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: hipDeviceSynchronize(); + cudaDeviceSynchronize(); + + printf("==== x1 = inv(A1)*b1 \n"); + for (int j = 0; j < n; j++) { + printf("x1[%d] = %f\n", j, X[j]); + } + + float r1_nrminf; + residaul_eval( + n, + ds, + dl, + d, + du, + dw, + B, + X, + &r1_nrminf + ); + printf("|b1 - A1*x1| = %E\n", r1_nrminf); + printf("\n==== x2 = inv(A2)*b2 \n"); + for (int j = 0; j < n; j++) { + printf("x2[%d] = %f\n", j, X[n + j]); + } + + float r2_nrminf; + residaul_eval( + n, + ds + n, + dl + n, + d + n, + du + n, + dw + n, + B + n, + X + n, + &r2_nrminf + ); + printf("|b2 - A2*x2| = %E\n", r2_nrminf); + + /* free resources */ + // CHECK: if (d_ds0) hipFree(d_ds0); + if (d_ds0) cudaFree(d_ds0); + // CHECK: if (d_dl0) hipFree(d_dl0); + if (d_dl0) cudaFree(d_dl0); + // CHECK: if (d_d0) hipFree(d_d0); + if (d_d0) cudaFree(d_d0); + // CHECK: if (d_du0) hipFree(d_du0); + if (d_du0) cudaFree(d_du0); + // CHECK: if (d_dw0) hipFree(d_dw0); + if (d_dw0) cudaFree(d_dw0); + // CHECK: if (d_ds) hipFree(d_ds); + if (d_ds) cudaFree(d_ds); + // CHECK: if (d_dl) hipFree(d_dl); + if (d_dl) cudaFree(d_dl); + // CHECK: if (d_d) hipFree(d_d); + if (d_d) cudaFree(d_d); + // CHECK: if (d_du) hipFree(d_du); + if (d_du) cudaFree(d_du); + // CHECK: if (d_dw) hipFree(d_dw); + if (d_dw) cudaFree(d_dw); + // CHECK: if (d_B) hipFree(d_B); + if (d_B) cudaFree(d_B); + // CHECK: if (d_X) hipFree(d_X); + if (d_X) cudaFree(d_X); + // CHECK: if (cusparseH) hipsparseDestroy(cusparseH); + if (cusparseH) cusparseDestroy(cusparseH); + // CHECK: if (cublasH) hipblasDestroy(cublasH); + if (cublasH) cublasDestroy(cublasH); + // CHECK: if (stream) hipStreamDestroy(stream); + if (stream) cudaStreamDestroy(stream); + // CHECK: hipDeviceReset(); + cudaDeviceReset(); + + return 0; +} diff --git a/projects/clr/hipamd/tests/hipify-clang/cuSPARSE/cuSPARSE_11.cu b/projects/clr/hipamd/tests/hipify-clang/cuSPARSE/cuSPARSE_11.cu new file mode 100644 index 0000000000..2d905bcf6d --- /dev/null +++ b/projects/clr/hipamd/tests/hipify-clang/cuSPARSE/cuSPARSE_11.cu @@ -0,0 +1,327 @@ +// RUN: %run_test hipify "%s" "%t" %cuda_args +#include +#include +#include +// CHECK: #include +#include +// CHECK: #include +#include + +// NOTE: CUDA 10.0 + +/* compute | b - A*x|_inf */ +void residaul_eval( + int n, + // CHECK: const hipsparseMatDescr_t descrA, + const cusparseMatDescr_t descrA, + const float *csrVal, + const int *csrRowPtr, + const int *csrColInd, + const float *b, + const float *x, + float *r_nrminf_ptr) +{ + // CHECK: const int base = (hipsparseGetMatIndexBase(descrA) != HIPSPARSE_INDEX_BASE_ONE) ? 0 : 1; + const int base = (cusparseGetMatIndexBase(descrA) != CUSPARSE_INDEX_BASE_ONE) ? 0 : 1; + // CHECK: const int lower = (HIPSPARSE_FILL_MODE_LOWER == hipsparseGetMatFillMode(descrA)) ? 1 : 0; + const int lower = (CUSPARSE_FILL_MODE_LOWER == cusparseGetMatFillMode(descrA)) ? 1 : 0; + // CHECK: const int unit = (HIPSPARSE_DIAG_TYPE_UNIT == hipsparseGetMatDiagType(descrA)) ? 1 : 0; + const int unit = (CUSPARSE_DIAG_TYPE_UNIT == cusparseGetMatDiagType(descrA)) ? 1 : 0; + + float r_nrminf = 0; + for (int row = 0; row < n; row++) { + const int start = csrRowPtr[row] - base; + const int end = csrRowPtr[row + 1] - base; + float dot = 0; + for (int colidx = start; colidx < end; colidx++) { + const int col = csrColInd[colidx] - base; + float Aij = csrVal[colidx]; + float xj = x[col]; + if ((row == col) && unit) { + Aij = 1.0; + } + int valid = (row >= col) && lower || + (row <= col) && !lower; + if (valid) { + dot += Aij * xj; + } + } + float ri = b[row] - dot; + r_nrminf = (r_nrminf > fabs(ri)) ? r_nrminf : fabs(ri); + } + *r_nrminf_ptr = r_nrminf; +} + +int main(int argc, char*argv[]) +{ + // CHECK: hipsparseHandle_t handle = NULL; + cusparseHandle_t handle = NULL; + // CHECK: hipStream_t stream = NULL; + cudaStream_t stream = NULL; + // CHECK: hipsparseMatDescr_t descrA = NULL; + cusparseMatDescr_t descrA = NULL; + // NOTE: CUDA 10.0 + // TODO: csrsm2Info_t info = NULL; + csrsm2Info_t info = NULL; + // CHECK: hipsparseStatus_t status = HIPSPARSE_STATUS_SUCCESS; + cusparseStatus_t status = CUSPARSE_STATUS_SUCCESS; + // CHECK: hipError_t cudaStat1 = hipSuccess; + cudaError_t cudaStat1 = cudaSuccess; + const int nrhs = 2; + const int n = 4; + const int nnzA = 9; + // CHECK: const hipsparseSolvePolicy_t policy = HIPSPARSE_SOLVE_POLICY_NO_LEVEL; + const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_NO_LEVEL; + const float h_one = 1.0; + /* + * | 1 0 2 -3 | + * | 0 4 0 0 | + * A = | 5 0 6 7 | + * | 0 8 0 9 | + * + * Regard A as a lower triangle matrix L with non-unit diagonal. + * | 1 5 | | 1 5 | + * Given B = | 2 6 |, X = L \ B = | 0.5 1.5 | + * | 3 7 | | -0.3333 -3 | + * | 4 8 | | 0 -0.4444 | + */ + const int csrRowPtrA[n + 1] = { 1, 4, 5, 8, 10 }; + const int csrColIndA[nnzA] = { 1, 3, 4, 2, 1, 3, 4, 2, 4 }; + const float csrValA[nnzA] = { 1, 2, -3, 4, 5, 6, 7, 8, 9 }; + const float B[n*nrhs] = { 1,2,3,4,5,6,7,8 }; + float X[n*nrhs]; + + int *d_csrRowPtrA = NULL; + int *d_csrColIndA = NULL; + float *d_csrValA = NULL; + float *d_B = NULL; + + size_t lworkInBytes = 0; + char *d_work = NULL; + + const int algo = 0; /* non-block version */ + + printf("example of csrsm2 \n"); + + /* step 1: create cusparse handle, bind a stream */ + // CHECK: cudaStat1 = hipStreamCreateWithFlags(&stream, hipStreamNonBlocking); + cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: status = hipsparseCreate(&handle); + status = cusparseCreate(&handle); + // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status); + assert(CUSPARSE_STATUS_SUCCESS == status); + + status = cusparseSetStream(handle, stream); + // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status); + assert(CUSPARSE_STATUS_SUCCESS == status); + + // NOTE: CUDA 10.0 + // TODO: status = hipsparseCreateCsrsm2Info(&info); + status = cusparseCreateCsrsm2Info(&info); + // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status); + assert(CUSPARSE_STATUS_SUCCESS == status); + + /* step 2: configuration of matrix A */ + status = cusparseCreateMatDescr(&descrA); + // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status); + assert(CUSPARSE_STATUS_SUCCESS == status); + /* A is base-1*/ + // CHECK: hipsparseSetMatIndexBase(descrA, HIPSPARSE_INDEX_BASE_ONE); + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE); + // CHECK: hipsparseSetMatType(descrA, HIPSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL); + /* A is lower triangle */ + // CHECK: hipsparseSetMatFillMode(descrA, HIPSPARSE_FILL_MODE_LOWER); + cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_LOWER); + /* A has non unit diagonal */ + // CHECK: hipsparseSetMatDiagType(descrA, HIPSPARSE_DIAG_TYPE_NON_UNIT); + cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_NON_UNIT); + // CHECK: cudaStat1 = hipMalloc((void**)&d_csrRowPtrA, sizeof(int)*(n + 1)); + cudaStat1 = cudaMalloc((void**)&d_csrRowPtrA, sizeof(int)*(n + 1)); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_csrColIndA, sizeof(int)*nnzA); + cudaStat1 = cudaMalloc((void**)&d_csrColIndA, sizeof(int)*nnzA); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_csrValA, sizeof(float)*nnzA); + cudaStat1 = cudaMalloc((void**)&d_csrValA, sizeof(float)*nnzA); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMalloc((void**)&d_B, sizeof(float)*n*nrhs); + cudaStat1 = cudaMalloc((void**)&d_B, sizeof(float)*n*nrhs); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMemcpy(d_csrRowPtrA, csrRowPtrA, sizeof(int)*(n + 1), hipMemcpyHostToDevice); + cudaStat1 = cudaMemcpy(d_csrRowPtrA, csrRowPtrA, sizeof(int)*(n + 1), cudaMemcpyHostToDevice); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMemcpy(d_csrColIndA, csrColIndA, sizeof(int)*nnzA, hipMemcpyHostToDevice); + cudaStat1 = cudaMemcpy(d_csrColIndA, csrColIndA, sizeof(int)*nnzA, cudaMemcpyHostToDevice); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMemcpy(d_csrValA, csrValA, sizeof(float)*nnzA, hipMemcpyHostToDevice); + cudaStat1 = cudaMemcpy(d_csrValA, csrValA, sizeof(float)*nnzA, cudaMemcpyHostToDevice); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: cudaStat1 = hipMemcpy(d_B, B, sizeof(float)*n*nrhs, hipMemcpyHostToDevice); + cudaStat1 = cudaMemcpy(d_B, B, sizeof(float)*n*nrhs, cudaMemcpyHostToDevice); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + + /* step 3: query workspace */ + // NOTE: CUDA 10.0 + // TODO: status = hipsparseScsrsm2_bufferSizeExt( + // CHECK: HIPSPARSE_OPERATION_NON_TRANSPOSE, + // CHECK: HIPSPARSE_OPERATION_NON_TRANSPOSE, + status = cusparseScsrsm2_bufferSizeExt( + handle, + algo, + CUSPARSE_OPERATION_NON_TRANSPOSE, /* transA */ + CUSPARSE_OPERATION_NON_TRANSPOSE, /* transB */ + n, + nrhs, + nnzA, + &h_one, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_B, + n, /* ldb */ + info, + policy, + &lworkInBytes); + // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status); + assert(CUSPARSE_STATUS_SUCCESS == status); + + printf("lworkInBytes = %lld \n", (long long)lworkInBytes); + // CHECK: if (NULL != d_work) { hipFree(d_work); } + if (NULL != d_work) { cudaFree(d_work); } + // CHECK: cudaStat1 = hipMalloc((void**)&d_work, lworkInBytes); + cudaStat1 = cudaMalloc((void**)&d_work, lworkInBytes); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + + /* step 4: analysis */ + // NOTE: CUDA 10.0 + // TODO: status = hipsparseScsrsm2_analysis( + // CHECK: HIPSPARSE_OPERATION_NON_TRANSPOSE, + // CHECK: HIPSPARSE_OPERATION_NON_TRANSPOSE, + status = cusparseScsrsm2_analysis( + handle, + algo, + CUSPARSE_OPERATION_NON_TRANSPOSE, /* transA */ + CUSPARSE_OPERATION_NON_TRANSPOSE, /* transB */ + n, + nrhs, + nnzA, + &h_one, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_B, + n, /* ldb */ + info, + policy, + d_work); + // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status); + assert(CUSPARSE_STATUS_SUCCESS == status); + /* step 5: solve L * X = B */ + // NOTE: CUDA 10.0 + // TODO: status = hipsparseScsrsm2_solve( + // CHECK: HIPSPARSE_OPERATION_NON_TRANSPOSE, + // CHECK: HIPSPARSE_OPERATION_NON_TRANSPOSE, + status = cusparseScsrsm2_solve( + handle, + algo, + CUSPARSE_OPERATION_NON_TRANSPOSE, /* transA */ + CUSPARSE_OPERATION_NON_TRANSPOSE, /* transB */ + n, + nrhs, + nnzA, + &h_one, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_B, + n, /* ldb */ + info, + policy, + d_work); + // CHECK: assert(HIPSPARSE_STATUS_SUCCESS == status); + assert(CUSPARSE_STATUS_SUCCESS == status); + // CHECK: cudaStat1 = hipDeviceSynchronize(); + cudaStat1 = cudaDeviceSynchronize(); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + + /* step 6:measure residual B - A*X */ + // CHECK: cudaStat1 = hipMemcpy(X, d_B, sizeof(float)*n*nrhs, hipMemcpyDeviceToHost); + cudaStat1 = cudaMemcpy(X, d_B, sizeof(float)*n*nrhs, cudaMemcpyDeviceToHost); + // CHECK: assert(hipSuccess == cudaStat1); + assert(cudaSuccess == cudaStat1); + // CHECK: hipDeviceSynchronize(); + cudaDeviceSynchronize(); + + printf("==== x1 = inv(A)*b1 \n"); + for (int j = 0; j < n; j++) { + printf("x1[%d] = %f\n", j, X[j]); + } + float r1_nrminf; + residaul_eval( + n, + descrA, + csrValA, + csrRowPtrA, + csrColIndA, + B, + X, + &r1_nrminf + ); + printf("|b1 - A*x1| = %E\n", r1_nrminf); + + printf("==== x2 = inv(A)*b2 \n"); + for (int j = 0; j < n; j++) { + printf("x2[%d] = %f\n", j, X[n + j]); + } + float r2_nrminf; + residaul_eval( + n, + descrA, + csrValA, + csrRowPtrA, + csrColIndA, + B + n, + X + n, + &r2_nrminf + ); + printf("|b2 - A*x2| = %E\n", r2_nrminf); + + /* free resources */ + // CHECK: if (d_csrRowPtrA) hipFree(d_csrRowPtrA); + if (d_csrRowPtrA) cudaFree(d_csrRowPtrA); + // CHECK: if (d_csrColIndA) hipFree(d_csrColIndA); + if (d_csrColIndA) cudaFree(d_csrColIndA); + // CHECK: if (d_csrValA) hipFree(d_csrValA); + if (d_csrValA) cudaFree(d_csrValA); + // CHECK: if (d_B) hipFree(d_B); + if (d_B) cudaFree(d_B); + // CHECK: if (handle) hipsparseDestroy(handle); + if (handle) cusparseDestroy(handle); + // CHECK: if (stream) hipStreamDestroy(stream); + if (stream) cudaStreamDestroy(stream); + // CHECK: if (descrA) hipsparseDestroyMatDescr(descrA); + if (descrA) cusparseDestroyMatDescr(descrA); + // NOTE: CUDA 10.0 + // TODO: if (info) hipsparseDestroyCsrsm2Info(info); + if (info) cusparseDestroyCsrsm2Info(info); + // CHECK: hipDeviceReset(); + cudaDeviceReset(); + + return 0; +} diff --git a/projects/clr/hipamd/tests/hipify-clang/lit.cfg b/projects/clr/hipamd/tests/hipify-clang/lit.cfg index b9ff0b2495..f959147cd6 100644 --- a/projects/clr/hipamd/tests/hipify-clang/lit.cfg +++ b/projects/clr/hipamd/tests/hipify-clang/lit.cfg @@ -26,6 +26,9 @@ config.excludes = ['cmdparser.hpp'] config.cuda_version = "@CUDA_VERSION@" if config.cuda_version not in ['10.0']: config.excludes.append('cuSPARSE_08.cu') + config.excludes.append('cuSPARSE_09.cu') + config.excludes.append('cuSPARSE_10.cu') + config.excludes.append('cuSPARSE_11.cu') # test_exec_root: The path where tests are located (default is the test suite root). #config.test_exec_root = config.test_source_root