Files
rocm-systems/projects/rocprofiler-systems/examples/openmp/LU/lu.cpp
T
Jonathan R. Madsen 1cdf67257e Remove temporary files (#223)
* config updates

- remove temporary file when destroyed
- extend parse_numeric_range: support vector, increment, better delimit
- git version info in omnitrace banner
- use OMNITRACE_BASIC_VERBOSE instead of OMNITRACE_VERBOSE

* Fix array-bounds warning in openmp lu.cpp

* Bump version to 1.7.4

* Update run-ci.py

- options to repeat
  - until-pass (default: 3)
  - until-fail (default: None)
  - after-timeout (default: 2)

[ROCm/rocprofiler-systems commit: 2096f1f0c2]
2023-01-10 01:06:12 -06:00

3564 строки
139 KiB
C++

/*
MIT License
Copyright (c) 2021 Parallel Applications Modelling Group - GMAP
GMAP website: https://gmap.pucrs.br
Pontifical Catholic University of Rio Grande do Sul (PUCRS)
Av. Ipiranga, 6681, Porto Alegre - Brazil, 90619-900
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
------------------------------------------------------------------------------
The original NPB 3.4.1 version was written in Fortran and belongs to:
http://www.nas.nasa.gov/Software/NPB/
Authors of the Fortran code:
S. Weeratunga
V. Venkatakrishnan
E. Barszcz
M. Yarrow
H. Jin
------------------------------------------------------------------------------
The serial C++ version is a translation of the original NPB 3.4.1
Serial C++ version: https://github.com/GMAP/NPB-CPP/tree/master/NPB-SER
Authors of the C++ code:
Dalvan Griebler <dalvangriebler@gmail.com>
Gabriell Araujo <hexenoften@gmail.com>
Júnior Löff <loffjh@gmail.com>
------------------------------------------------------------------------------
The OpenMP version is a parallel implementation of the serial C++ version
OpenMP version: https://github.com/GMAP/NPB-CPP/tree/master/NPB-OMP
Authors of the OpenMP code:
Júnior Löff <loffjh@gmail.com>
*/
#include "../common/npb-CPP.hpp"
#include "npbparams.hpp"
/*
* ---------------------------------------------------------------------
* driver for the performance evaluation of the solver for
* five coupled parabolic/elliptic partial differential equations
* ---------------------------------------------------------------------
* parameters which can be overridden in runtime config file
* isiz1,isiz2,isiz3 give the maximum size
* ipr = 1 to print out verbose information
* omega = 2.0 is correct for all classes
* tolrsd is tolerance levels for steady state residuals
* ---------------------------------------------------------------------
* field variables and residuals
* to improve cache performance, second two dimensions padded by 1
* for even number sizes only.
* note: corresponding array (called "v") in routines blts, buts,
* and l2norm are similarly padded
* ---------------------------------------------------------------------
*/
#define IPR_DEFAULT 1
#define OMEGA_DEFAULT 1.2
#define TOLRSD1_DEF 1.0e-08
#define TOLRSD2_DEF 1.0e-08
#define TOLRSD3_DEF 1.0e-08
#define TOLRSD4_DEF 1.0e-08
#define TOLRSD5_DEF 1.0e-08
#define C1 1.40e+00
#define C2 0.40e+00
#define C3 1.00e-01
#define C4 1.00e+00
#define C5 1.40e+00
#define T_TOTAL 1
#define T_RHSX 2
#define T_RHSY 3
#define T_RHSZ 4
#define T_RHS 5
#define T_JACLD 6
#define T_BLTS 7
#define T_JACU 8
#define T_BUTS 9
#define T_ADD 10
#define T_L2NORM 11
#define T_LAST 11
/* global variables */
#if defined(DO_NOT_ALLOCATE_ARRAYS_WITH_DYNAMIC_MEMORY_AND_AS_SINGLE_DIMENSION)
static double u[ISIZ3][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5];
static double rsd[ISIZ3][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5];
static double frct[ISIZ3][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5];
static double flux[ISIZ1][5];
static double qs[ISIZ3][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1];
static double rho_i[ISIZ3][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1];
static double a[ISIZ2][ISIZ1 / 2 * 2 + 1][5][5];
static double b[ISIZ2][ISIZ1 / 2 * 2 + 1][5][5];
static double c[ISIZ2][ISIZ1 / 2 * 2 + 1][5][5];
static double d[ISIZ2][ISIZ1 / 2 * 2 + 1][5][5];
static double ce[13][5];
#else
static double (*u)[ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5] =
(double (*)[ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5])
malloc(sizeof(double) *
((ISIZ3) * (ISIZ2 / 2 * 2 + 1) * (ISIZ1 / 2 * 2 + 1) * (5)));
static double (*rsd)[ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5] =
(double (*)[ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5])
malloc(sizeof(double) *
((ISIZ3) * (ISIZ2 / 2 * 2 + 1) * (ISIZ1 / 2 * 2 + 1) * (5)));
static double (*frct)[ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5] =
(double (*)[ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5])
malloc(sizeof(double) *
((ISIZ3) * (ISIZ2 / 2 * 2 + 1) * (ISIZ1 / 2 * 2 + 1) * (5)));
static double (*flux)[5] = (double (*)[5]) malloc(sizeof(double) * ((ISIZ1) * (5)));
static double (*qs)[ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1] =
(double (*)[ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1])
malloc(sizeof(double) * ((ISIZ3) * (ISIZ2 / 2 * 2 + 1) * (ISIZ1 / 2 * 2 + 1)));
static double (*rho_i)[ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1] =
(double (*)[ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1])
malloc(sizeof(double) * ((ISIZ3) * (ISIZ2 / 2 * 2 + 1) * (ISIZ1 / 2 * 2 + 1)));
static double (*a)[ISIZ1 / 2 * 2 + 1][5][5] = (double (*)[ISIZ1 / 2 * 2 + 1][5][5])
malloc(sizeof(double) * ((ISIZ2) * (ISIZ1 / 2 * 2 + 1) * (5) * (5)));
static double (*b)[ISIZ1 / 2 * 2 + 1][5][5] = (double (*)[ISIZ1 / 2 * 2 + 1][5][5])
malloc(sizeof(double) * ((ISIZ2) * (ISIZ1 / 2 * 2 + 1) * (5) * (5)));
static double (*c)[ISIZ1 / 2 * 2 + 1][5][5] = (double (*)[ISIZ1 / 2 * 2 + 1][5][5])
malloc(sizeof(double) * ((ISIZ2) * (ISIZ1 / 2 * 2 + 1) * (5) * (5)));
static double (*d)[ISIZ1 / 2 * 2 + 1][5][5] = (double (*)[ISIZ1 / 2 * 2 + 1][5][5])
malloc(sizeof(double) * ((ISIZ2) * (ISIZ1 / 2 * 2 + 1) * (5) * (5)));
static double (*ce)[5] = (double (*)[5]) malloc(sizeof(double) * ((13) * (5)));
#endif
/* grid */
static double dxi, deta, dzeta;
static double tx1, tx2, tx3;
static double ty1, ty2, ty3;
static double tz1, tz2, tz3;
static int nx, ny, nz;
static int nx0, ny0, nz0;
static int ist, iend;
static int jst, jend;
static int ii1, ii2;
static int ji1, ji2;
static int ki1, ki2;
/* dissipation */
static double dx1, dx2, dx3, dx4, dx5;
static double dy1, dy2, dy3, dy4, dy5;
static double dz1, dz2, dz3, dz4, dz5;
static double dssp;
/* output control parameters */
static int ipr, inorm;
/* newton-raphson iteration control parameters */
static double dt, omega, tolrsd[5], rsdnm[5], errnm[5], frc;
static int itmax;
/* timer */
static double maxtime;
static boolean timeron;
/* function prototypes */
void
blts(int nx, int ny, int nz, int k, double omega,
double v[][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5],
double ldz[][ISIZ1 / 2 * 2 + 1][5][5], double ldy[][ISIZ1 / 2 * 2 + 1][5][5],
double ldx[][ISIZ1 / 2 * 2 + 1][5][5], double d[][ISIZ1 / 2 * 2 + 1][5][5], int ist,
int iend, int jst, int jend, int nx0, int ny0);
void
buts(int nx, int ny, int nz, int k, double omega,
double v[][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5], void* pointer_tv,
double d[][ISIZ1 / 2 * 2 + 1][5][5], double udx[][ISIZ1 / 2 * 2 + 1][5][5],
double udy[][ISIZ1 / 2 * 2 + 1][5][5], double udz[][ISIZ1 / 2 * 2 + 1][5][5],
int ist, int iend, int jst, int jend, int nx0, int ny0);
void
domain();
void
erhs();
void
error();
void
exact(int i, int j, int k, double u000ijk[]);
void
jacld(int k);
void
jacu(int k);
void
l2norm(int nx0, int ny0, int nz0, int ist, int iend, int jst, int jend,
double v[][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5], double sum[5]);
void
pintgr();
void
read_input();
void
rhs();
void
setbv();
void
setcoeff();
void
setiv();
void
ssor(int niter);
void
verify(double xcr[], double xce[], double xci, char* class_npb, boolean* verified);
static boolean flag[ISIZ1 / 2 * 2 + 1];
static boolean flag2[ISIZ1 / 2 * 2 + 1];
/* lu */
int
main(int, char*[])
{
#if defined(DO_NOT_ALLOCATE_ARRAYS_WITH_DYNAMIC_MEMORY_AND_AS_SINGLE_DIMENSION)
printf(
" DO_NOT_ALLOCATE_ARRAYS_WITH_DYNAMIC_MEMORY_AND_AS_SINGLE_DIMENSION mode on\n");
#endif
char class_npb;
boolean verified;
double mflops;
double t, tmax, trecs[T_LAST + 1];
int i;
char* t_names[T_LAST + 1];
/*
* ---------------------------------------------------------------------
* setup info for timers
* ---------------------------------------------------------------------
*/
FILE* fp;
if((fp = fopen("timer.flag", "r")) != NULL)
{
timeron = TRUE;
t_names[T_TOTAL] = (char*) "total";
t_names[T_RHSX] = (char*) "rhsx";
t_names[T_RHSY] = (char*) "rhsy";
t_names[T_RHSZ] = (char*) "rhsz";
t_names[T_RHS] = (char*) "rhs";
t_names[T_JACLD] = (char*) "jacld";
t_names[T_BLTS] = (char*) "blts";
t_names[T_JACU] = (char*) "jacu";
t_names[T_BUTS] = (char*) "buts";
t_names[T_ADD] = (char*) "add";
t_names[T_L2NORM] = (char*) "l2norm";
fclose(fp);
}
else
{
timeron = FALSE;
}
/*
* ---------------------------------------------------------------------
* read input data
* ---------------------------------------------------------------------
*/
read_input();
/*
* ---------------------------------------------------------------------
* set up domain sizes
* ---------------------------------------------------------------------
*/
domain();
/*
* ---------------------------------------------------------------------
* set up coefficients
* ---------------------------------------------------------------------
*/
setcoeff();
#pragma omp parallel
{
/*
* ---------------------------------------------------------------------
* set the boundary values for dependent variables
* ---------------------------------------------------------------------
*/
setbv();
/*
* ---------------------------------------------------------------------
* set the initial values for dependent variables
* ---------------------------------------------------------------------
*/
setiv();
/*
* ---------------------------------------------------------------------
* compute the forcing term based on prescribed exact solution
* ---------------------------------------------------------------------
*/
erhs();
} /* end parallel */
/*
* ---------------------------------------------------------------------
* perform one SSOR iteration to touch all pages
* ---------------------------------------------------------------------
*/
ssor(1);
#pragma omp parallel
{
/*
* ---------------------------------------------------------------------
* reset the boundary and initial values
* ---------------------------------------------------------------------
*/
setbv();
setiv();
}
/*
* ---------------------------------------------------------------------
* perform the SSOR iterations
* ---------------------------------------------------------------------
*/
ssor(itmax);
/*
* ---------------------------------------------------------------------
* compute the solution error
* ---------------------------------------------------------------------
*/
error();
/*
* ---------------------------------------------------------------------
* compute the surface integral
* ---------------------------------------------------------------------
*/
pintgr();
/*
* ---------------------------------------------------------------------
* verification test
* ---------------------------------------------------------------------
*/
verify(rsdnm, errnm, frc, &class_npb, &verified);
mflops = (double) itmax *
(1984.77 * (double) nx0 * (double) ny0 * (double) nz0 -
10923.3 * pow(((double) (nx0 + ny0 + nz0) / 3.0), 2.0) +
27770.9 * (double) (nx0 + ny0 + nz0) / 3.0 - 144010.0) /
(maxtime * 1000000.0);
setenv("OMP_NUM_THREADS", "1", 0);
c_print_results((char*) "LU", class_npb, nx0, ny0, nz0, itmax, maxtime, mflops,
(char*) " floating point", verified, (char*) NPBVERSION,
(char*) COMPILETIME, (char*) COMPILERVERSION, (char*) LIBVERSION,
std::getenv("OMP_NUM_THREADS"), (char*) CS1, (char*) CS2, (char*) CS3,
(char*) CS4, (char*) CS5, (char*) CS6, (char*) "(none)");
/*
* ---------------------------------------------------------------------
* more timers
* ---------------------------------------------------------------------
*/
if(timeron)
{
for(i = 1; i <= T_LAST; i++)
{
trecs[i] = timer_read(i);
}
tmax = maxtime;
if(tmax == 0.0)
{
tmax = 1.0;
}
printf(" SECTION Time (secs)\n");
for(i = 1; i <= T_LAST; i++)
{
printf(" %-8s:%9.3f (%6.2f%%)\n", t_names[i], trecs[i],
trecs[i] * 100. / tmax);
if(i == T_RHS)
{
t = trecs[T_RHSX] + trecs[T_RHSY] + trecs[T_RHSZ];
printf(" --> %8s:%9.3f (%6.2f%%)\n", "sub-rhs", t, t * 100. / tmax);
t = trecs[i] - t;
printf(" --> %8s:%9.3f (%6.2f%%)\n", "rest-rhs", t, t * 100. / tmax);
}
}
}
return 0;
}
/*
* ---------------------------------------------------------------------
* compute the regular-sparse, block lower triangular solution:
* v <-- ( L-inv ) * v
* ---------------------------------------------------------------------
* to improve cache performance, second two dimensions padded by 1
* for even number sizes only. only needed in v.
* ---------------------------------------------------------------------
*/
void
blts(int /*nx*/, int /*ny*/, int /*nz*/, int k, double omega,
double v[][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5],
double ldz[][ISIZ1 / 2 * 2 + 1][5][5], double ldy[][ISIZ1 / 2 * 2 + 1][5][5],
double ldx[][ISIZ1 / 2 * 2 + 1][5][5], double d[][ISIZ1 / 2 * 2 + 1][5][5], int ist,
int iend, int jst, int jend, int /*nx0*/, int /*ny0*/)
{
/*
* ---------------------------------------------------------------------
* local variables
* ---------------------------------------------------------------------
*/
int i, j, m;
double tmp, tmp1;
double tmat[5][5], tv[5];
#pragma omp for nowait schedule(static)
for(j = jst; j < jend; j++)
{
for(i = ist; i < iend; i++)
{
for(m = 0; m < 5; m++)
{
v[k][j][i][m] =
v[k][j][i][m] - omega * (ldz[j][i][0][m] * v[k - 1][j][i][0] +
ldz[j][i][1][m] * v[k - 1][j][i][1] +
ldz[j][i][2][m] * v[k - 1][j][i][2] +
ldz[j][i][3][m] * v[k - 1][j][i][3] +
ldz[j][i][4][m] * v[k - 1][j][i][4]);
}
}
}
#pragma omp for nowait schedule(static)
for(j = jst; j < jend; j++)
{
if(j != jst)
{
while(flag[j - 1] == 0)
{
#pragma omp flush
;
}
}
if(j != jend - 1)
{
while(flag[j] == 1)
{
#pragma omp flush
;
}
}
for(i = ist; i < iend; i++)
{
for(m = 0; m < 5; m++)
{
tv[m] = v[k][j][i][m] - omega * (ldy[j][i][0][m] * v[k][j - 1][i][0] +
ldx[j][i][0][m] * v[k][j][i - 1][0] +
ldy[j][i][1][m] * v[k][j - 1][i][1] +
ldx[j][i][1][m] * v[k][j][i - 1][1] +
ldy[j][i][2][m] * v[k][j - 1][i][2] +
ldx[j][i][2][m] * v[k][j][i - 1][2] +
ldy[j][i][3][m] * v[k][j - 1][i][3] +
ldx[j][i][3][m] * v[k][j][i - 1][3] +
ldy[j][i][4][m] * v[k][j - 1][i][4] +
ldx[j][i][4][m] * v[k][j][i - 1][4]);
}
/*
* ---------------------------------------------------------------------
* diagonal block inversion
*
* forward elimination
* ---------------------------------------------------------------------
*/
for(m = 0; m < 5; m++)
{
tmat[0][m] = d[j][i][0][m];
tmat[1][m] = d[j][i][1][m];
tmat[2][m] = d[j][i][2][m];
tmat[3][m] = d[j][i][3][m];
tmat[4][m] = d[j][i][4][m];
}
/* */
tmp1 = 1.0 / tmat[0][0];
tmp = tmp1 * tmat[0][1];
tmat[1][1] = tmat[1][1] - tmp * tmat[1][0];
tmat[2][1] = tmat[2][1] - tmp * tmat[2][0];
tmat[3][1] = tmat[3][1] - tmp * tmat[3][0];
tmat[4][1] = tmat[4][1] - tmp * tmat[4][0];
tv[1] = tv[1] - tv[0] * tmp;
/* */
tmp = tmp1 * tmat[0][2];
tmat[1][2] = tmat[1][2] - tmp * tmat[1][0];
tmat[2][2] = tmat[2][2] - tmp * tmat[2][0];
tmat[3][2] = tmat[3][2] - tmp * tmat[3][0];
tmat[4][2] = tmat[4][2] - tmp * tmat[4][0];
tv[2] = tv[2] - tv[0] * tmp;
/* */
tmp = tmp1 * tmat[0][3];
tmat[1][3] = tmat[1][3] - tmp * tmat[1][0];
tmat[2][3] = tmat[2][3] - tmp * tmat[2][0];
tmat[3][3] = tmat[3][3] - tmp * tmat[3][0];
tmat[4][3] = tmat[4][3] - tmp * tmat[4][0];
tv[3] = tv[3] - tv[0] * tmp;
/* */
tmp = tmp1 * tmat[0][4];
tmat[1][4] = tmat[1][4] - tmp * tmat[1][0];
tmat[2][4] = tmat[2][4] - tmp * tmat[2][0];
tmat[3][4] = tmat[3][4] - tmp * tmat[3][0];
tmat[4][4] = tmat[4][4] - tmp * tmat[4][0];
tv[4] = tv[4] - tv[0] * tmp;
/* */
tmp1 = 1.0 / tmat[1][1];
tmp = tmp1 * tmat[1][2];
tmat[2][2] = tmat[2][2] - tmp * tmat[2][1];
tmat[3][2] = tmat[3][2] - tmp * tmat[3][1];
tmat[4][2] = tmat[4][2] - tmp * tmat[4][1];
tv[2] = tv[2] - tv[1] * tmp;
/* */
tmp = tmp1 * tmat[1][3];
tmat[2][3] = tmat[2][3] - tmp * tmat[2][1];
tmat[3][3] = tmat[3][3] - tmp * tmat[3][1];
tmat[4][3] = tmat[4][3] - tmp * tmat[4][1];
tv[3] = tv[3] - tv[1] * tmp;
/* */
tmp = tmp1 * tmat[1][4];
tmat[2][4] = tmat[2][4] - tmp * tmat[2][1];
tmat[3][4] = tmat[3][4] - tmp * tmat[3][1];
tmat[4][4] = tmat[4][4] - tmp * tmat[4][1];
tv[4] = tv[4] - tv[1] * tmp;
/* */
tmp1 = 1.0 / tmat[2][2];
tmp = tmp1 * tmat[2][3];
tmat[3][3] = tmat[3][3] - tmp * tmat[3][2];
tmat[4][3] = tmat[4][3] - tmp * tmat[4][2];
tv[3] = tv[3] - tv[2] * tmp;
/* */
tmp = tmp1 * tmat[2][4];
tmat[3][4] = tmat[3][4] - tmp * tmat[3][2];
tmat[4][4] = tmat[4][4] - tmp * tmat[4][2];
tv[4] = tv[4] - tv[2] * tmp;
/* */
tmp1 = 1.0 / tmat[3][3];
tmp = tmp1 * tmat[3][4];
tmat[4][4] = tmat[4][4] - tmp * tmat[4][3];
tv[4] = tv[4] - tv[3] * tmp;
/*
* ---------------------------------------------------------------------
* back substitution
* ---------------------------------------------------------------------
*/
v[k][j][i][4] = tv[4] / tmat[4][4];
tv[3] = tv[3] - tmat[4][3] * v[k][j][i][4];
v[k][j][i][3] = tv[3] / tmat[3][3];
tv[2] = tv[2] - tmat[3][2] * v[k][j][i][3] - tmat[4][2] * v[k][j][i][4];
v[k][j][i][2] = tv[2] / tmat[2][2];
tv[1] = tv[1] - tmat[2][1] * v[k][j][i][2] - tmat[3][1] * v[k][j][i][3] -
tmat[4][1] * v[k][j][i][4];
v[k][j][i][1] = tv[1] / tmat[1][1];
tv[0] = tv[0] - tmat[1][0] * v[k][j][i][1] - tmat[2][0] * v[k][j][i][2] -
tmat[3][0] * v[k][j][i][3] - tmat[4][0] * v[k][j][i][4];
v[k][j][i][0] = tv[0] / tmat[0][0];
}
if(j != jend - 1) flag[j] = 1;
if(j != jst) flag[j - 1] = 0;
}
}
/*
* ---------------------------------------------------------------------
* compute the regular-sparse, block upper triangular solution:
* v <-- ( U-inv ) * v
* ---------------------------------------------------------------------
* to improve cache performance, second two dimensions padded by 1
* for even number sizes only. only needed in v.
* ---------------------------------------------------------------------
*/
void
buts(int /*nx*/, int /*ny*/, int /*nz*/, int k, double omega,
double v[][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5], void* pointer_tv,
double d[][ISIZ1 / 2 * 2 + 1][5][5], double udx[][ISIZ1 / 2 * 2 + 1][5][5],
double udy[][ISIZ1 / 2 * 2 + 1][5][5], double udz[][ISIZ1 / 2 * 2 + 1][5][5],
int ist, int iend, int jst, int jend, int /*nx0*/, int /*ny0*/)
{
/*
* ---------------------------------------------------------------------
* local variables
* ---------------------------------------------------------------------
*/
double(*tv)[ISIZ1 / 2 * 2 + 1][5] = (double(*)[ISIZ1 / 2 * 2 + 1][5]) pointer_tv;
int i, j, m;
double tmp, tmp1;
double tmat[5][5];
#pragma omp for nowait schedule(static)
for(j = jend - 1; j >= jst; j--)
{
for(i = iend - 1; i >= ist; i--)
{
for(m = 0; m < 5; m++)
{
tv[j][i][m] = omega * (udz[j][i][0][m] * v[k + 1][j][i][0] +
udz[j][i][1][m] * v[k + 1][j][i][1] +
udz[j][i][2][m] * v[k + 1][j][i][2] +
udz[j][i][3][m] * v[k + 1][j][i][3] +
udz[j][i][4][m] * v[k + 1][j][i][4]);
}
}
}
#pragma omp for nowait schedule(static)
for(j = jend - 1; j >= jst; j--)
{
if(j != jend - 1)
{
while(flag2[j + 1] == 0)
{
#pragma omp flush
;
}
}
if(j != jst)
{
while(flag2[j] == 1)
{
#pragma omp flush
;
}
}
for(i = iend - 1; i >= ist; i--)
{
for(m = 0; m < 5; m++)
{
tv[j][i][m] = tv[j][i][m] + omega * (udy[j][i][0][m] * v[k][j + 1][i][0] +
udx[j][i][0][m] * v[k][j][i + 1][0] +
udy[j][i][1][m] * v[k][j + 1][i][1] +
udx[j][i][1][m] * v[k][j][i + 1][1] +
udy[j][i][2][m] * v[k][j + 1][i][2] +
udx[j][i][2][m] * v[k][j][i + 1][2] +
udy[j][i][3][m] * v[k][j + 1][i][3] +
udx[j][i][3][m] * v[k][j][i + 1][3] +
udy[j][i][4][m] * v[k][j + 1][i][4] +
udx[j][i][4][m] * v[k][j][i + 1][4]);
}
/*
* ---------------------------------------------------------------------
* diagonal block inversion
* ---------------------------------------------------------------------
*/
for(m = 0; m < 5; m++)
{
tmat[0][m] = d[j][i][0][m];
tmat[1][m] = d[j][i][1][m];
tmat[2][m] = d[j][i][2][m];
tmat[3][m] = d[j][i][3][m];
tmat[4][m] = d[j][i][4][m];
}
/* */
tmp1 = 1.0 / tmat[0][0];
tmp = tmp1 * tmat[0][1];
tmat[1][1] = tmat[1][1] - tmp * tmat[1][0];
tmat[2][1] = tmat[2][1] - tmp * tmat[2][0];
tmat[3][1] = tmat[3][1] - tmp * tmat[3][0];
tmat[4][1] = tmat[4][1] - tmp * tmat[4][0];
tv[j][i][1] = tv[j][i][1] - tv[j][i][0] * tmp;
/* */
tmp = tmp1 * tmat[0][2];
tmat[1][2] = tmat[1][2] - tmp * tmat[1][0];
tmat[2][2] = tmat[2][2] - tmp * tmat[2][0];
tmat[3][2] = tmat[3][2] - tmp * tmat[3][0];
tmat[4][2] = tmat[4][2] - tmp * tmat[4][0];
tv[j][i][2] = tv[j][i][2] - tv[j][i][0] * tmp;
/* */
tmp = tmp1 * tmat[0][3];
tmat[1][3] = tmat[1][3] - tmp * tmat[1][0];
tmat[2][3] = tmat[2][3] - tmp * tmat[2][0];
tmat[3][3] = tmat[3][3] - tmp * tmat[3][0];
tmat[4][3] = tmat[4][3] - tmp * tmat[4][0];
tv[j][i][3] = tv[j][i][3] - tv[j][i][0] * tmp;
/* */
tmp = tmp1 * tmat[0][4];
tmat[1][4] = tmat[1][4] - tmp * tmat[1][0];
tmat[2][4] = tmat[2][4] - tmp * tmat[2][0];
tmat[3][4] = tmat[3][4] - tmp * tmat[3][0];
tmat[4][4] = tmat[4][4] - tmp * tmat[4][0];
tv[j][i][4] = tv[j][i][4] - tv[j][i][0] * tmp;
/* */
tmp1 = 1.0 / tmat[1][1];
tmp = tmp1 * tmat[1][2];
tmat[2][2] = tmat[2][2] - tmp * tmat[2][1];
tmat[3][2] = tmat[3][2] - tmp * tmat[3][1];
tmat[4][2] = tmat[4][2] - tmp * tmat[4][1];
tv[j][i][2] = tv[j][i][2] - tv[j][i][1] * tmp;
/* */
tmp = tmp1 * tmat[1][3];
tmat[2][3] = tmat[2][3] - tmp * tmat[2][1];
tmat[3][3] = tmat[3][3] - tmp * tmat[3][1];
tmat[4][3] = tmat[4][3] - tmp * tmat[4][1];
tv[j][i][3] = tv[j][i][3] - tv[j][i][1] * tmp;
/* */
tmp = tmp1 * tmat[1][4];
tmat[2][4] = tmat[2][4] - tmp * tmat[2][1];
tmat[3][4] = tmat[3][4] - tmp * tmat[3][1];
tmat[4][4] = tmat[4][4] - tmp * tmat[4][1];
tv[j][i][4] = tv[j][i][4] - tv[j][i][1] * tmp;
/* */
tmp1 = 1.0 / tmat[2][2];
tmp = tmp1 * tmat[2][3];
tmat[3][3] = tmat[3][3] - tmp * tmat[3][2];
tmat[4][3] = tmat[4][3] - tmp * tmat[4][2];
tv[j][i][3] = tv[j][i][3] - tv[j][i][2] * tmp;
/* */
tmp = tmp1 * tmat[2][4];
tmat[3][4] = tmat[3][4] - tmp * tmat[3][2];
tmat[4][4] = tmat[4][4] - tmp * tmat[4][2];
tv[j][i][4] = tv[j][i][4] - tv[j][i][2] * tmp;
/* */
tmp1 = 1.0 / tmat[3][3];
tmp = tmp1 * tmat[3][4];
tmat[4][4] = tmat[4][4] - tmp * tmat[4][3];
tv[j][i][4] = tv[j][i][4] - tv[j][i][3] * tmp;
/*
* ---------------------------------------------------------------------
* back substitution
* ---------------------------------------------------------------------
*/
tv[j][i][4] = tv[j][i][4] / tmat[4][4];
tv[j][i][3] = tv[j][i][3] - tmat[4][3] * tv[j][i][4];
tv[j][i][3] = tv[j][i][3] / tmat[3][3];
tv[j][i][2] =
tv[j][i][2] - tmat[3][2] * tv[j][i][3] - tmat[4][2] * tv[j][i][4];
tv[j][i][2] = tv[j][i][2] / tmat[2][2];
tv[j][i][1] = tv[j][i][1] - tmat[2][1] * tv[j][i][2] -
tmat[3][1] * tv[j][i][3] - tmat[4][1] * tv[j][i][4];
tv[j][i][1] = tv[j][i][1] / tmat[1][1];
tv[j][i][0] = tv[j][i][0] - tmat[1][0] * tv[j][i][1] -
tmat[2][0] * tv[j][i][2] - tmat[3][0] * tv[j][i][3] -
tmat[4][0] * tv[j][i][4];
tv[j][i][0] = tv[j][i][0] / tmat[0][0];
v[k][j][i][0] = v[k][j][i][0] - tv[j][i][0];
v[k][j][i][1] = v[k][j][i][1] - tv[j][i][1];
v[k][j][i][2] = v[k][j][i][2] - tv[j][i][2];
v[k][j][i][3] = v[k][j][i][3] - tv[j][i][3];
v[k][j][i][4] = v[k][j][i][4] - tv[j][i][4];
}
if(j != jend - 1) flag2[j + 1] = 0;
if(j != jst) flag2[j] = 1;
}
}
void
domain()
{
/*
* ---------------------------------------------------------------------
* local variables
* ---------------------------------------------------------------------
*/
nx = nx0;
ny = ny0;
nz = nz0;
/*
* ---------------------------------------------------------------------
* check the sub-domain size
* ---------------------------------------------------------------------
*/
if((nx < 4) || (ny < 4) || (nz < 4))
{
printf(" SUBDOMAIN SIZE IS TOO SMALL - \n"
" ADJUST PROBLEM SIZE OR NUMBER OF PROCESSORS\n"
" SO THAT NX, NY AND NZ ARE GREATER THAN OR EQUAL\n"
" TO 4 THEY ARE CURRENTLY%3d%3d%3d\n",
nx, ny, nz);
exit(EXIT_FAILURE);
}
if((nx > ISIZ1) || (ny > ISIZ2) || (nz > ISIZ3))
{
printf(" SUBDOMAIN SIZE IS TOO LARGE - \n"
" ADJUST PROBLEM SIZE OR NUMBER OF PROCESSORS\n"
" SO THAT NX, NY AND NZ ARE LESS THAN OR EQUAL TO \n"
" ISIZ1, ISIZ2 AND ISIZ3 RESPECTIVELY. THEY ARE\n"
" CURRENTLYi%4d%4d%4d\n",
nx, ny, nz);
exit(EXIT_FAILURE);
}
/*
* ---------------------------------------------------------------------
* set up the start and end in i and j extents for all processors
* ---------------------------------------------------------------------
*/
ist = 1;
iend = nx - 1;
jst = 1;
jend = ny - 1;
ii1 = 1;
ii2 = nx0 - 1;
ji1 = 1;
ji2 = ny0 - 2;
ki1 = 2;
ki2 = nz0 - 1;
}
/*
* ---------------------------------------------------------------------
* compute the right hand side based on exact solution
* ---------------------------------------------------------------------
*/
void
erhs()
{
/*
* ---------------------------------------------------------------------
* local variables
* ---------------------------------------------------------------------
*/
int i, j, k, m;
double xi, eta, zeta;
double q;
double u21, u31, u41;
double tmp;
double u21i, u31i, u41i, u51i;
double u21j, u31j, u41j, u51j;
double u21k, u31k, u41k, u51k;
double u21im1, u31im1, u41im1, u51im1;
double u21jm1, u31jm1, u41jm1, u51jm1;
double u21km1, u31km1, u41km1, u51km1;
double flux[ISIZ1][5];
#pragma omp for
for(k = 0; k < nz; k++)
{
for(j = 0; j < ny; j++)
{
for(i = 0; i < nx; i++)
{
for(m = 0; m < 5; m++)
{
frct[k][j][i][m] = 0.0;
}
}
}
}
#pragma omp for
for(k = 0; k < nz; k++)
{
zeta = ((double) k) / (nz - 1);
for(j = 0; j < ny; j++)
{
eta = ((double) j) / (ny0 - 1);
for(i = 0; i < nx; i++)
{
xi = ((double) i) / (nx0 - 1);
for(m = 0; m < 5; m++)
{
rsd[k][j][i][m] =
ce[0][m] +
(ce[1][m] + (ce[4][m] + (ce[7][m] + ce[10][m] * xi) * xi) * xi) *
xi +
(ce[2][m] +
(ce[5][m] + (ce[8][m] + ce[11][m] * eta) * eta) * eta) *
eta +
(ce[3][m] +
(ce[6][m] + (ce[9][m] + ce[12][m] * zeta) * zeta) * zeta) *
zeta;
}
}
}
}
/*
* ---------------------------------------------------------------------
* xi-direction flux differences
* ---------------------------------------------------------------------
*/
#pragma omp for
for(k = 1; k < nz - 1; k++)
{
for(j = jst; j < jend; j++)
{
for(i = 0; i < nx; i++)
{
flux[i][0] = rsd[k][j][i][1];
u21 = rsd[k][j][i][1] / rsd[k][j][i][0];
q = 0.50 *
(rsd[k][j][i][1] * rsd[k][j][i][1] +
rsd[k][j][i][2] * rsd[k][j][i][2] +
rsd[k][j][i][3] * rsd[k][j][i][3]) /
rsd[k][j][i][0];
flux[i][1] = rsd[k][j][i][1] * u21 + C2 * (rsd[k][j][i][4] - q);
flux[i][2] = rsd[k][j][i][2] * u21;
flux[i][3] = rsd[k][j][i][3] * u21;
flux[i][4] = (C1 * rsd[k][j][i][4] - C2 * q) * u21;
}
for(i = ist; i < iend; i++)
{
for(m = 0; m < 5; m++)
{
frct[k][j][i][m] =
frct[k][j][i][m] - tx2 * (flux[i + 1][m] - flux[i - 1][m]);
}
}
for(i = ist; i < nx; i++)
{
tmp = 1.0 / rsd[k][j][i][0];
u21i = tmp * rsd[k][j][i][1];
u31i = tmp * rsd[k][j][i][2];
u41i = tmp * rsd[k][j][i][3];
u51i = tmp * rsd[k][j][i][4];
tmp = 1.0 / rsd[k][j][i - 1][0];
u21im1 = tmp * rsd[k][j][i - 1][1];
u31im1 = tmp * rsd[k][j][i - 1][2];
u41im1 = tmp * rsd[k][j][i - 1][3];
u51im1 = tmp * rsd[k][j][i - 1][4];
flux[i][1] = (4.0 / 3.0) * tx3 * (u21i - u21im1);
flux[i][2] = tx3 * (u31i - u31im1);
flux[i][3] = tx3 * (u41i - u41im1);
flux[i][4] = 0.50 * (1.0 - C1 * C5) * tx3 *
((u21i * u21i + u31i * u31i + u41i * u41i) -
(u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1)) +
(1.0 / 6.0) * tx3 * (u21i * u21i - u21im1 * u21im1) +
C1 * C5 * tx3 * (u51i - u51im1);
}
for(i = ist; i < iend; i++)
{
frct[k][j][i][0] =
frct[k][j][i][0] + dx1 * tx1 *
(rsd[k][j][i - 1][0] - 2.0 * rsd[k][j][i][0] +
rsd[k][j][i + 1][0]);
frct[k][j][i][1] = frct[k][j][i][1] +
tx3 * C3 * C4 * (flux[i + 1][1] - flux[i][1]) +
dx2 * tx1 *
(rsd[k][j][i - 1][1] - 2.0 * rsd[k][j][i][1] +
rsd[k][j][i + 1][1]);
frct[k][j][i][2] = frct[k][j][i][2] +
tx3 * C3 * C4 * (flux[i + 1][2] - flux[i][2]) +
dx3 * tx1 *
(rsd[k][j][i - 1][2] - 2.0 * rsd[k][j][i][2] +
rsd[k][j][i + 1][2]);
frct[k][j][i][3] = frct[k][j][i][3] +
tx3 * C3 * C4 * (flux[i + 1][3] - flux[i][3]) +
dx4 * tx1 *
(rsd[k][j][i - 1][3] - 2.0 * rsd[k][j][i][3] +
rsd[k][j][i + 1][3]);
frct[k][j][i][4] = frct[k][j][i][4] +
tx3 * C3 * C4 * (flux[i + 1][4] - flux[i][4]) +
dx5 * tx1 *
(rsd[k][j][i - 1][4] - 2.0 * rsd[k][j][i][4] +
rsd[k][j][i + 1][4]);
}
/*
* ---------------------------------------------------------------------
* fourth-order dissipation
* ---------------------------------------------------------------------
*/
for(m = 0; m < 5; m++)
{
frct[k][j][1][m] =
frct[k][j][1][m] - dssp * (+5.0 * rsd[k][j][1][m] -
4.0 * rsd[k][j][2][m] + rsd[k][j][3][m]);
frct[k][j][2][m] =
frct[k][j][2][m] -
dssp * (-4.0 * rsd[k][j][1][m] + 6.0 * rsd[k][j][2][m] -
4.0 * rsd[k][j][3][m] + rsd[k][j][4][m]);
}
for(i = 3; i < nx - 3; i++)
{
for(m = 0; m < 5; m++)
{
frct[k][j][i][m] =
frct[k][j][i][m] -
dssp * (rsd[k][j][i - 2][m] - 4.0 * rsd[k][j][i - 1][m] +
6.0 * rsd[k][j][i][m] - 4.0 * rsd[k][j][i + 1][m] +
rsd[k][j][i + 2][m]);
}
}
for(m = 0; m < 5; m++)
{
frct[k][j][nx - 3][m] =
frct[k][j][nx - 3][m] -
dssp * (rsd[k][j][nx - 5][m] - 4.0 * rsd[k][j][nx - 4][m] +
6.0 * rsd[k][j][nx - 3][m] - 4.0 * rsd[k][j][nx - 2][m]);
frct[k][j][nx - 2][m] =
frct[k][j][nx - 2][m] -
dssp * (rsd[k][j][nx - 4][m] - 4.0 * rsd[k][j][nx - 3][m] +
5.0 * rsd[k][j][nx - 2][m]);
}
}
}
/*
* ---------------------------------------------------------------------
* eta-direction flux differences
* ---------------------------------------------------------------------
*/
#pragma omp for
for(k = 1; k < nz - 1; k++)
{
for(i = ist; i < iend; i++)
{
for(j = 0; j < ny; j++)
{
flux[j][0] = rsd[k][j][i][2];
u31 = rsd[k][j][i][2] / rsd[k][j][i][0];
q = 0.50 *
(rsd[k][j][i][1] * rsd[k][j][i][1] +
rsd[k][j][i][2] * rsd[k][j][i][2] +
rsd[k][j][i][3] * rsd[k][j][i][3]) /
rsd[k][j][i][0];
flux[j][1] = rsd[k][j][i][1] * u31;
flux[j][2] = rsd[k][j][i][2] * u31 + C2 * (rsd[k][j][i][4] - q);
flux[j][3] = rsd[k][j][i][3] * u31;
flux[j][4] = (C1 * rsd[k][j][i][4] - C2 * q) * u31;
}
for(j = jst; j < jend; j++)
{
for(m = 0; m < 5; m++)
{
frct[k][j][i][m] =
frct[k][j][i][m] - ty2 * (flux[j + 1][m] - flux[j - 1][m]);
}
}
for(j = jst; j < ny; j++)
{
tmp = 1.0 / rsd[k][j][i][0];
u21j = tmp * rsd[k][j][i][1];
u31j = tmp * rsd[k][j][i][2];
u41j = tmp * rsd[k][j][i][3];
u51j = tmp * rsd[k][j][i][4];
tmp = 1.0 / rsd[k][j - 1][i][0];
u21jm1 = tmp * rsd[k][j - 1][i][1];
u31jm1 = tmp * rsd[k][j - 1][i][2];
u41jm1 = tmp * rsd[k][j - 1][i][3];
u51jm1 = tmp * rsd[k][j - 1][i][4];
flux[j][1] = ty3 * (u21j - u21jm1);
flux[j][2] = (4.0 / 3.0) * ty3 * (u31j - u31jm1);
flux[j][3] = ty3 * (u41j - u41jm1);
flux[j][4] = 0.50 * (1.0 - C1 * C5) * ty3 *
((u21j * u21j + u31j * u31j + u41j * u41j) -
(u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1)) +
(1.0 / 6.0) * ty3 * (u31j * u31j - u31jm1 * u31jm1) +
C1 * C5 * ty3 * (u51j - u51jm1);
}
for(j = jst; j < jend; j++)
{
frct[k][j][i][0] =
frct[k][j][i][0] + dy1 * ty1 *
(rsd[k][j - 1][i][0] - 2.0 * rsd[k][j][i][0] +
rsd[k][j + 1][i][0]);
frct[k][j][i][1] = frct[k][j][i][1] +
ty3 * C3 * C4 * (flux[j + 1][1] - flux[j][1]) +
dy2 * ty1 *
(rsd[k][j - 1][i][1] - 2.0 * rsd[k][j][i][1] +
rsd[k][j + 1][i][1]);
frct[k][j][i][2] = frct[k][j][i][2] +
ty3 * C3 * C4 * (flux[j + 1][2] - flux[j][2]) +
dy3 * ty1 *
(rsd[k][j - 1][i][2] - 2.0 * rsd[k][j][i][2] +
rsd[k][j + 1][i][2]);
frct[k][j][i][3] = frct[k][j][i][3] +
ty3 * C3 * C4 * (flux[j + 1][3] - flux[j][3]) +
dy4 * ty1 *
(rsd[k][j - 1][i][3] - 2.0 * rsd[k][j][i][3] +
rsd[k][j + 1][i][3]);
frct[k][j][i][4] = frct[k][j][i][4] +
ty3 * C3 * C4 * (flux[j + 1][4] - flux[j][4]) +
dy5 * ty1 *
(rsd[k][j - 1][i][4] - 2.0 * rsd[k][j][i][4] +
rsd[k][j + 1][i][4]);
}
/*
* ---------------------------------------------------------------------
* fourth-order dissipation
* ---------------------------------------------------------------------
*/
for(m = 0; m < 5; m++)
{
frct[k][1][i][m] =
frct[k][1][i][m] - dssp * (+5.0 * rsd[k][1][i][m] -
4.0 * rsd[k][2][i][m] + rsd[k][3][i][m]);
frct[k][2][i][m] =
frct[k][2][i][m] -
dssp * (-4.0 * rsd[k][1][i][m] + 6.0 * rsd[k][2][i][m] -
4.0 * rsd[k][3][i][m] + rsd[k][4][i][m]);
}
for(j = 3; j < ny - 3; j++)
{
for(m = 0; m < 5; m++)
{
frct[k][j][i][m] =
frct[k][j][i][m] -
dssp * (rsd[k][j - 2][i][m] - 4.0 * rsd[k][j - 1][i][m] +
6.0 * rsd[k][j][i][m] - 4.0 * rsd[k][j + 1][i][m] +
rsd[k][j + 2][i][m]);
}
}
for(m = 0; m < 5; m++)
{
frct[k][ny - 3][i][m] =
frct[k][ny - 3][i][m] -
dssp * (rsd[k][ny - 5][i][m] - 4.0 * rsd[k][ny - 4][i][m] +
6.0 * rsd[k][ny - 3][i][m] - 4.0 * rsd[k][ny - 2][i][m]);
frct[k][ny - 2][i][m] =
frct[k][ny - 2][i][m] -
dssp * (rsd[k][ny - 4][i][m] - 4.0 * rsd[k][ny - 3][i][m] +
5.0 * rsd[k][ny - 2][i][m]);
}
}
}
/*
* ---------------------------------------------------------------------
* zeta-direction flux differences
* ---------------------------------------------------------------------
*/
#pragma omp for
for(j = jst; j < jend; j++)
{
for(i = ist; i < iend; i++)
{
for(k = 0; k < nz; k++)
{
flux[k][0] = rsd[k][j][i][3];
u41 = rsd[k][j][i][3] / rsd[k][j][i][0];
q = 0.50 *
(rsd[k][j][i][1] * rsd[k][j][i][1] +
rsd[k][j][i][2] * rsd[k][j][i][2] +
rsd[k][j][i][3] * rsd[k][j][i][3]) /
rsd[k][j][i][0];
flux[k][1] = rsd[k][j][i][1] * u41;
flux[k][2] = rsd[k][j][i][2] * u41;
flux[k][3] = rsd[k][j][i][3] * u41 + C2 * (rsd[k][j][i][4] - q);
flux[k][4] = (C1 * rsd[k][j][i][4] - C2 * q) * u41;
}
for(k = 1; k < nz - 1; k++)
{
for(m = 0; m < 5; m++)
{
frct[k][j][i][m] =
frct[k][j][i][m] - tz2 * (flux[k + 1][m] - flux[k - 1][m]);
}
}
for(k = 1; k < nz; k++)
{
tmp = 1.0 / rsd[k][j][i][0];
u21k = tmp * rsd[k][j][i][1];
u31k = tmp * rsd[k][j][i][2];
u41k = tmp * rsd[k][j][i][3];
u51k = tmp * rsd[k][j][i][4];
tmp = 1.0 / rsd[k - 1][j][i][0];
u21km1 = tmp * rsd[k - 1][j][i][1];
u31km1 = tmp * rsd[k - 1][j][i][2];
u41km1 = tmp * rsd[k - 1][j][i][3];
u51km1 = tmp * rsd[k - 1][j][i][4];
flux[k][1] = tz3 * (u21k - u21km1);
flux[k][2] = tz3 * (u31k - u31km1);
flux[k][3] = (4.0 / 3.0) * tz3 * (u41k - u41km1);
flux[k][4] = 0.50 * (1.0 - C1 * C5) * tz3 *
((u21k * u21k + u31k * u31k + u41k * u41k) -
(u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1)) +
(1.0 / 6.0) * tz3 * (u41k * u41k - u41km1 * u41km1) +
C1 * C5 * tz3 * (u51k - u51km1);
}
for(k = 1; k < nz - 1; k++)
{
frct[k][j][i][0] =
frct[k][j][i][0] + dz1 * tz1 *
(rsd[k + 1][j][i][0] - 2.0 * rsd[k][j][i][0] +
rsd[k - 1][j][i][0]);
frct[k][j][i][1] = frct[k][j][i][1] +
tz3 * C3 * C4 * (flux[k + 1][1] - flux[k][1]) +
dz2 * tz1 *
(rsd[k + 1][j][i][1] - 2.0 * rsd[k][j][i][1] +
rsd[k - 1][j][i][1]);
frct[k][j][i][2] = frct[k][j][i][2] +
tz3 * C3 * C4 * (flux[k + 1][2] - flux[k][2]) +
dz3 * tz1 *
(rsd[k + 1][j][i][2] - 2.0 * rsd[k][j][i][2] +
rsd[k - 1][j][i][2]);
frct[k][j][i][3] = frct[k][j][i][3] +
tz3 * C3 * C4 * (flux[k + 1][3] - flux[k][3]) +
dz4 * tz1 *
(rsd[k + 1][j][i][3] - 2.0 * rsd[k][j][i][3] +
rsd[k - 1][j][i][3]);
frct[k][j][i][4] = frct[k][j][i][4] +
tz3 * C3 * C4 * (flux[k + 1][4] - flux[k][4]) +
dz5 * tz1 *
(rsd[k + 1][j][i][4] - 2.0 * rsd[k][j][i][4] +
rsd[k - 1][j][i][4]);
}
/*
* ---------------------------------------------------------------------
* fourth-order dissipation
* ---------------------------------------------------------------------
*/
for(m = 0; m < 5; m++)
{
frct[1][j][i][m] =
frct[1][j][i][m] - dssp * (+5.0 * rsd[1][j][i][m] -
4.0 * rsd[2][j][i][m] + rsd[3][j][i][m]);
frct[2][j][i][m] =
frct[2][j][i][m] -
dssp * (-4.0 * rsd[1][j][i][m] + 6.0 * rsd[2][j][i][m] -
4.0 * rsd[3][j][i][m] + rsd[4][j][i][m]);
}
for(k = 3; k < nz - 3; k++)
{
for(m = 0; m < 5; m++)
{
frct[k][j][i][m] =
frct[k][j][i][m] -
dssp * (rsd[k - 2][j][i][m] - 4.0 * rsd[k - 1][j][i][m] +
6.0 * rsd[k][j][i][m] - 4.0 * rsd[k + 1][j][i][m] +
rsd[k + 2][j][i][m]);
}
}
for(m = 0; m < 5; m++)
{
frct[nz - 3][j][i][m] =
frct[nz - 3][j][i][m] -
dssp * (rsd[nz - 5][j][i][m] - 4.0 * rsd[nz - 4][j][i][m] +
6.0 * rsd[nz - 3][j][i][m] - 4.0 * rsd[nz - 2][j][i][m]);
frct[nz - 2][j][i][m] =
frct[nz - 2][j][i][m] -
dssp * (rsd[nz - 4][j][i][m] - 4.0 * rsd[nz - 3][j][i][m] +
5.0 * rsd[nz - 2][j][i][m]);
}
}
}
}
/*
* ---------------------------------------------------------------------
* compute the solution error
* ---------------------------------------------------------------------
*/
void
error()
{
/*
* ---------------------------------------------------------------------
* local variables
* ---------------------------------------------------------------------
*/
int i, j, k, m;
double tmp;
double u000ijk[5];
for(m = 0; m < 5; m++)
{
errnm[m] = 0.0;
}
for(k = 1; k < nz - 1; k++)
{
for(j = jst; j < jend; j++)
{
for(i = ist; i < iend; i++)
{
exact(i, j, k, u000ijk);
for(m = 0; m < 5; m++)
{
tmp = (u000ijk[m] - u[k][j][i][m]);
errnm[m] = errnm[m] + tmp * tmp;
}
}
}
}
for(m = 0; m < 5; m++)
{
errnm[m] = sqrt(errnm[m] / ((nx0 - 2) * (ny0 - 2) * (nz0 - 2)));
}
}
/*
* ---------------------------------------------------------------------
* compute the exact solution at (i,j,k)
* ---------------------------------------------------------------------
*/
void
exact(int i, int j, int k, double u000ijk[])
{
/*
* ---------------------------------------------------------------------
* local variables
* ---------------------------------------------------------------------
*/
int m;
double xi, eta, zeta;
xi = ((double) i) / (nx0 - 1);
eta = ((double) j) / (ny0 - 1);
zeta = ((double) k) / (nz - 1);
for(m = 0; m < 5; m++)
{
u000ijk[m] =
ce[0][m] +
(ce[1][m] + (ce[4][m] + (ce[7][m] + ce[10][m] * xi) * xi) * xi) * xi +
(ce[2][m] + (ce[5][m] + (ce[8][m] + ce[11][m] * eta) * eta) * eta) * eta +
(ce[3][m] + (ce[6][m] + (ce[9][m] + ce[12][m] * zeta) * zeta) * zeta) * zeta;
}
}
/*
* ---------------------------------------------------------------------
* compute the lower triangular part of the jacobian matrix
* ---------------------------------------------------------------------
*/
void
jacld(int k)
{
/*
* ---------------------------------------------------------------------
* local variables
* ---------------------------------------------------------------------
*/
int i, j;
double r43;
double c1345;
double c34;
double tmp1, tmp2, tmp3;
r43 = (4.0 / 3.0);
c1345 = C1 * C3 * C4 * C5;
c34 = C3 * C4;
#pragma omp for nowait schedule(static)
for(j = jst; j < jend; j++)
{
for(i = ist; i < iend; i++)
{
/*
* ---------------------------------------------------------------------
* form the block daigonal
* ---------------------------------------------------------------------
*/
tmp1 = rho_i[k][j][i];
tmp2 = tmp1 * tmp1;
tmp3 = tmp1 * tmp2;
d[j][i][0][0] = 1.0 + dt * 2.0 * (tx1 * dx1 + ty1 * dy1 + tz1 * dz1);
d[j][i][1][0] = 0.0;
d[j][i][2][0] = 0.0;
d[j][i][3][0] = 0.0;
d[j][i][4][0] = 0.0;
d[j][i][0][1] =
-dt * 2.0 * (tx1 * r43 + ty1 + tz1) * c34 * tmp2 * u[k][j][i][1];
d[j][i][1][1] = 1.0 + dt * 2.0 * c34 * tmp1 * (tx1 * r43 + ty1 + tz1) +
dt * 2.0 * (tx1 * dx2 + ty1 * dy2 + tz1 * dz2);
d[j][i][2][1] = 0.0;
d[j][i][3][1] = 0.0;
d[j][i][4][1] = 0.0;
d[j][i][0][2] =
-dt * 2.0 * (tx1 + ty1 * r43 + tz1) * c34 * tmp2 * u[k][j][i][2];
d[j][i][1][2] = 0.0;
d[j][i][2][2] = 1.0 + dt * 2.0 * c34 * tmp1 * (tx1 + ty1 * r43 + tz1) +
dt * 2.0 * (tx1 * dx3 + ty1 * dy3 + tz1 * dz3);
d[j][i][3][2] = 0.0;
d[j][i][4][2] = 0.0;
d[j][i][0][3] =
-dt * 2.0 * (tx1 + ty1 + tz1 * r43) * c34 * tmp2 * u[k][j][i][3];
d[j][i][1][3] = 0.0;
d[j][i][2][3] = 0.0;
d[j][i][3][3] = 1.0 + dt * 2.0 * c34 * tmp1 * (tx1 + ty1 + tz1 * r43) +
dt * 2.0 * (tx1 * dx4 + ty1 * dy4 + tz1 * dz4);
d[j][i][4][3] = 0.0;
d[j][i][0][4] = -dt * 2.0 *
(((tx1 * (r43 * c34 - c1345) + ty1 * (c34 - c1345) +
tz1 * (c34 - c1345)) *
(u[k][j][i][1] * u[k][j][i][1]) +
(tx1 * (c34 - c1345) + ty1 * (r43 * c34 - c1345) +
tz1 * (c34 - c1345)) *
(u[k][j][i][2] * u[k][j][i][2]) +
(tx1 * (c34 - c1345) + ty1 * (c34 - c1345) +
tz1 * (r43 * c34 - c1345)) *
(u[k][j][i][3] * u[k][j][i][3])) *
tmp3 +
(tx1 + ty1 + tz1) * c1345 * tmp2 * u[k][j][i][4]);
d[j][i][1][4] =
dt * 2.0 * tmp2 * u[k][j][i][1] *
(tx1 * (r43 * c34 - c1345) + ty1 * (c34 - c1345) + tz1 * (c34 - c1345));
d[j][i][2][4] =
dt * 2.0 * tmp2 * u[k][j][i][2] *
(tx1 * (c34 - c1345) + ty1 * (r43 * c34 - c1345) + tz1 * (c34 - c1345));
d[j][i][3][4] =
dt * 2.0 * tmp2 * u[k][j][i][3] *
(tx1 * (c34 - c1345) + ty1 * (c34 - c1345) + tz1 * (r43 * c34 - c1345));
d[j][i][4][4] = 1.0 + dt * 2.0 * (tx1 + ty1 + tz1) * c1345 * tmp1 +
dt * 2.0 * (tx1 * dx5 + ty1 * dy5 + tz1 * dz5);
/*
* ---------------------------------------------------------------------
* form the first block sub-diagonal
* ---------------------------------------------------------------------
*/
tmp1 = rho_i[k - 1][j][i];
tmp2 = tmp1 * tmp1;
tmp3 = tmp1 * tmp2;
a[j][i][0][0] = -dt * tz1 * dz1;
a[j][i][1][0] = 0.0;
a[j][i][2][0] = 0.0;
a[j][i][3][0] = -dt * tz2;
a[j][i][4][0] = 0.0;
a[j][i][0][1] =
-dt * tz2 * (-(u[k - 1][j][i][1] * u[k - 1][j][i][3]) * tmp2) -
dt * tz1 * (-c34 * tmp2 * u[k - 1][j][i][1]);
a[j][i][1][1] = -dt * tz2 * (u[k - 1][j][i][3] * tmp1) -
dt * tz1 * c34 * tmp1 - dt * tz1 * dz2;
a[j][i][2][1] = 0.0;
a[j][i][3][1] = -dt * tz2 * (u[k - 1][j][i][1] * tmp1);
a[j][i][4][1] = 0.0;
a[j][i][0][2] =
-dt * tz2 * (-(u[k - 1][j][i][2] * u[k - 1][j][i][3]) * tmp2) -
dt * tz1 * (-c34 * tmp2 * u[k - 1][j][i][2]);
a[j][i][1][2] = 0.0;
a[j][i][2][2] = -dt * tz2 * (u[k - 1][j][i][3] * tmp1) -
dt * tz1 * (c34 * tmp1) - dt * tz1 * dz3;
a[j][i][3][2] = -dt * tz2 * (u[k - 1][j][i][2] * tmp1);
a[j][i][4][2] = 0.0;
a[j][i][0][3] =
-dt * tz2 *
(-(u[k - 1][j][i][3] * tmp1) * (u[k - 1][j][i][3] * tmp1) +
C2 * qs[k - 1][j][i] * tmp1) -
dt * tz1 * (-r43 * c34 * tmp2 * u[k - 1][j][i][3]);
a[j][i][1][3] = -dt * tz2 * (-C2 * (u[k - 1][j][i][1] * tmp1));
a[j][i][2][3] = -dt * tz2 * (-C2 * (u[k - 1][j][i][2] * tmp1));
a[j][i][3][3] = -dt * tz2 * (2.0 - C2) * (u[k - 1][j][i][3] * tmp1) -
dt * tz1 * (r43 * c34 * tmp1) - dt * tz1 * dz4;
a[j][i][4][3] = -dt * tz2 * C2;
a[j][i][0][4] =
-dt * tz2 *
((C2 * 2.0 * qs[k - 1][j][i] - C1 * u[k - 1][j][i][4]) *
u[k - 1][j][i][3] * tmp2) -
dt * tz1 *
(-(c34 - c1345) * tmp3 * (u[k - 1][j][i][1] * u[k - 1][j][i][1]) -
(c34 - c1345) * tmp3 * (u[k - 1][j][i][2] * u[k - 1][j][i][2]) -
(r43 * c34 - c1345) * tmp3 *
(u[k - 1][j][i][3] * u[k - 1][j][i][3]) -
c1345 * tmp2 * u[k - 1][j][i][4]);
a[j][i][1][4] =
-dt * tz2 * (-C2 * (u[k - 1][j][i][1] * u[k - 1][j][i][3]) * tmp2) -
dt * tz1 * (c34 - c1345) * tmp2 * u[k - 1][j][i][1];
a[j][i][2][4] =
-dt * tz2 * (-C2 * (u[k - 1][j][i][2] * u[k - 1][j][i][3]) * tmp2) -
dt * tz1 * (c34 - c1345) * tmp2 * u[k - 1][j][i][2];
a[j][i][3][4] = -dt * tz2 *
(C1 * (u[k - 1][j][i][4] * tmp1) -
C2 * (qs[k - 1][j][i] * tmp1 +
u[k - 1][j][i][3] * u[k - 1][j][i][3] * tmp2)) -
dt * tz1 * (r43 * c34 - c1345) * tmp2 * u[k - 1][j][i][3];
a[j][i][4][4] = -dt * tz2 * (C1 * (u[k - 1][j][i][3] * tmp1)) -
dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5;
/*
* ---------------------------------------------------------------------
* form the second block sub-diagonal
* ---------------------------------------------------------------------
*/
tmp1 = rho_i[k][j - 1][i];
tmp2 = tmp1 * tmp1;
tmp3 = tmp1 * tmp2;
b[j][i][0][0] = -dt * ty1 * dy1;
b[j][i][1][0] = 0.0;
b[j][i][2][0] = -dt * ty2;
b[j][i][3][0] = 0.0;
b[j][i][4][0] = 0.0;
b[j][i][0][1] =
-dt * ty2 * (-(u[k][j - 1][i][1] * u[k][j - 1][i][2]) * tmp2) -
dt * ty1 * (-c34 * tmp2 * u[k][j - 1][i][1]);
b[j][i][1][1] = -dt * ty2 * (u[k][j - 1][i][2] * tmp1) -
dt * ty1 * (c34 * tmp1) - dt * ty1 * dy2;
b[j][i][2][1] = -dt * ty2 * (u[k][j - 1][i][1] * tmp1);
b[j][i][3][1] = 0.0;
b[j][i][4][1] = 0.0;
b[j][i][0][2] =
-dt * ty2 *
(-(u[k][j - 1][i][2] * tmp1) * (u[k][j - 1][i][2] * tmp1) +
C2 * (qs[k][j - 1][i] * tmp1)) -
dt * ty1 * (-r43 * c34 * tmp2 * u[k][j - 1][i][2]);
b[j][i][1][2] = -dt * ty2 * (-C2 * (u[k][j - 1][i][1] * tmp1));
b[j][i][2][2] = -dt * ty2 * ((2.0 - C2) * (u[k][j - 1][i][2] * tmp1)) -
dt * ty1 * (r43 * c34 * tmp1) - dt * ty1 * dy3;
b[j][i][3][2] = -dt * ty2 * (-C2 * (u[k][j - 1][i][3] * tmp1));
b[j][i][4][2] = -dt * ty2 * C2;
b[j][i][0][3] =
-dt * ty2 * (-(u[k][j - 1][i][2] * u[k][j - 1][i][3]) * tmp2) -
dt * ty1 * (-c34 * tmp2 * u[k][j - 1][i][3]);
b[j][i][1][3] = 0.0;
b[j][i][2][3] = -dt * ty2 * (u[k][j - 1][i][3] * tmp1);
b[j][i][3][3] = -dt * ty2 * (u[k][j - 1][i][2] * tmp1) -
dt * ty1 * (c34 * tmp1) - dt * ty1 * dy4;
b[j][i][4][3] = 0.0;
b[j][i][0][4] =
-dt * ty2 *
((C2 * 2.0 * qs[k][j - 1][i] - C1 * u[k][j - 1][i][4]) *
(u[k][j - 1][i][2] * tmp2)) -
dt * ty1 *
(-(c34 - c1345) * tmp3 * (u[k][j - 1][i][1] * u[k][j - 1][i][1]) -
(r43 * c34 - c1345) * tmp3 *
(u[k][j - 1][i][2] * u[k][j - 1][i][2]) -
(c34 - c1345) * tmp3 * (u[k][j - 1][i][3] * u[k][j - 1][i][3]) -
c1345 * tmp2 * u[k][j - 1][i][4]);
b[j][i][1][4] =
-dt * ty2 * (-C2 * (u[k][j - 1][i][1] * u[k][j - 1][i][2]) * tmp2) -
dt * ty1 * (c34 - c1345) * tmp2 * u[k][j - 1][i][1];
b[j][i][2][4] = -dt * ty2 *
(C1 * (u[k][j - 1][i][4] * tmp1) -
C2 * (qs[k][j - 1][i] * tmp1 +
u[k][j - 1][i][2] * u[k][j - 1][i][2] * tmp2)) -
dt * ty1 * (r43 * c34 - c1345) * tmp2 * u[k][j - 1][i][2];
b[j][i][3][4] =
-dt * ty2 * (-C2 * (u[k][j - 1][i][2] * u[k][j - 1][i][3]) * tmp2) -
dt * ty1 * (c34 - c1345) * tmp2 * u[k][j - 1][i][3];
b[j][i][4][4] = -dt * ty2 * (C1 * (u[k][j - 1][i][2] * tmp1)) -
dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5;
/*
* ---------------------------------------------------------------------
* form the third block sub-diagonal
* ---------------------------------------------------------------------
*/
tmp1 = rho_i[k][j][i - 1];
tmp2 = tmp1 * tmp1;
tmp3 = tmp1 * tmp2;
c[j][i][0][0] = -dt * tx1 * dx1;
c[j][i][1][0] = -dt * tx2;
c[j][i][2][0] = 0.0;
c[j][i][3][0] = 0.0;
c[j][i][4][0] = 0.0;
c[j][i][0][1] =
-dt * tx2 *
(-(u[k][j][i - 1][1] * tmp1) * (u[k][j][i - 1][1] * tmp1) +
C2 * qs[k][j][i - 1] * tmp1) -
dt * tx1 * (-r43 * c34 * tmp2 * u[k][j][i - 1][1]);
c[j][i][1][1] = -dt * tx2 * ((2.0 - C2) * (u[k][j][i - 1][1] * tmp1)) -
dt * tx1 * (r43 * c34 * tmp1) - dt * tx1 * dx2;
c[j][i][2][1] = -dt * tx2 * (-C2 * (u[k][j][i - 1][2] * tmp1));
c[j][i][3][1] = -dt * tx2 * (-C2 * (u[k][j][i - 1][3] * tmp1));
c[j][i][4][1] = -dt * tx2 * C2;
c[j][i][0][2] =
-dt * tx2 * (-(u[k][j][i - 1][1] * u[k][j][i - 1][2]) * tmp2) -
dt * tx1 * (-c34 * tmp2 * u[k][j][i - 1][2]);
c[j][i][1][2] = -dt * tx2 * (u[k][j][i - 1][2] * tmp1);
c[j][i][2][2] = -dt * tx2 * (u[k][j][i - 1][1] * tmp1) -
dt * tx1 * (c34 * tmp1) - dt * tx1 * dx3;
c[j][i][3][2] = 0.0;
c[j][i][4][2] = 0.0;
c[j][i][0][3] =
-dt * tx2 * (-(u[k][j][i - 1][1] * u[k][j][i - 1][3]) * tmp2) -
dt * tx1 * (-c34 * tmp2 * u[k][j][i - 1][3]);
c[j][i][1][3] = -dt * tx2 * (u[k][j][i - 1][3] * tmp1);
c[j][i][2][3] = 0.0;
c[j][i][3][3] = -dt * tx2 * (u[k][j][i - 1][1] * tmp1) -
dt * tx1 * (c34 * tmp1) - dt * tx1 * dx4;
c[j][i][4][3] = 0.0;
c[j][i][0][4] =
-dt * tx2 *
((C2 * 2.0 * qs[k][j][i - 1] - C1 * u[k][j][i - 1][4]) *
u[k][j][i - 1][1] * tmp2) -
dt * tx1 *
(-(r43 * c34 - c1345) * tmp3 *
(u[k][j][i - 1][1] * u[k][j][i - 1][1]) -
(c34 - c1345) * tmp3 * (u[k][j][i - 1][2] * u[k][j][i - 1][2]) -
(c34 - c1345) * tmp3 * (u[k][j][i - 1][3] * u[k][j][i - 1][3]) -
c1345 * tmp2 * u[k][j][i - 1][4]);
c[j][i][1][4] = -dt * tx2 *
(C1 * (u[k][j][i - 1][4] * tmp1) -
C2 * (u[k][j][i - 1][1] * u[k][j][i - 1][1] * tmp2 +
qs[k][j][i - 1] * tmp1)) -
dt * tx1 * (r43 * c34 - c1345) * tmp2 * u[k][j][i - 1][1];
c[j][i][2][4] =
-dt * tx2 * (-C2 * (u[k][j][i - 1][2] * u[k][j][i - 1][1]) * tmp2) -
dt * tx1 * (c34 - c1345) * tmp2 * u[k][j][i - 1][2];
c[j][i][3][4] =
-dt * tx2 * (-C2 * (u[k][j][i - 1][3] * u[k][j][i - 1][1]) * tmp2) -
dt * tx1 * (c34 - c1345) * tmp2 * u[k][j][i - 1][3];
c[j][i][4][4] = -dt * tx2 * (C1 * (u[k][j][i - 1][1] * tmp1)) -
dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5;
}
}
}
/*
* ---------------------------------------------------------------------
* compute the upper triangular part of the jacobian matrix
* ---------------------------------------------------------------------
*/
void
jacu(int k)
{
/*
* ---------------------------------------------------------------------
* local variables
* ---------------------------------------------------------------------
*/
int i, j;
double r43;
double c1345;
double c34;
double tmp1, tmp2, tmp3;
r43 = (4.0 / 3.0);
c1345 = C1 * C3 * C4 * C5;
c34 = C3 * C4;
#pragma omp for nowait schedule(static)
for(j = jend - 1; j >= jst; j--)
{
for(i = iend - 1; i >= ist; i--)
{
/*
* ---------------------------------------------------------------------
* form the block daigonal
* ---------------------------------------------------------------------
*/
tmp1 = rho_i[k][j][i];
tmp2 = tmp1 * tmp1;
tmp3 = tmp1 * tmp2;
d[j][i][0][0] = 1.0 + dt * 2.0 * (tx1 * dx1 + ty1 * dy1 + tz1 * dz1);
d[j][i][1][0] = 0.0;
d[j][i][2][0] = 0.0;
d[j][i][3][0] = 0.0;
d[j][i][4][0] = 0.0;
d[j][i][0][1] =
dt * 2.0 * (-tx1 * r43 - ty1 - tz1) * (c34 * tmp2 * u[k][j][i][1]);
d[j][i][1][1] = 1.0 + dt * 2.0 * c34 * tmp1 * (tx1 * r43 + ty1 + tz1) +
dt * 2.0 * (tx1 * dx2 + ty1 * dy2 + tz1 * dz2);
d[j][i][2][1] = 0.0;
d[j][i][3][1] = 0.0;
d[j][i][4][1] = 0.0;
d[j][i][0][2] =
dt * 2.0 * (-tx1 - ty1 * r43 - tz1) * (c34 * tmp2 * u[k][j][i][2]);
d[j][i][1][2] = 0.0;
d[j][i][2][2] = 1.0 + dt * 2.0 * c34 * tmp1 * (tx1 + ty1 * r43 + tz1) +
dt * 2.0 * (tx1 * dx3 + ty1 * dy3 + tz1 * dz3);
d[j][i][3][2] = 0.0;
d[j][i][4][2] = 0.0;
d[j][i][0][3] =
dt * 2.0 * (-tx1 - ty1 - tz1 * r43) * (c34 * tmp2 * u[k][j][i][3]);
d[j][i][1][3] = 0.0;
d[j][i][2][3] = 0.0;
d[j][i][3][3] = 1.0 + dt * 2.0 * c34 * tmp1 * (tx1 + ty1 + tz1 * r43) +
dt * 2.0 * (tx1 * dx4 + ty1 * dy4 + tz1 * dz4);
d[j][i][4][3] = 0.0;
d[j][i][0][4] = -dt * 2.0 *
(((tx1 * (r43 * c34 - c1345) + ty1 * (c34 - c1345) +
tz1 * (c34 - c1345)) *
(u[k][j][i][1] * u[k][j][i][1]) +
(tx1 * (c34 - c1345) + ty1 * (r43 * c34 - c1345) +
tz1 * (c34 - c1345)) *
(u[k][j][i][2] * u[k][j][i][2]) +
(tx1 * (c34 - c1345) + ty1 * (c34 - c1345) +
tz1 * (r43 * c34 - c1345)) *
(u[k][j][i][3] * u[k][j][i][3])) *
tmp3 +
(tx1 + ty1 + tz1) * c1345 * tmp2 * u[k][j][i][4]);
d[j][i][1][4] =
dt * 2.0 *
(tx1 * (r43 * c34 - c1345) + ty1 * (c34 - c1345) + tz1 * (c34 - c1345)) *
tmp2 * u[k][j][i][1];
d[j][i][2][4] =
dt * 2.0 *
(tx1 * (c34 - c1345) + ty1 * (r43 * c34 - c1345) + tz1 * (c34 - c1345)) *
tmp2 * u[k][j][i][2];
d[j][i][3][4] =
dt * 2.0 *
(tx1 * (c34 - c1345) + ty1 * (c34 - c1345) + tz1 * (r43 * c34 - c1345)) *
tmp2 * u[k][j][i][3];
d[j][i][4][4] = 1.0 + dt * 2.0 * (tx1 + ty1 + tz1) * c1345 * tmp1 +
dt * 2.0 * (tx1 * dx5 + ty1 * dy5 + tz1 * dz5);
/*
* ---------------------------------------------------------------------
* form the first block sub-diagonal
* ---------------------------------------------------------------------
*/
tmp1 = rho_i[k][j][i + 1];
tmp2 = tmp1 * tmp1;
tmp3 = tmp1 * tmp2;
a[j][i][0][0] = -dt * tx1 * dx1;
a[j][i][1][0] = dt * tx2;
a[j][i][2][0] = 0.0;
a[j][i][3][0] = 0.0;
a[j][i][4][0] = 0.0;
a[j][i][0][1] =
dt * tx2 *
(-(u[k][j][i + 1][1] * tmp1) * (u[k][j][i + 1][1] * tmp1) +
C2 * qs[k][j][i + 1] * tmp1) -
dt * tx1 * (-r43 * c34 * tmp2 * u[k][j][i + 1][1]);
a[j][i][1][1] = dt * tx2 * ((2.0 - C2) * (u[k][j][i + 1][1] * tmp1)) -
dt * tx1 * (r43 * c34 * tmp1) - dt * tx1 * dx2;
a[j][i][2][1] = dt * tx2 * (-C2 * (u[k][j][i + 1][2] * tmp1));
a[j][i][3][1] = dt * tx2 * (-C2 * (u[k][j][i + 1][3] * tmp1));
a[j][i][4][1] = dt * tx2 * C2;
a[j][i][0][2] = dt * tx2 * (-(u[k][j][i + 1][1] * u[k][j][i + 1][2]) * tmp2) -
dt * tx1 * (-c34 * tmp2 * u[k][j][i + 1][2]);
a[j][i][1][2] = dt * tx2 * (u[k][j][i + 1][2] * tmp1);
a[j][i][2][2] = dt * tx2 * (u[k][j][i + 1][1] * tmp1) -
dt * tx1 * (c34 * tmp1) - dt * tx1 * dx3;
a[j][i][3][2] = 0.0;
a[j][i][4][2] = 0.0;
a[j][i][0][3] = dt * tx2 * (-(u[k][j][i + 1][1] * u[k][j][i + 1][3]) * tmp2) -
dt * tx1 * (-c34 * tmp2 * u[k][j][i + 1][3]);
a[j][i][1][3] = dt * tx2 * (u[k][j][i + 1][3] * tmp1);
a[j][i][2][3] = 0.0;
a[j][i][3][3] = dt * tx2 * (u[k][j][i + 1][1] * tmp1) -
dt * tx1 * (c34 * tmp1) - dt * tx1 * dx4;
a[j][i][4][3] = 0.0;
a[j][i][0][4] =
dt * tx2 *
((C2 * 2.0 * qs[k][j][i + 1] - C1 * u[k][j][i + 1][4]) *
(u[k][j][i + 1][1] * tmp2)) -
dt * tx1 *
(-(r43 * c34 - c1345) * tmp3 *
(u[k][j][i + 1][1] * u[k][j][i + 1][1]) -
(c34 - c1345) * tmp3 * (u[k][j][i + 1][2] * u[k][j][i + 1][2]) -
(c34 - c1345) * tmp3 * (u[k][j][i + 1][3] * u[k][j][i + 1][3]) -
c1345 * tmp2 * u[k][j][i + 1][4]);
a[j][i][1][4] = dt * tx2 *
(C1 * (u[k][j][i + 1][4] * tmp1) -
C2 * (u[k][j][i + 1][1] * u[k][j][i + 1][1] * tmp2 +
qs[k][j][i + 1] * tmp1)) -
dt * tx1 * (r43 * c34 - c1345) * tmp2 * u[k][j][i + 1][1];
a[j][i][2][4] =
dt * tx2 * (-C2 * (u[k][j][i + 1][2] * u[k][j][i + 1][1]) * tmp2) -
dt * tx1 * (c34 - c1345) * tmp2 * u[k][j][i + 1][2];
a[j][i][3][4] =
dt * tx2 * (-C2 * (u[k][j][i + 1][3] * u[k][j][i + 1][1]) * tmp2) -
dt * tx1 * (c34 - c1345) * tmp2 * u[k][j][i + 1][3];
a[j][i][4][4] = dt * tx2 * (C1 * (u[k][j][i + 1][1] * tmp1)) -
dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5;
/*
* ---------------------------------------------------------------------
* form the second block sub-diagonal
* ---------------------------------------------------------------------
*/
tmp1 = rho_i[k][j + 1][i];
tmp2 = tmp1 * tmp1;
tmp3 = tmp1 * tmp2;
b[j][i][0][0] = -dt * ty1 * dy1;
b[j][i][1][0] = 0.0;
b[j][i][2][0] = dt * ty2;
b[j][i][3][0] = 0.0;
b[j][i][4][0] = 0.0;
b[j][i][0][1] = dt * ty2 * (-(u[k][j + 1][i][1] * u[k][j + 1][i][2]) * tmp2) -
dt * ty1 * (-c34 * tmp2 * u[k][j + 1][i][1]);
b[j][i][1][1] = dt * ty2 * (u[k][j + 1][i][2] * tmp1) -
dt * ty1 * (c34 * tmp1) - dt * ty1 * dy2;
b[j][i][2][1] = dt * ty2 * (u[k][j + 1][i][1] * tmp1);
b[j][i][3][1] = 0.0;
b[j][i][4][1] = 0.0;
b[j][i][0][2] =
dt * ty2 *
(-(u[k][j + 1][i][2] * tmp1) * (u[k][j + 1][i][2] * tmp1) +
C2 * (qs[k][j + 1][i] * tmp1)) -
dt * ty1 * (-r43 * c34 * tmp2 * u[k][j + 1][i][2]);
b[j][i][1][2] = dt * ty2 * (-C2 * (u[k][j + 1][i][1] * tmp1));
b[j][i][2][2] = dt * ty2 * ((2.0 - C2) * (u[k][j + 1][i][2] * tmp1)) -
dt * ty1 * (r43 * c34 * tmp1) - dt * ty1 * dy3;
b[j][i][3][2] = dt * ty2 * (-C2 * (u[k][j + 1][i][3] * tmp1));
b[j][i][4][2] = dt * ty2 * C2;
b[j][i][0][3] = dt * ty2 * (-(u[k][j + 1][i][2] * u[k][j + 1][i][3]) * tmp2) -
dt * ty1 * (-c34 * tmp2 * u[k][j + 1][i][3]);
b[j][i][1][3] = 0.0;
b[j][i][2][3] = dt * ty2 * (u[k][j + 1][i][3] * tmp1);
b[j][i][3][3] = dt * ty2 * (u[k][j + 1][i][2] * tmp1) -
dt * ty1 * (c34 * tmp1) - dt * ty1 * dy4;
b[j][i][4][3] = 0.0;
b[j][i][0][4] =
dt * ty2 *
((C2 * 2.0 * qs[k][j + 1][i] - C1 * u[k][j + 1][i][4]) *
(u[k][j + 1][i][2] * tmp2)) -
dt * ty1 *
(-(c34 - c1345) * tmp3 * (u[k][j + 1][i][1] * u[k][j + 1][i][1]) -
(r43 * c34 - c1345) * tmp3 *
(u[k][j + 1][i][2] * u[k][j + 1][i][2]) -
(c34 - c1345) * tmp3 * (u[k][j + 1][i][3] * u[k][j + 1][i][3]) -
c1345 * tmp2 * u[k][j + 1][i][4]);
b[j][i][1][4] =
dt * ty2 * (-C2 * (u[k][j + 1][i][1] * u[k][j + 1][i][2]) * tmp2) -
dt * ty1 * (c34 - c1345) * tmp2 * u[k][j + 1][i][1];
b[j][i][2][4] = dt * ty2 *
(C1 * (u[k][j + 1][i][4] * tmp1) -
C2 * (qs[k][j + 1][i] * tmp1 +
u[k][j + 1][i][2] * u[k][j + 1][i][2] * tmp2)) -
dt * ty1 * (r43 * c34 - c1345) * tmp2 * u[k][j + 1][i][2];
b[j][i][3][4] =
dt * ty2 * (-C2 * (u[k][j + 1][i][2] * u[k][j + 1][i][3]) * tmp2) -
dt * ty1 * (c34 - c1345) * tmp2 * u[k][j + 1][i][3];
b[j][i][4][4] = dt * ty2 * (C1 * (u[k][j + 1][i][2] * tmp1)) -
dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5;
/*
* ---------------------------------------------------------------------
* form the third block sub-diagonal
* ---------------------------------------------------------------------
*/
tmp1 = rho_i[k + 1][j][i];
tmp2 = tmp1 * tmp1;
tmp3 = tmp1 * tmp2;
c[j][i][0][0] = -dt * tz1 * dz1;
c[j][i][1][0] = 0.0;
c[j][i][2][0] = 0.0;
c[j][i][3][0] = dt * tz2;
c[j][i][4][0] = 0.0;
c[j][i][0][1] = dt * tz2 * (-(u[k + 1][j][i][1] * u[k + 1][j][i][3]) * tmp2) -
dt * tz1 * (-c34 * tmp2 * u[k + 1][j][i][1]);
c[j][i][1][1] = dt * tz2 * (u[k + 1][j][i][3] * tmp1) -
dt * tz1 * c34 * tmp1 - dt * tz1 * dz2;
c[j][i][2][1] = 0.0;
c[j][i][3][1] = dt * tz2 * (u[k + 1][j][i][1] * tmp1);
c[j][i][4][1] = 0.0;
c[j][i][0][2] = dt * tz2 * (-(u[k + 1][j][i][2] * u[k + 1][j][i][3]) * tmp2) -
dt * tz1 * (-c34 * tmp2 * u[k + 1][j][i][2]);
c[j][i][1][2] = 0.0;
c[j][i][2][2] = dt * tz2 * (u[k + 1][j][i][3] * tmp1) -
dt * tz1 * (c34 * tmp1) - dt * tz1 * dz3;
c[j][i][3][2] = dt * tz2 * (u[k + 1][j][i][2] * tmp1);
c[j][i][4][2] = 0.0;
c[j][i][0][3] =
dt * tz2 *
(-(u[k + 1][j][i][3] * tmp1) * (u[k + 1][j][i][3] * tmp1) +
C2 * (qs[k + 1][j][i] * tmp1)) -
dt * tz1 * (-r43 * c34 * tmp2 * u[k + 1][j][i][3]);
c[j][i][1][3] = dt * tz2 * (-C2 * (u[k + 1][j][i][1] * tmp1));
c[j][i][2][3] = dt * tz2 * (-C2 * (u[k + 1][j][i][2] * tmp1));
c[j][i][3][3] = dt * tz2 * (2.0 - C2) * (u[k + 1][j][i][3] * tmp1) -
dt * tz1 * (r43 * c34 * tmp1) - dt * tz1 * dz4;
c[j][i][4][3] = dt * tz2 * C2;
c[j][i][0][4] =
dt * tz2 *
((C2 * 2.0 * qs[k + 1][j][i] - C1 * u[k + 1][j][i][4]) *
(u[k + 1][j][i][3] * tmp2)) -
dt * tz1 *
(-(c34 - c1345) * tmp3 * (u[k + 1][j][i][1] * u[k + 1][j][i][1]) -
(c34 - c1345) * tmp3 * (u[k + 1][j][i][2] * u[k + 1][j][i][2]) -
(r43 * c34 - c1345) * tmp3 *
(u[k + 1][j][i][3] * u[k + 1][j][i][3]) -
c1345 * tmp2 * u[k + 1][j][i][4]);
c[j][i][1][4] =
dt * tz2 * (-C2 * (u[k + 1][j][i][1] * u[k + 1][j][i][3]) * tmp2) -
dt * tz1 * (c34 - c1345) * tmp2 * u[k + 1][j][i][1];
c[j][i][2][4] =
dt * tz2 * (-C2 * (u[k + 1][j][i][2] * u[k + 1][j][i][3]) * tmp2) -
dt * tz1 * (c34 - c1345) * tmp2 * u[k + 1][j][i][2];
c[j][i][3][4] = dt * tz2 *
(C1 * (u[k + 1][j][i][4] * tmp1) -
C2 * (qs[k + 1][j][i] * tmp1 +
u[k + 1][j][i][3] * u[k + 1][j][i][3] * tmp2)) -
dt * tz1 * (r43 * c34 - c1345) * tmp2 * u[k + 1][j][i][3];
c[j][i][4][4] = dt * tz2 * (C1 * (u[k + 1][j][i][3] * tmp1)) -
dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5;
}
}
}
/*
* ---------------------------------------------------------------------
* to compute the l2-norm of vector v.
* ---------------------------------------------------------------------
* to improve cache performance, second two dimensions padded by 1
* for even number sizes only. Only needed in v.
* ---------------------------------------------------------------------
*/
void
l2norm(int nx0, int ny0, int nz0, int ist, int iend, int jst, int jend,
double v[][ISIZ2 / 2 * 2 + 1][ISIZ1 / 2 * 2 + 1][5], double sum[5])
{
/*
* ---------------------------------------------------------------------
* local variables
* ---------------------------------------------------------------------
*/
int i, j, k, m;
double sum0 = 0.0, sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0;
#pragma omp single
for(m = 0; m < 5; m++)
{
sum[m] = 0.0;
}
#pragma omp for nowait
for(k = 1; k < nz0 - 1; k++)
{
for(j = jst; j < jend; j++)
{
for(i = ist; i < iend; i++)
{
sum0 = sum0 + v[i][j][k][0] * v[i][j][k][0];
sum1 = sum1 + v[i][j][k][1] * v[i][j][k][1];
sum2 = sum2 + v[i][j][k][2] * v[i][j][k][2];
sum3 = sum3 + v[i][j][k][3] * v[i][j][k][3];
sum4 = sum4 + v[i][j][k][4] * v[i][j][k][4];
}
}
}
#pragma omp critical
{
sum[0] += sum0;
sum[1] += sum1;
sum[2] += sum2;
sum[3] += sum3;
sum[4] += sum4;
}
#pragma omp barrier
#pragma omp single
for(m = 0; m < 5; m++)
{
sum[m] = sqrt(sum[m] / ((nx0 - 2) * (ny0 - 2) * (nz0 - 2)));
}
}
void
pintgr()
{
/*
* ---------------------------------------------------------------------
* local variables
* ---------------------------------------------------------------------
*/
int i, j, k;
int ibeg, ifin, ifin1;
int jbeg, jfin, jfin1;
double phi1[ISIZ3 + 2][ISIZ2 + 2];
double phi2[ISIZ3 + 2][ISIZ2 + 2];
double frc1, frc2, frc3;
/*
* ---------------------------------------------------------------------
* set up the sub-domains for integeration in each processor
* ---------------------------------------------------------------------
*/
ibeg = ii1;
ifin = ii2;
jbeg = ji1;
jfin = ji2;
ifin1 = ifin - 1;
jfin1 = jfin - 1;
/*
* ---------------------------------------------------------------------
* initialize
* ---------------------------------------------------------------------
*/
for(i = 0; i <= ISIZ2 + 1; i++)
{
for(k = 0; k <= ISIZ3 + 1; k++)
{
phi1[k][i] = 0.0;
phi2[k][i] = 0.0;
}
}
for(j = jbeg; j < jfin; j++)
{
for(i = ibeg; i < ifin; i++)
{
k = ki1;
phi1[j][i] = C2 * (u[k][j][i][4] - 0.50 *
(u[k][j][i][1] * u[k][j][i][1] +
u[k][j][i][2] * u[k][j][i][2] +
u[k][j][i][3] * u[k][j][i][3]) /
u[k][j][i][0]);
k = ki2 - 1;
phi2[j][i] = C2 * (u[k][j][i][4] - 0.50 *
(u[k][j][i][1] * u[k][j][i][1] +
u[k][j][i][2] * u[k][j][i][2] +
u[k][j][i][3] * u[k][j][i][3]) /
u[k][j][i][0]);
}
}
frc1 = 0.0;
for(j = jbeg; j < jfin1; j++)
{
for(i = ibeg; i < ifin1; i++)
{
frc1 = frc1 +
(phi1[j][i] + phi1[j][i + 1] + phi1[j + 1][i] + phi1[j + 1][i + 1] +
phi2[j][i] + phi2[j][i + 1] + phi2[j + 1][i] + phi2[j + 1][i + 1]);
}
}
frc1 = dxi * deta * frc1;
/*
* ---------------------------------------------------------------------
* initialize
* ---------------------------------------------------------------------
*/
for(i = 0; i <= ISIZ2 + 1; i++)
{
for(k = 0; k <= ISIZ3 + 1; k++)
{
phi1[k][i] = 0.0;
phi2[k][i] = 0.0;
}
}
if(jbeg == ji1)
{
for(k = ki1; k < ki2; k++)
{
for(i = ibeg; i < ifin; i++)
{
phi1[k][i] =
C2 * (u[k][jbeg][i][4] - 0.50 *
(u[k][jbeg][i][1] * u[k][jbeg][i][1] +
u[k][jbeg][i][2] * u[k][jbeg][i][2] +
u[k][jbeg][i][3] * u[k][jbeg][i][3]) /
u[k][jbeg][i][0]);
}
}
}
if(jfin == ji2)
{
for(k = ki1; k < ki2; k++)
{
for(i = ibeg; i < ifin; i++)
{
phi2[k][i] = C2 * (u[k][jfin - 1][i][4] -
0.50 *
(u[k][jfin - 1][i][1] * u[k][jfin - 1][i][1] +
u[k][jfin - 1][i][2] * u[k][jfin - 1][i][2] +
u[k][jfin - 1][i][3] * u[k][jfin - 1][i][3]) /
u[k][jfin - 1][i][0]);
}
}
}
frc2 = 0.0;
for(k = ki1; k < ki2 - 1; k++)
{
for(i = ibeg; i < ifin1; i++)
{
frc2 = frc2 +
(phi1[k][i] + phi1[k][i + 1] + phi1[k + 1][i] + phi1[k + 1][i + 1] +
phi2[k][i] + phi2[k][i + 1] + phi2[k + 1][i] + phi2[k + 1][i + 1]);
}
}
frc2 = dxi * dzeta * frc2;
/*
* ---------------------------------------------------------------------
* initialize
* ---------------------------------------------------------------------
*/
for(i = 0; i <= ISIZ2 + 1; i++)
{
for(k = 0; k <= ISIZ3 + 1; k++)
{
phi1[k][i] = 0.0;
phi2[k][i] = 0.0;
}
}
if(ibeg == ii1)
{
for(k = ki1; k < ki2; k++)
{
for(j = jbeg; j < jfin; j++)
{
phi1[k][j] =
C2 * (u[k][j][ibeg][4] - 0.50 *
(u[k][j][ibeg][1] * u[k][j][ibeg][1] +
u[k][j][ibeg][2] * u[k][j][ibeg][2] +
u[k][j][ibeg][3] * u[k][j][ibeg][3]) /
u[k][j][ibeg][0]);
}
}
}
if(ifin == ii2)
{
for(k = ki1; k < ki2; k++)
{
for(j = jbeg; j < jfin; j++)
{
phi2[k][j] = C2 * (u[k][j][ifin - 1][4] -
0.50 *
(u[k][j][ifin - 1][1] * u[k][j][ifin - 1][1] +
u[k][j][ifin - 1][2] * u[k][j][ifin - 1][2] +
u[k][j][ifin - 1][3] * u[k][j][ifin - 1][3]) /
u[k][j][ifin - 1][0]);
}
}
}
frc3 = 0.0;
for(k = ki1; k < ki2 - 1; k++)
{
for(j = jbeg; j < jfin1; j++)
{
frc3 = frc3 +
(phi1[k][j] + phi1[k][j + 1] + phi1[k + 1][j] + phi1[k + 1][j + 1] +
phi2[k][j] + phi2[k][j + 1] + phi2[k + 1][j] + phi2[k + 1][j + 1]);
}
}
frc3 = deta * dzeta * frc3;
frc = 0.25 * (frc1 + frc2 + frc3);
}
void
read_input()
{
/*
* ---------------------------------------------------------------------
* if input file does not exist, it uses defaults
* ipr = 1 for detailed progress output
* inorm = how often the norm is printed (once every inorm iterations)
* itmax = number of pseudo time steps
* dt = time step
* omega 1 over-relaxation factor for SSOR
* tolrsd = steady state residual tolerance levels
* nx, ny, nz = number of grid points in x, y, z directions
* ---------------------------------------------------------------------
*/
FILE* fp;
int avoid_warning = 0;
if((fp = fopen("inputlu.data", "r")) != nullptr)
{
printf("Reading from input file inputlu.data\n");
while(fgetc(fp) != '\n')
;
while(fgetc(fp) != '\n')
;
avoid_warning = fscanf(fp, "%d%d", &ipr, &inorm);
while(fgetc(fp) != '\n')
;
while(fgetc(fp) != '\n')
;
while(fgetc(fp) != '\n')
;
avoid_warning = fscanf(fp, "%d", &itmax);
while(fgetc(fp) != '\n')
;
while(fgetc(fp) != '\n')
;
while(fgetc(fp) != '\n')
;
avoid_warning = fscanf(fp, "%lf", &dt);
while(fgetc(fp) != '\n')
;
while(fgetc(fp) != '\n')
;
while(fgetc(fp) != '\n')
;
avoid_warning = fscanf(fp, "%lf", &omega);
while(fgetc(fp) != '\n')
;
while(fgetc(fp) != '\n')
;
while(fgetc(fp) != '\n')
;
avoid_warning = fscanf(fp, "%lf%lf%lf%lf%lf", &tolrsd[0], &tolrsd[1], &tolrsd[2],
&tolrsd[3], &tolrsd[4]);
while(fgetc(fp) != '\n')
;
while(fgetc(fp) != '\n')
;
avoid_warning = fscanf(fp, "%d%d%d", &nx0, &ny0, &nz0);
fclose(fp);
}
else
{
ipr = IPR_DEFAULT;
inorm = INORM_DEFAULT;
itmax = ITMAX_DEFAULT;
dt = DT_DEFAULT;
omega = OMEGA_DEFAULT;
tolrsd[0] = TOLRSD1_DEF;
tolrsd[1] = TOLRSD2_DEF;
tolrsd[2] = TOLRSD3_DEF;
tolrsd[3] = TOLRSD4_DEF;
tolrsd[4] = TOLRSD5_DEF;
nx0 = ISIZ1;
ny0 = ISIZ2;
nz0 = ISIZ3;
}
(void) avoid_warning;
/*
* ---------------------------------------------------------------------
* check problem size
* ---------------------------------------------------------------------
*/
if((nx0 < 4) || (ny0 < 4) || (nz0 < 4))
{
printf(" PROBLEM SIZE IS TOO SMALL - \n"
" SET EACH OF NX, NY AND NZ AT LEAST EQUAL TO 5\n");
exit(EXIT_FAILURE);
}
if((nx0 > ISIZ1) || (ny0 > ISIZ2) || (nz0 > ISIZ3))
{
printf(" PROBLEM SIZE IS TOO LARGE - \n"
" NX, NY AND NZ SHOULD BE EQUAL TO \n"
" ISIZ1, ISIZ2 AND ISIZ3 RESPECTIVELY\n");
exit(EXIT_FAILURE);
}
printf("\n\n NAS Parallel Benchmarks 4.1 Parallel C++ version with OpenMP - LU "
"Benchmark\n\n");
printf(" Size: %4dx%4dx%4d\n", nx0, ny0, nz0);
printf(" Iterations: %4d\n", itmax);
printf("\n");
}
/*
* ---------------------------------------------------------------------
* compute the right hand sides
* ---------------------------------------------------------------------
*/
void
rhs()
{
/*
* ---------------------------------------------------------------------
* local variables
* ---------------------------------------------------------------------
*/
int i, j, k, m;
double q;
double tmp, utmp[ISIZ3][6], rtmp[ISIZ3][5];
double u21, u31, u41;
double u21i, u31i, u41i, u51i;
double u21j, u31j, u41j, u51j;
double u21k, u31k, u41k, u51k;
double u21im1, u31im1, u41im1, u51im1;
double u21jm1, u31jm1, u41jm1, u51jm1;
double u21km1, u31km1, u41km1, u51km1;
double flux[ISIZ1][5];
if(timeron)
{
timer_start(T_RHS);
}
#pragma omp for
for(k = 0; k < nz; k++)
{
for(j = 0; j < ny; j++)
{
for(i = 0; i < nx; i++)
{
for(m = 0; m < 5; m++)
{
rsd[k][j][i][m] = -frct[k][j][i][m];
}
tmp = 1.0 / u[k][j][i][0];
rho_i[k][j][i] = tmp;
qs[k][j][i] =
0.50 *
(u[k][j][i][1] * u[k][j][i][1] + u[k][j][i][2] * u[k][j][i][2] +
u[k][j][i][3] * u[k][j][i][3]) *
tmp;
}
}
}
if(timeron)
{
timer_start(T_RHSX);
}
/*
* ---------------------------------------------------------------------
* xi-direction flux differences
* ---------------------------------------------------------------------
*/
#pragma omp for
for(k = 1; k < nz - 1; k++)
{
for(j = jst; j < jend; j++)
{
for(i = 0; i < nx; i++)
{
flux[i][0] = u[k][j][i][1];
u21 = u[k][j][i][1] * rho_i[k][j][i];
q = qs[k][j][i];
flux[i][1] = u[k][j][i][1] * u21 + C2 * (u[k][j][i][4] - q);
flux[i][2] = u[k][j][i][2] * u21;
flux[i][3] = u[k][j][i][3] * u21;
flux[i][4] = (C1 * u[k][j][i][4] - C2 * q) * u21;
}
for(i = ist; i < iend; i++)
{
for(m = 0; m < 5; m++)
{
rsd[k][j][i][m] =
rsd[k][j][i][m] - tx2 * (flux[i + 1][m] - flux[i - 1][m]);
}
}
for(i = ist; i < nx; i++)
{
tmp = rho_i[k][j][i];
u21i = tmp * u[k][j][i][1];
u31i = tmp * u[k][j][i][2];
u41i = tmp * u[k][j][i][3];
u51i = tmp * u[k][j][i][4];
tmp = rho_i[k][j][i - 1];
u21im1 = tmp * u[k][j][i - 1][1];
u31im1 = tmp * u[k][j][i - 1][2];
u41im1 = tmp * u[k][j][i - 1][3];
u51im1 = tmp * u[k][j][i - 1][4];
flux[i][1] = (4.0 / 3.0) * tx3 * (u21i - u21im1);
flux[i][2] = tx3 * (u31i - u31im1);
flux[i][3] = tx3 * (u41i - u41im1);
flux[i][4] = 0.50 * (1.0 - C1 * C5) * tx3 *
((u21i * u21i + u31i * u31i + u41i * u41i) -
(u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1)) +
(1.0 / 6.0) * tx3 * (u21i * u21i - u21im1 * u21im1) +
C1 * C5 * tx3 * (u51i - u51im1);
}
for(i = ist; i < iend; i++)
{
rsd[k][j][i][0] =
rsd[k][j][i][0] +
dx1 * tx1 *
(u[k][j][i - 1][0] - 2.0 * u[k][j][i][0] + u[k][j][i + 1][0]);
rsd[k][j][i][1] =
rsd[k][j][i][1] + tx3 * C3 * C4 * (flux[i + 1][1] - flux[i][1]) +
dx2 * tx1 *
(u[k][j][i - 1][1] - 2.0 * u[k][j][i][1] + u[k][j][i + 1][1]);
rsd[k][j][i][2] =
rsd[k][j][i][2] + tx3 * C3 * C4 * (flux[i + 1][2] - flux[i][2]) +
dx3 * tx1 *
(u[k][j][i - 1][2] - 2.0 * u[k][j][i][2] + u[k][j][i + 1][2]);
rsd[k][j][i][3] =
rsd[k][j][i][3] + tx3 * C3 * C4 * (flux[i + 1][3] - flux[i][3]) +
dx4 * tx1 *
(u[k][j][i - 1][3] - 2.0 * u[k][j][i][3] + u[k][j][i + 1][3]);
rsd[k][j][i][4] =
rsd[k][j][i][4] + tx3 * C3 * C4 * (flux[i + 1][4] - flux[i][4]) +
dx5 * tx1 *
(u[k][j][i - 1][4] - 2.0 * u[k][j][i][4] + u[k][j][i + 1][4]);
}
/*
* ---------------------------------------------------------------------
* fourth-order dissipation
* ---------------------------------------------------------------------
*/
for(m = 0; m < 5; m++)
{
rsd[k][j][1][m] =
rsd[k][j][1][m] -
dssp * (+5.0 * u[k][j][1][m] - 4.0 * u[k][j][2][m] + u[k][j][3][m]);
rsd[k][j][2][m] =
rsd[k][j][2][m] - dssp * (-4.0 * u[k][j][1][m] + 6.0 * u[k][j][2][m] -
4.0 * u[k][j][3][m] + u[k][j][4][m]);
}
for(i = 3; i < nx - 3; i++)
{
for(m = 0; m < 5; m++)
{
rsd[k][j][i][m] =
rsd[k][j][i][m] -
dssp * (u[k][j][i - 2][m] - 4.0 * u[k][j][i - 1][m] +
6.0 * u[k][j][i][m] - 4.0 * u[k][j][i + 1][m] +
u[k][j][i + 2][m]);
}
}
for(m = 0; m < 5; m++)
{
rsd[k][j][nx - 3][m] =
rsd[k][j][nx - 3][m] -
dssp * (u[k][j][nx - 5][m] - 4.0 * u[k][j][nx - 4][m] +
6.0 * u[k][j][nx - 3][m] - 4.0 * u[k][j][nx - 2][m]);
rsd[k][j][nx - 2][m] =
rsd[k][j][nx - 2][m] -
dssp * (u[k][j][nx - 4][m] - 4.0 * u[k][j][nx - 3][m] +
5.0 * u[k][j][nx - 2][m]);
}
}
}
if(timeron)
{
timer_stop(T_RHSX);
}
if(timeron)
{
timer_start(T_RHSY);
}
/*
* ---------------------------------------------------------------------
* eta-direction flux differences
* ---------------------------------------------------------------------
*/
#pragma omp for
for(k = 1; k < nz - 1; k++)
{
for(i = ist; i < iend; i++)
{
for(j = 0; j < ny; j++)
{
flux[j][0] = u[k][j][i][2];
u31 = u[k][j][i][2] * rho_i[k][j][i];
q = qs[k][j][i];
flux[j][1] = u[k][j][i][1] * u31;
flux[j][2] = u[k][j][i][2] * u31 + C2 * (u[k][j][i][4] - q);
flux[j][3] = u[k][j][i][3] * u31;
flux[j][4] = (C1 * u[k][j][i][4] - C2 * q) * u31;
}
for(j = jst; j < jend; j++)
{
for(m = 0; m < 5; m++)
{
rsd[k][j][i][m] =
rsd[k][j][i][m] - ty2 * (flux[j + 1][m] - flux[j - 1][m]);
}
}
for(j = jst; j < ny; j++)
{
tmp = rho_i[k][j][i];
u21j = tmp * u[k][j][i][1];
u31j = tmp * u[k][j][i][2];
u41j = tmp * u[k][j][i][3];
u51j = tmp * u[k][j][i][4];
tmp = rho_i[k][j - 1][i];
u21jm1 = tmp * u[k][j - 1][i][1];
u31jm1 = tmp * u[k][j - 1][i][2];
u41jm1 = tmp * u[k][j - 1][i][3];
u51jm1 = tmp * u[k][j - 1][i][4];
flux[j][1] = ty3 * (u21j - u21jm1);
flux[j][2] = (4.0 / 3.0) * ty3 * (u31j - u31jm1);
flux[j][3] = ty3 * (u41j - u41jm1);
flux[j][4] = 0.50 * (1.0 - C1 * C5) * ty3 *
((u21j * u21j + u31j * u31j + u41j * u41j) -
(u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1)) +
(1.0 / 6.0) * ty3 * (u31j * u31j - u31jm1 * u31jm1) +
C1 * C5 * ty3 * (u51j - u51jm1);
}
for(j = jst; j < jend; j++)
{
rsd[k][j][i][0] =
rsd[k][j][i][0] +
dy1 * ty1 *
(u[k][j - 1][i][0] - 2.0 * u[k][j][i][0] + u[k][j + 1][i][0]);
rsd[k][j][i][1] =
rsd[k][j][i][1] + ty3 * C3 * C4 * (flux[j + 1][1] - flux[j][1]) +
dy2 * ty1 *
(u[k][j - 1][i][1] - 2.0 * u[k][j][i][1] + u[k][j + 1][i][1]);
rsd[k][j][i][2] =
rsd[k][j][i][2] + ty3 * C3 * C4 * (flux[j + 1][2] - flux[j][2]) +
dy3 * ty1 *
(u[k][j - 1][i][2] - 2.0 * u[k][j][i][2] + u[k][j + 1][i][2]);
rsd[k][j][i][3] =
rsd[k][j][i][3] + ty3 * C3 * C4 * (flux[j + 1][3] - flux[j][3]) +
dy4 * ty1 *
(u[k][j - 1][i][3] - 2.0 * u[k][j][i][3] + u[k][j + 1][i][3]);
rsd[k][j][i][4] =
rsd[k][j][i][4] + ty3 * C3 * C4 * (flux[j + 1][4] - flux[j][4]) +
dy5 * ty1 *
(u[k][j - 1][i][4] - 2.0 * u[k][j][i][4] + u[k][j + 1][i][4]);
}
}
/*
* ---------------------------------------------------------------------
* fourth-order dissipation
* ---------------------------------------------------------------------
*/
for(i = ist; i < iend; i++)
{
for(m = 0; m < 5; m++)
{
rsd[k][1][i][m] =
rsd[k][1][i][m] -
dssp * (+5.0 * u[k][1][i][m] - 4.0 * u[k][2][i][m] + u[k][3][i][m]);
rsd[k][2][i][m] =
rsd[k][2][i][m] - dssp * (-4.0 * u[k][1][i][m] + 6.0 * u[k][2][i][m] -
4.0 * u[k][3][i][m] + u[k][4][i][m]);
}
}
for(j = 3; j < ny - 3; j++)
{
for(i = ist; i < iend; i++)
{
for(m = 0; m < 5; m++)
{
rsd[k][j][i][m] =
rsd[k][j][i][m] -
dssp * (u[k][j - 2][i][m] - 4.0 * u[k][j - 1][i][m] +
6.0 * u[k][j][i][m] - 4.0 * u[k][j + 1][i][m] +
u[k][j + 2][i][m]);
}
}
}
for(i = ist; i < iend; i++)
{
for(m = 0; m < 5; m++)
{
rsd[k][ny - 3][i][m] =
rsd[k][ny - 3][i][m] -
dssp * (u[k][ny - 5][i][m] - 4.0 * u[k][ny - 4][i][m] +
6.0 * u[k][ny - 3][i][m] - 4.0 * u[k][ny - 2][i][m]);
rsd[k][ny - 2][i][m] =
rsd[k][ny - 2][i][m] -
dssp * (u[k][ny - 4][i][m] - 4.0 * u[k][ny - 3][i][m] +
5.0 * u[k][ny - 2][i][m]);
}
}
}
if(timeron)
{
timer_stop(T_RHSY);
}
if(timeron)
{
timer_start(T_RHSZ);
}
/*
* ---------------------------------------------------------------------
* zeta-direction flux differences
* ---------------------------------------------------------------------
*/
#pragma omp for
for(j = jst; j < jend; j++)
{
for(i = ist; i < iend; i++)
{
for(k = 0; k < nz; k++)
{
utmp[k][0] = u[k][j][i][0];
utmp[k][1] = u[k][j][i][1];
utmp[k][2] = u[k][j][i][2];
utmp[k][3] = u[k][j][i][3];
utmp[k][4] = u[k][j][i][4];
utmp[k][5] = rho_i[k][j][i];
}
for(k = 0; k < nz; k++)
{
flux[k][0] = utmp[k][3];
u41 = utmp[k][3] * utmp[k][5];
q = qs[k][j][i];
flux[k][1] = utmp[k][1] * u41;
flux[k][2] = utmp[k][2] * u41;
flux[k][3] = utmp[k][3] * u41 + C2 * (utmp[k][4] - q);
flux[k][4] = (C1 * utmp[k][4] - C2 * q) * u41;
}
for(k = 1; k < nz - 1; k++)
{
for(m = 0; m < 5; m++)
{
rtmp[k][m] =
rsd[k][j][i][m] - tz2 * (flux[k + 1][m] - flux[k - 1][m]);
}
}
for(k = 1; k < nz; k++)
{
tmp = utmp[k][5];
u21k = tmp * utmp[k][1];
u31k = tmp * utmp[k][2];
u41k = tmp * utmp[k][3];
u51k = tmp * utmp[k][4];
tmp = utmp[k - 1][5];
u21km1 = tmp * utmp[k - 1][1];
u31km1 = tmp * utmp[k - 1][2];
u41km1 = tmp * utmp[k - 1][3];
u51km1 = tmp * utmp[k - 1][4];
flux[k][1] = tz3 * (u21k - u21km1);
flux[k][2] = tz3 * (u31k - u31km1);
flux[k][3] = (4.0 / 3.0) * tz3 * (u41k - u41km1);
flux[k][4] = 0.50 * (1.0 - C1 * C5) * tz3 *
((u21k * u21k + u31k * u31k + u41k * u41k) -
(u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1)) +
(1.0 / 6.0) * tz3 * (u41k * u41k - u41km1 * u41km1) +
C1 * C5 * tz3 * (u51k - u51km1);
}
for(k = 1; k < nz - 1; k++)
{
rtmp[k][0] =
rtmp[k][0] +
dz1 * tz1 * (utmp[k - 1][0] - 2.0 * utmp[k][0] + utmp[k + 1][0]);
rtmp[k][1] =
rtmp[k][1] + tz3 * C3 * C4 * (flux[k + 1][1] - flux[k][1]) +
dz2 * tz1 * (utmp[k - 1][1] - 2.0 * utmp[k][1] + utmp[k + 1][1]);
rtmp[k][2] =
rtmp[k][2] + tz3 * C3 * C4 * (flux[k + 1][2] - flux[k][2]) +
dz3 * tz1 * (utmp[k - 1][2] - 2.0 * utmp[k][2] + utmp[k + 1][2]);
rtmp[k][3] =
rtmp[k][3] + tz3 * C3 * C4 * (flux[k + 1][3] - flux[k][3]) +
dz4 * tz1 * (utmp[k - 1][3] - 2.0 * utmp[k][3] + utmp[k + 1][3]);
rtmp[k][4] =
rtmp[k][4] + tz3 * C3 * C4 * (flux[k + 1][4] - flux[k][4]) +
dz5 * tz1 * (utmp[k - 1][4] - 2.0 * utmp[k][4] + utmp[k + 1][4]);
}
/*
* ---------------------------------------------------------------------
* fourth-order dissipation
* ---------------------------------------------------------------------
*/
for(m = 0; m < 5; m++)
{
rsd[1][j][i][m] = rtmp[1][m] - dssp * (+5.0 * utmp[1][m] -
4.0 * utmp[2][m] + utmp[3][m]);
rsd[2][j][i][m] =
rtmp[2][m] - dssp * (-4.0 * utmp[1][m] + 6.0 * utmp[2][m] -
4.0 * utmp[3][m] + utmp[4][m]);
}
for(k = 3; k < nz - 3; k++)
{
for(m = 0; m < 5; m++)
{
rsd[k][j][i][m] =
rtmp[k][m] -
dssp * (utmp[k - 2][m] - 4.0 * utmp[k - 1][m] + 6.0 * utmp[k][m] -
4.0 * utmp[k + 1][m] + utmp[k + 2][m]);
}
}
for(m = 0; m < 5; m++)
{
rsd[nz - 3][j][i][m] =
rtmp[nz - 3][m] -
dssp * (utmp[nz - 5][m] - 4.0 * utmp[nz - 4][m] +
6.0 * utmp[nz - 3][m] - 4.0 * utmp[nz - 2][m]);
rsd[nz - 2][j][i][m] =
rtmp[nz - 2][m] - dssp * (utmp[nz - 4][m] - 4.0 * utmp[nz - 3][m] +
5.0 * utmp[nz - 2][m]);
}
}
}
if(timeron)
{
timer_stop(T_RHSZ);
}
if(timeron)
{
timer_stop(T_RHS);
}
}
/*
* ---------------------------------------------------------------------
* set the boundary values of dependent variables
* ---------------------------------------------------------------------
*/
void
setbv()
{
/*
* ---------------------------------------------------------------------
* local variables
* ---------------------------------------------------------------------
*/
int i, j, k, m;
double temp1[5], temp2[5];
/*
* ---------------------------------------------------------------------
* set the dependent variable values along the top and bottom faces
* ---------------------------------------------------------------------
*/
#pragma omp for
for(j = 0; j < ny; j++)
{
for(i = 0; i < nx; i++)
{
exact(i, j, 0, temp1);
exact(i, j, nz - 1, temp2);
for(m = 0; m < 5; m++)
{
u[0][j][i][m] = temp1[m];
u[nz - 1][j][i][m] = temp2[m];
}
}
}
/*
* ---------------------------------------------------------------------
* set the dependent variable values along north and south faces
* ---------------------------------------------------------------------
*/
#pragma omp for
for(k = 0; k < nz; k++)
{
for(i = 0; i < nx; i++)
{
exact(i, 0, k, temp1);
exact(i, ny - 1, k, temp2);
for(m = 0; m < 5; m++)
{
u[k][0][i][m] = temp1[m];
u[k][ny - 1][i][m] = temp2[m];
}
}
}
/*
* ---------------------------------------------------------------------
* set the dependent variable values along east and west faces
* ---------------------------------------------------------------------
*/
#pragma omp for
for(k = 0; k < nz; k++)
{
for(j = 0; j < ny; j++)
{
exact(0, j, k, temp1);
exact(nx - 1, j, k, temp2);
for(m = 0; m < 5; m++)
{
u[k][j][0][m] = temp1[m];
u[k][j][nx - 1][m] = temp2[m];
}
}
}
}
void
setcoeff()
{
/*
* ---------------------------------------------------------------------
* local variables
* ---------------------------------------------------------------------
* set up coefficients
* ---------------------------------------------------------------------
*/
dxi = 1.0 / (nx0 - 1);
deta = 1.0 / (ny0 - 1);
dzeta = 1.0 / (nz0 - 1);
tx1 = 1.0 / (dxi * dxi);
tx2 = 1.0 / (2.0 * dxi);
tx3 = 1.0 / dxi;
ty1 = 1.0 / (deta * deta);
ty2 = 1.0 / (2.0 * deta);
ty3 = 1.0 / deta;
tz1 = 1.0 / (dzeta * dzeta);
tz2 = 1.0 / (2.0 * dzeta);
tz3 = 1.0 / dzeta;
/*
* ---------------------------------------------------------------------
* diffusion coefficients
* ---------------------------------------------------------------------
*/
dx1 = 0.75;
dx2 = dx1;
dx3 = dx1;
dx4 = dx1;
dx5 = dx1;
dy1 = 0.75;
dy2 = dy1;
dy3 = dy1;
dy4 = dy1;
dy5 = dy1;
dz1 = 1.00;
dz2 = dz1;
dz3 = dz1;
dz4 = dz1;
dz5 = dz1;
/*
* ---------------------------------------------------------------------
* fourth difference dissipation
* ---------------------------------------------------------------------
*/
dssp = (max(max(dx1, dy1), dz1)) / 4.0;
/*
* ---------------------------------------------------------------------
* coefficients of the exact solution to the first pde
* ---------------------------------------------------------------------
*/
ce[0][0] = 2.0;
ce[1][0] = 0.0;
ce[2][0] = 0.0;
ce[3][0] = 4.0;
ce[4][0] = 5.0;
ce[5][0] = 3.0;
ce[6][0] = 5.0e-01;
ce[7][0] = 2.0e-02;
ce[8][0] = 1.0e-02;
ce[9][0] = 3.0e-02;
ce[10][0] = 5.0e-01;
ce[11][0] = 4.0e-01;
ce[12][0] = 3.0e-01;
/*
* ---------------------------------------------------------------------
* coefficients of the exact solution to the second pde
* ---------------------------------------------------------------------
*/
ce[0][1] = 1.0;
ce[1][1] = 0.0;
ce[2][1] = 0.0;
ce[3][1] = 0.0;
ce[4][1] = 1.0;
ce[5][1] = 2.0;
ce[6][1] = 3.0;
ce[7][1] = 1.0e-02;
ce[8][1] = 3.0e-02;
ce[9][1] = 2.0e-02;
ce[10][1] = 4.0e-01;
ce[11][1] = 3.0e-01;
ce[12][1] = 5.0e-01;
/*
* ---------------------------------------------------------------------
* coefficients of the exact solution to the third pde
* ---------------------------------------------------------------------
*/
ce[0][2] = 2.0;
ce[1][2] = 2.0;
ce[2][2] = 0.0;
ce[3][2] = 0.0;
ce[4][2] = 0.0;
ce[5][2] = 2.0;
ce[6][2] = 3.0;
ce[7][2] = 4.0e-02;
ce[8][2] = 3.0e-02;
ce[9][2] = 5.0e-02;
ce[10][2] = 3.0e-01;
ce[11][2] = 5.0e-01;
ce[12][2] = 4.0e-01;
/*
* ---------------------------------------------------------------------
* coefficients of the exact solution to the fourth pde
* ---------------------------------------------------------------------
*/
ce[0][3] = 2.0;
ce[1][3] = 2.0;
ce[2][3] = 0.0;
ce[3][3] = 0.0;
ce[4][3] = 0.0;
ce[5][3] = 2.0;
ce[6][3] = 3.0;
ce[7][3] = 3.0e-02;
ce[8][3] = 5.0e-02;
ce[9][3] = 4.0e-02;
ce[10][3] = 2.0e-01;
ce[11][3] = 1.0e-01;
ce[12][3] = 3.0e-01;
/*
* ---------------------------------------------------------------------
* coefficients of the exact solution to the fifth pde
* ---------------------------------------------------------------------
*/
ce[0][4] = 5.0;
ce[1][4] = 4.0;
ce[2][4] = 3.0;
ce[3][4] = 2.0;
ce[4][4] = 1.0e-01;
ce[5][4] = 4.0e-01;
ce[6][4] = 3.0e-01;
ce[7][4] = 5.0e-02;
ce[8][4] = 4.0e-02;
ce[9][4] = 3.0e-02;
ce[10][4] = 1.0e-01;
ce[11][4] = 3.0e-01;
ce[12][4] = 2.0e-01;
}
/*
* ---------------------------------------------------------------------
* set the initial values of independent variables based on tri-linear
* interpolation of boundary values in the computational space.
* ---------------------------------------------------------------------
*/
void
setiv()
{
/*
* ---------------------------------------------------------------------
* local variables
* ---------------------------------------------------------------------
*/
int i, j, k, m;
double xi, eta, zeta;
double pxi, peta, pzeta;
double ue_1jk[5], ue_nx0jk[5], ue_i1k[5];
double ue_iny0k[5], ue_ij1[5], ue_ijnz[5];
#pragma omp for
for(k = 1; k < nz - 1; k++)
{
zeta = ((double) k) / (nz - 1);
for(j = 1; j < ny - 1; j++)
{
eta = ((double) j) / (ny0 - 1);
for(i = 1; i < nx - 1; i++)
{
xi = ((double) i) / (nx0 - 1);
exact(0, j, k, ue_1jk);
exact(nx0 - 1, j, k, ue_nx0jk);
exact(i, 0, k, ue_i1k);
exact(i, ny0 - 1, k, ue_iny0k);
exact(i, j, 0, ue_ij1);
exact(i, j, nz - 1, ue_ijnz);
for(m = 0; m < 5; m++)
{
pxi = (1.0 - xi) * ue_1jk[m] + xi * ue_nx0jk[m];
peta = (1.0 - eta) * ue_i1k[m] + eta * ue_iny0k[m];
pzeta = (1.0 - zeta) * ue_ij1[m] + zeta * ue_ijnz[m];
u[k][j][i][m] = pxi + peta + pzeta - pxi * peta - peta * pzeta -
pzeta * pxi + pxi * peta * pzeta;
}
}
}
}
}
/*
* ---------------------------------------------------------------------
* to perform pseudo-time stepping SSOR iterations
* for five nonlinear pde's.
* ---------------------------------------------------------------------
*/
void
ssor(int niter)
{
/*
* ---------------------------------------------------------------------
* local variables
* ---------------------------------------------------------------------
*/
int i, j, k, m, n;
int istep;
double tmp, tv[ISIZ2 * (ISIZ1 / 2 * 2 + 1) * 5];
double delunm[5];
/*
* ---------------------------------------------------------------------
* begin pseudo-time stepping iterations
* ---------------------------------------------------------------------
*/
tmp = 1.0 / (omega * (2.0 - omega));
/*
* ---------------------------------------------------------------------
* initialize a,b,c,d to zero (guarantees that page tables have been
* formed, if applicable on given architecture, before timestepping).
* ---------------------------------------------------------------------
*/
#pragma omp parallel for private(i, j, n, m)
for(j = 0; j < ISIZ2; j++)
{
for(i = 0; i < ISIZ1; i++)
{
for(n = 0; n < 5; n++)
{
for(m = 0; m < 5; m++)
{
a[j][i][n][m] = 0.0;
b[j][i][n][m] = 0.0;
c[j][i][n][m] = 0.0;
d[j][i][n][m] = 0.0;
}
}
}
}
for(i = 1; i <= T_LAST; i++)
{
timer_clear(i);
}
#pragma omp parallel
{
/*
* ---------------------------------------------------------------------
* compute the steady-state residuals
* ---------------------------------------------------------------------
*/
rhs();
/*
* ---------------------------------------------------------------------
* compute the L2 norms of newton iteration residuals
* ---------------------------------------------------------------------
*/
l2norm(nx0, ny0, nz0, ist, iend, jst, jend, rsd, rsdnm);
} /* end parallel */
for(i = 1; i <= T_LAST; i++)
{
timer_clear(i);
}
timer_start(1);
#pragma omp parallel private(istep, i, j, k, m)
{
/*
* ---------------------------------------------------------------------
* the timestep loop
* ---------------------------------------------------------------------
*/
for(istep = 1; istep <= niter; istep++)
{
if((istep % 20) == 0 || istep == itmax || istep == 1)
{
#pragma omp master
if(niter > 1)
{
printf(" Time step %4d\n", istep);
}
}
/*
* ---------------------------------------------------------------------
* perform SSOR iteration
* ---------------------------------------------------------------------
*/
if(timeron)
{
#pragma omp master
timer_start(T_RHS);
}
#pragma omp for
for(k = 1; k < nz - 1; k++)
{
for(j = jst; j < jend; j++)
{
for(i = ist; i < iend; i++)
{
for(m = 0; m < 5; m++)
{
rsd[k][j][i][m] = dt * rsd[k][j][i][m];
}
}
}
}
if(timeron)
{
#pragma omp master
timer_stop(T_RHS);
}
for(k = 1; k < nz - 1; k++)
{
/*
* ---------------------------------------------------------------------
* form the lower triangular part of the jacobian matrix
* ---------------------------------------------------------------------
*/
if(timeron)
{
#pragma omp master
timer_start(T_JACLD);
}
jacld(k);
if(timeron)
{
#pragma omp master
timer_stop(T_JACLD);
}
/*
* ---------------------------------------------------------------------
* perform the lower triangular solution
* ---------------------------------------------------------------------
*/
if(timeron)
{
#pragma omp master
timer_start(T_BLTS);
}
blts(nx, ny, nz, k, omega, rsd, a, b, c, d, ist, iend, jst, jend, nx0,
ny0);
if(timeron)
{
#pragma omp master
timer_stop(T_BLTS);
}
}
#pragma omp barrier
for(k = nz - 2; k > 0; k--)
{
/*
* ---------------------------------------------------------------------
* form the strictly upper triangular part of the jacobian matrix
* ---------------------------------------------------------------------
*/
if(timeron)
{
#pragma omp master
timer_start(T_JACU);
}
jacu(k);
if(timeron)
{
#pragma omp master
timer_stop(T_JACU);
}
/*
* ---------------------------------------------------------------------
* perform the upper triangular solution
* ---------------------------------------------------------------------
*/
if(timeron)
{
#pragma omp master
timer_start(T_BUTS);
}
buts(nx, ny, nz, k, omega, rsd, tv, d, a, b, c, ist, iend, jst, jend, nx0,
ny0);
if(timeron)
{
#pragma omp master
timer_stop(T_BUTS);
}
}
#pragma omp barrier
/*
* ---------------------------------------------------------------------
* update the variables
* ---------------------------------------------------------------------
*/
if(timeron)
{
#pragma omp master
timer_start(T_ADD);
}
#pragma omp for
for(k = 1; k < nz - 1; k++)
{
for(j = jst; j < jend; j++)
{
for(i = ist; i < iend; i++)
{
for(m = 0; m < 5; m++)
{
u[k][j][i][m] = u[k][j][i][m] + tmp * rsd[k][j][i][m];
}
}
}
}
if(timeron)
{
#pragma omp master
timer_stop(T_ADD);
}
/*
* ---------------------------------------------------------------------
* compute the max-norms of newton iteration corrections
* ---------------------------------------------------------------------
*/
if((istep % inorm) == 0)
{
if(timeron)
{
#pragma omp master
timer_start(T_L2NORM);
}
l2norm(nx0, ny0, nz0, ist, iend, jst, jend, rsd, delunm);
if(timeron)
{
#pragma omp master
timer_stop(T_L2NORM);
}
}
/*
* ---------------------------------------------------------------------
* compute the steady-state residuals
* ---------------------------------------------------------------------
*/
rhs();
/*
* ---------------------------------------------------------------------
* compute the max-norms of newton iteration residuals
* ---------------------------------------------------------------------
*/
if(((istep % inorm) == 0) || (istep == itmax))
{
if(timeron)
{
#pragma omp master
timer_start(T_L2NORM);
}
l2norm(nx0, ny0, nz0, ist, iend, jst, jend, rsd, rsdnm);
if(timeron)
{
#pragma omp master
timer_stop(T_L2NORM);
}
}
/*
* ---------------------------------------------------------------------
* check the newton-iteration residuals against the tolerance levels
* ---------------------------------------------------------------------
*/
if((rsdnm[0] < tolrsd[0]) && (rsdnm[1] < tolrsd[1]) &&
(rsdnm[2] < tolrsd[2]) && (rsdnm[3] < tolrsd[3]) && (rsdnm[4] < tolrsd[4]))
{
#pragma omp master
printf(" \n convergence was achieved after %4d pseudo-time steps\n",
istep);
break;
}
}
} /* end parallel */
timer_stop(1);
maxtime = timer_read(1);
}
/*
* ---------------------------------------------------------------------
* verification routine
* ---------------------------------------------------------------------
*/
void
verify(double xcr[], double xce[], double xci, char* class_npb, boolean* verified)
{
double xcrref[5], xceref[5], xciref;
double xcrdif[5], xcedif[5], xcidif;
double epsilon, dtref = 0.0;
int m;
/*
* ---------------------------------------------------------------------
* tolerance level
* ---------------------------------------------------------------------
*/
epsilon = 1.0e-08;
*class_npb = 'U';
*verified = TRUE;
for(m = 0; m < 5; m++)
{
xcrref[m] = 1.0;
xceref[m] = 1.0;
}
xciref = 1.0;
if((nx0 == 12) && (ny0 == 12) && (nz0 == 12) && (itmax == 50))
{
*class_npb = 'S';
dtref = 5.0e-1;
/*
* ---------------------------------------------------------------------
* reference values of RMS-norms of residual, for the (12X12X12) grid,
* after 50 time steps, with DT = 5.0d-01
* ---------------------------------------------------------------------
*/
xcrref[0] = 1.6196343210976702e-02;
xcrref[1] = 2.1976745164821318e-03;
xcrref[2] = 1.5179927653399185e-03;
xcrref[3] = 1.5029584435994323e-03;
xcrref[4] = 3.4264073155896461e-02;
/*
* ---------------------------------------------------------------------
* reference values of RMS-norms of solution error, for the (12X12X12) grid,
* after 50 time steps, with DT = 5.0d-01
* ---------------------------------------------------------------------
*/
xceref[0] = 6.4223319957960924e-04;
xceref[1] = 8.4144342047347926e-05;
xceref[2] = 5.8588269616485186e-05;
xceref[3] = 5.8474222595157350e-05;
xceref[4] = 1.3103347914111294e-03;
/*
* ---------------------------------------------------------------------
* reference value of surface integral, for the (12X12X12) grid,
* after 50 time steps, with DT = 5.0d-01
* ---------------------------------------------------------------------
*/
xciref = 7.8418928865937083e+00;
}
else if((nx0 == 33) && (ny0 == 33) && (nz0 == 33) && (itmax == 300))
{
*class_npb = 'W'; /* SPEC95fp size */
dtref = 1.5e-3;
/*
* ---------------------------------------------------------------------
* reference values of RMS-norms of residual, for the (33x33x33) grid,
* after 300 time steps, with DT = 1.5d-3
* ---------------------------------------------------------------------
*/
xcrref[0] = 0.1236511638192e+02;
xcrref[1] = 0.1317228477799e+01;
xcrref[2] = 0.2550120713095e+01;
xcrref[3] = 0.2326187750252e+01;
xcrref[4] = 0.2826799444189e+02;
/*
* ---------------------------------------------------------------------
* reference values of RMS-norms of solution error, for the (33X33X33) grid,
* ---------------------------------------------------------------------
*/
xceref[0] = 0.4867877144216e+00;
xceref[1] = 0.5064652880982e-01;
xceref[2] = 0.9281818101960e-01;
xceref[3] = 0.8570126542733e-01;
xceref[4] = 0.1084277417792e+01;
/*
* ---------------------------------------------------------------------
* rReference value of surface integral, for the (33X33X33) grid,
* after 300 time steps, with DT = 1.5d-3
* ---------------------------------------------------------------------
*/
xciref = 0.1161399311023e+02;
}
else if((nx0 == 64) && (ny0 == 64) && (nz0 == 64) && (itmax == 250))
{
*class_npb = 'A';
dtref = 2.0e+0;
/*
* ---------------------------------------------------------------------
* reference values of RMS-norms of residual, for the (64X64X64) grid,
* after 250 time steps, with DT = 2.0d+00
* ---------------------------------------------------------------------
*/
xcrref[0] = 7.7902107606689367e+02;
xcrref[1] = 6.3402765259692870e+01;
xcrref[2] = 1.9499249727292479e+02;
xcrref[3] = 1.7845301160418537e+02;
xcrref[4] = 1.8384760349464247e+03;
/*
* ---------------------------------------------------------------------
* reference values of RMS-norms of solution error, for the (64X64X64) grid,
* after 250 time steps, with DT = 2.0d+00
* ---------------------------------------------------------------------
*/
xceref[0] = 2.9964085685471943e+01;
xceref[1] = 2.8194576365003349e+00;
xceref[2] = 7.3473412698774742e+00;
xceref[3] = 6.7139225687777051e+00;
xceref[4] = 7.0715315688392578e+01;
/*
* ---------------------------------------------------------------------
* reference value of surface integral, for the (64X64X64) grid,
* after 250 time steps, with DT = 2.0d+00
* ---------------------------------------------------------------------
*/
xciref = 2.6030925604886277e+01;
}
else if((nx0 == 102) && (ny0 == 102) && (nz0 == 102) && (itmax == 250))
{
*class_npb = 'B';
dtref = 2.0e+0;
/*
* ---------------------------------------------------------------------
* reference values of RMS-norms of residual, for the (102X102X102) grid,
* after 250 time steps, with DT = 2.0d+00
* ---------------------------------------------------------------------
*/
xcrref[0] = 3.5532672969982736e+03;
xcrref[1] = 2.6214750795310692e+02;
xcrref[2] = 8.8333721850952190e+02;
xcrref[3] = 7.7812774739425265e+02;
xcrref[4] = 7.3087969592545314e+03;
/*
* ---------------------------------------------------------------------
* reference values of RMS-norms of solution error, for the (102X102X102)
* grid, after 250 time steps, with DT = 2.0d+00
* ---------------------------------------------------------------------
*/
xceref[0] = 1.1401176380212709e+02;
xceref[1] = 8.1098963655421574e+00;
xceref[2] = 2.8480597317698308e+01;
xceref[3] = 2.5905394567832939e+01;
xceref[4] = 2.6054907504857413e+02;
/*
c---------------------------------------------------------------------
* reference value of surface integral, for the (102X102X102) grid,
* after 250 time steps, with DT = 2.0d+00
* ---------------------------------------------------------------------
*/
xciref = 4.7887162703308227e+01;
}
else if((nx0 == 162) && (ny0 == 162) && (nz0 == 162) && (itmax == 250))
{
*class_npb = 'C';
dtref = 2.0e+0;
/*
* ---------------------------------------------------------------------
* reference values of RMS-norms of residual, for the (162X162X162) grid,
* after 250 time steps, with DT = 2.0d+00
* ---------------------------------------------------------------------
*/
xcrref[0] = 1.03766980323537846e+04;
xcrref[1] = 8.92212458801008552e+02;
xcrref[2] = 2.56238814582660871e+03;
xcrref[3] = 2.19194343857831427e+03;
xcrref[4] = 1.78078057261061185e+04;
/*
* ---------------------------------------------------------------------
* reference values of RMS-norms of solution error, for the (162X162X162)
* grid, after 250 time steps, with DT = 2.0d+00
* ---------------------------------------------------------------------
*/
xceref[0] = 2.15986399716949279e+02;
xceref[1] = 1.55789559239863600e+01;
xceref[2] = 5.41318863077207766e+01;
xceref[3] = 4.82262643154045421e+01;
xceref[4] = 4.55902910043250358e+02;
/*
* ---------------------------------------------------------------------
* reference value of surface integral, for the (162X162X162) grid,
* after 250 time steps, with DT = 2.0d+00
* ---------------------------------------------------------------------
*/
xciref = 6.66404553572181300e+01;
/*
* ---------------------------------------------------------------------
* reference value of surface integral, for the (162X162X162) grid,
* after 250 time steps, with DT = 2.0d+00
* ---------------------------------------------------------------------
*/
xciref = 6.66404553572181300e+01;
}
else if((nx0 == 408) && (ny0 == 408) && (nz0 == 408) && (itmax == 300))
{
*class_npb = 'D';
dtref = 1.0e+0;
/*
* ---------------------------------------------------------------------
* reference values of RMS-norms of residual, for the (408X408X408) grid,
* after 300 time steps, with DT = 1.0d+00
* ---------------------------------------------------------------------
*/
xcrref[0] = 0.4868417937025e+05;
xcrref[1] = 0.4696371050071e+04;
xcrref[2] = 0.1218114549776e+05;
xcrref[3] = 0.1033801493461e+05;
xcrref[4] = 0.7142398413817e+05;
/*
* ---------------------------------------------------------------------
* reference values of RMS-norms of solution error, for the (408X408X408)
* grid, after 300 time steps, with DT = 1.0d+00
* ---------------------------------------------------------------------
*/
xceref[0] = 0.3752393004482e+03;
xceref[1] = 0.3084128893659e+02;
xceref[2] = 0.9434276905469e+02;
xceref[3] = 0.8230686681928e+02;
xceref[4] = 0.7002620636210e+03;
/*
* ---------------------------------------------------------------------
* reference value of surface integral, for the (408X408X408) grid,
* after 300 time steps, with DT = 1.0d+00
* ---------------------------------------------------------------------
*/
xciref = 0.8334101392503e+02;
}
else if((nx0 == 1020) && (ny0 == 1020) && (nz0 == 1020) && (itmax == 300))
{
*class_npb = 'E';
dtref = 0.5e+0;
/*
* ---------------------------------------------------------------------
* reference values of RMS-norms of residual, for the (1020X1020X1020) grid,
* after 300 time steps, with DT = 0.5d+00
* ---------------------------------------------------------------------
*/
xcrref[0] = 0.2099641687874e+06;
xcrref[1] = 0.2130403143165e+05;
xcrref[2] = 0.5319228789371e+05;
xcrref[3] = 0.4509761639833e+05;
xcrref[4] = 0.2932360006590e+06;
/*
* ---------------------------------------------------------------------
* reference values of RMS-norms of solution error, for the (1020X1020X1020)
* grid, after 300 time steps, with DT = 0.5d+00
* ---------------------------------------------------------------------
*/
xceref[0] = 0.4800572578333e+03;
xceref[1] = 0.4221993400184e+02;
xceref[2] = 0.1210851906824e+03;
xceref[3] = 0.1047888986770e+03;
xceref[4] = 0.8363028257389e+03;
/*
* ---------------------------------------------------------------------
* reference value of surface integral, for the (1020X1020X1020) grid,
* after 300 time steps, with DT = 0.5d+00
* ---------------------------------------------------------------------
*/
xciref = 0.9512163272273e+02;
}
else
{
*verified = FALSE;
}
/*
* ---------------------------------------------------------------------
* verification test for residuals if gridsize is one of
* the defined grid sizes above (class .ne. 'U')
* ---------------------------------------------------------------------
* compute the difference of solution values and the known reference values.
* ---------------------------------------------------------------------
*/
for(m = 0; m < 5; m++)
{
xcrdif[m] = fabs((xcr[m] - xcrref[m]) / xcrref[m]);
xcedif[m] = fabs((xce[m] - xceref[m]) / xceref[m]);
}
xcidif = fabs((xci - xciref) / xciref);
/*
* ---------------------------------------------------------------------
* output the comparison of computed results to known cases.
* ---------------------------------------------------------------------
*/
if(*class_npb != 'U')
{
printf("\n Verification being performed for class_npb %c\n", *class_npb);
printf(" Accuracy setting for epsilon = %20.13E\n", epsilon);
*verified = (fabs(dt - dtref) <= epsilon);
if(!(*verified))
{
*class_npb = 'U';
printf(" DT does not match the reference value of %15.8E\n", dtref);
}
}
else
{
printf(" Unknown class_npb\n");
}
if(*class_npb != 'U')
{
printf(" Comparison of RMS-norms of residual\n");
}
else
{
printf(" RMS-norms of residual\n");
}
for(m = 0; m < 5; m++)
{
if(*class_npb == 'U')
{
printf(" %2d %20.13E\n", m + 1, xcr[m]);
}
else if(xcrdif[m] <= epsilon)
{
printf(" %2d %20.13E%20.13E%20.13E\n", m + 1, xcr[m], xcrref[m],
xcrdif[m]);
}
else
{
*verified = FALSE;
printf(" FAILURE: %2d %20.13E%20.13E%20.13E\n", m + 1, xcr[m], xcrref[m],
xcrdif[m]);
}
}
if(*class_npb != 'U')
{
printf(" Comparison of RMS-norms of solution error\n");
}
else
{
printf(" RMS-norms of solution error\n");
}
for(m = 0; m < 5; m++)
{
if(*class_npb == 'U')
{
printf(" %2d %20.13E\n", m + 1, xce[m]);
}
else if(xcedif[m] <= epsilon)
{
printf(" %2d %20.13E%20.13E%20.13E\n", m + 1, xce[m], xceref[m],
xcedif[m]);
}
else
{
*verified = FALSE;
printf(" FAILURE: %2d %20.13E%20.13E%20.13E\n", m + 1, xce[m], xceref[m],
xcedif[m]);
}
}
if(*class_npb != 'U')
{
printf(" Comparison of surface integral\n");
}
else
{
printf(" Surface integral\n");
}
if(*class_npb == 'U')
{
printf(" %20.13E\n", xci);
}
else if(xcidif <= epsilon)
{
printf(" %20.13E%20.13E%20.13E\n", xci, xciref, xcidif);
}
else
{
*verified = FALSE;
printf(" FAILURE: %20.13E%20.13E%20.13E\n", xci, xciref, xcidif);
}
if(*class_npb == 'U')
{
printf(" No reference values provided\n");
printf("No verification performed\n");
}
else if(*verified)
{
printf(" Verification Successful\n");
}
else
{
printf(" Verification failed\n");
}
}