Commit 4f8cc609 authored by Jean-Matthieu Gallard's avatar Jean-Matthieu Gallard
Browse files

KernelGen Nonlinear STP - fix IPO bug with gemm tmp output array by reusing lduh instead

parent 26b077cc
...@@ -108,7 +108,6 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral( ...@@ -108,7 +108,6 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral(
{{m.vectPDEsArrays('St', nVar, False) | indent(2)}}{##} {{m.vectPDEsArrays('St', nVar, False) | indent(2)}}{##}
{% endif %} {% endif %}
double new_lQi_slice[{{nDof*nVarPad}}] __attribute__((aligned(ALIGNMENT))); //for step 4 (computing new lQi value), doesn't update parameters
const double dtBydx = inverseDx * dt; //Assume dx[0] == dx[1] == dx[2] const double dtBydx = inverseDx * dt; //Assume dx[0] == dx[1] == dx[2]
{% if useNCP or (useFlux and useCERKGuess) or useViscousFlux %} {% if useNCP or (useFlux and useCERKGuess) or useViscousFlux %}
double dudxT_by_dx[{{nDof*nDofPad}}] __attribute__((aligned(ALIGNMENT))); double dudxT_by_dx[{{nDof*nDofPad}}] __attribute__((aligned(ALIGNMENT)));
...@@ -322,15 +321,16 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral( ...@@ -322,15 +321,16 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral(
double machine_precision = max_approx*1e-10; // used to avoid rounding error relative error to prevent convergence, assume all variables are correlated (and thus value below avg variable * 10^-10 is a rounding error) double machine_precision = max_approx*1e-10; // used to avoid rounding error relative error to prevent convergence, assume all variables are correlated (and thus value below avg variable * 10^-10 is a rounding error)
{% endif %} {% endif %}
for (int xyz = 0; xyz < {{nDof**nDim}}; xyz++) { for (int xyz = 0; xyz < {{nDof**nDim}}; xyz++) {
{{ m.matmul_legacy('lqi', 'rhs', 's_m_QSlice', 'new_lQi_slice',nVarPad~'*xyz', '0', '0', trueB='iK1_T', trueAlpha='iweights3[xyz]') | indent(6) }}{##} // use lduh as scratch to store new slice
{{ m.matmul_legacy('lqi', 'rhs', 's_m_QSlice', 'lduh', nVarPad~'*xyz', '0', '0', trueB='iK1_T', trueAlpha='iweights3[xyz]') | indent(6) }}{##}
for(int t = 0; t < {{nDof}}; t++) { for(int t = 0; t < {{nDof}}; t++) {
for(int n=0; n<{{nVar}}; n++) { //only copy and change the variables, skip parameters for(int n=0; n<{{nVar}}; n++) { //only copy and change the variables, skip parameters
{% if advancedStopCriterion %} {% if advancedStopCriterion %}
sq_res += ((new_lQi_slice[n+{{nVarPad}}*t] - lQi[n+{{nDataPad}}*(xyz+{{nDof**nDim}}*t)]) * (new_lQi_slice[n+{{nVarPad}}*t] - lQi[n+{{nDataPad}}*(xyz+{{nDof**nDim}}*t)]))/((lQi[n+{{nDataPad}}*(xyz+{{nDof**nDim}}*t)]*lQi[n+{{nDataPad}}*(xyz+{{nDof**nDim}}*t)])+machine_precision); sq_res += ((lduh[n+{{nVarPad}}*t] - lQi[n+{{nDataPad}}*(xyz+{{nDof**nDim}}*t)]) * (lduh[n+{{nVarPad}}*t] - lQi[n+{{nDataPad}}*(xyz+{{nDof**nDim}}*t)]))/((lQi[n+{{nDataPad}}*(xyz+{{nDof**nDim}}*t)]*lQi[n+{{nDataPad}}*(xyz+{{nDof**nDim}}*t)])+machine_precision);
{% else %} {% else %}
sq_res += (new_lQi_slice[n+{{nVarPad}}*t] - lQi[n+{{nDataPad}}*(xyz+{{nDof**nDim}}*t)]) * (new_lQi_slice[n+{{nVarPad}}*t] - lQi[n+{{nDataPad}}*(xyz+{{nDof**nDim}}*t)]); sq_res += (lduh[n+{{nVarPad}}*t] - lQi[n+{{nDataPad}}*(xyz+{{nDof**nDim}}*t)]) * (lduh[n+{{nVarPad}}*t] - lQi[n+{{nDataPad}}*(xyz+{{nDof**nDim}}*t)]);
{% endif %} {% endif %}
lQi[n+{{nDataPad}}*(xyz+{{nDof**nDim}}*t)] = new_lQi_slice[n+{{nVarPad}}*t]; lQi[n+{{nDataPad}}*(xyz+{{nDof**nDim}}*t)] = lduh[n+{{nVarPad}}*t];
} }
} }
} }
......
...@@ -103,7 +103,6 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral( ...@@ -103,7 +103,6 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral(
// 0. Allocate local variable // 0. Allocate local variable
constexpr int MaxIterations = {{2*nDof+1}}; constexpr int MaxIterations = {{2*nDof+1}};
double new_lQi_slice[{{nDof*nDof2Pad}}] __attribute__((aligned(ALIGNMENT))) = {0.}; //for step 4 (computing new lQi value), doesn't update parameters (t*yx (fused y and x block))
const double inverseDt = 1.0 / dt; const double inverseDt = 1.0 / dt;
const double dtBydx = inverseDx * dt; //Assume dx[0] == dx[1] == dx[2] const double dtBydx = inverseDx * dt; //Assume dx[0] == dx[1] == dx[2]
{% if useNCP or (useFlux and useCERKGuess) or useViscousFlux %} {% if useNCP or (useFlux and useCERKGuess) or useViscousFlux %}
...@@ -327,11 +326,12 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral( ...@@ -327,11 +326,12 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral(
// 3. Multiply with (K1)^(-1) to get the discrete time integral of the discrete Picard iteration. Rhs missing weight and dt included in coeff matrix // 3. Multiply with (K1)^(-1) to get the discrete time integral of the discrete Picard iteration. Rhs missing weight and dt included in coeff matrix
double sq_res = 0.0; double sq_res = 0.0;
for (int zn = 0; zn < {{nDof3D*nVar}}; zn++) { for (int zn = 0; zn < {{nDof3D*nVar}}; zn++) {
{{ m.matmul('lqi', 'rhs', 'iK1_T_wt_dt', 'new_lQi_slice',idxRhs(0,0,zn,0,0), '0', '0') | indent(6) }}{##} // use lduh as scratch to store new slice
{{ m.matmul('lqi', 'rhs', 'iK1_T_wt_dt', 'lduh',idxRhs(0,0,zn,0,0), '0', '0') | indent(6) }}{##}
for(int t = 0; t < {{nDof}}; t++) { for(int t = 0; t < {{nDof}}; t++) {
for(int yx=0; yx<{{nDof2Pad}}; yx++) { //only copy and change the variables, skip parameters for(int yx=0; yx<{{nDof2Pad}}; yx++) { //only copy and change the variables, skip parameters
sq_res += (new_lQi_slice[yx+{{nDof2Pad}}*t] - lQi[{{idxLQi(t,0,zn,0,yx)}}]) * (new_lQi_slice[yx+{{nDof2Pad}}*t] - lQi[{{idxLQi(t,0,zn,0,yx)}}]); sq_res += (lduh[yx+{{nDof2Pad}}*t] - lQi[{{idxLQi(t,0,zn,0,yx)}}]) * (lduh[yx+{{nDof2Pad}}*t] - lQi[{{idxLQi(t,0,zn,0,yx)}}]);
lQi[{{idxLQi(t,0,zn,0,yx)}}] = new_lQi_slice[yx+{{nDof2Pad}}*t]; lQi[{{idxLQi(t,0,zn,0,yx)}}] = lduh[yx+{{nDof2Pad}}*t];
} }
} }
} }
......
...@@ -98,7 +98,6 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral( ...@@ -98,7 +98,6 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral(
{{ m.setupMatmul('lqi') | indent(2) }}{##} {{ m.setupMatmul('lqi') | indent(2) }}{##}
// 0. Allocate local variable // 0. Allocate local variable
double new_lQi_slice[{{nDof*nVarPad}}] __attribute__((aligned(ALIGNMENT))); //for step 4 (computing new lQi value), doesn't update parameters
const double inverseDt = 1.0 / dt; const double inverseDt = 1.0 / dt;
const double dtBydx = inverseDx * dt; //Assume dx[0] == dx[1] == dx[2] const double dtBydx = inverseDx * dt; //Assume dx[0] == dx[1] == dx[2]
{% if useNCP or (useFlux and useCERKGuess) or useViscousFlux %} {% if useNCP or (useFlux and useCERKGuess) or useViscousFlux %}
...@@ -365,11 +364,12 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral( ...@@ -365,11 +364,12 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral(
// 3. Multiply with (K1)^(-1) to get the discrete time integral of the discrete Picard iteration. Rhs missing weight and dt included in coeff matrix // 3. Multiply with (K1)^(-1) to get the discrete time integral of the discrete Picard iteration. Rhs missing weight and dt included in coeff matrix
double sq_res = 0.0; double sq_res = 0.0;
for (int zyx = 0; zyx < {{nDof**nDim}}; zyx++) { for (int zyx = 0; zyx < {{nDof**nDim}}; zyx++) {
{{ m.matmul('lqi', 'rhs', 'iK1_T_wt_dt', 'new_lQi_slice',idxRhs(0,0,0,zyx,0), '0', '0') | indent(6) }}{##} // use lduh as scratch to store new slice
{{ m.matmul('lqi', 'rhs', 'iK1_T_wt_dt', 'lduh',idxRhs(0,0,0,zyx,0), '0', '0') | indent(6) }}{##}
for(int t = 0; t < {{nDof}}; t++) { for(int t = 0; t < {{nDof}}; t++) {
for(int n=0; n<{{nVar}}; n++) { //only copy and change the variables, skip parameters for(int n=0; n<{{nVar}}; n++) { //only copy and change the variables, skip parameters
sq_res += (new_lQi_slice[n+{{nVarPad}}*t] - lQi[{{idxLQi(t,0,0,zyx,n)}}]) * (new_lQi_slice[n+{{nVarPad}}*t] - lQi[{{idxLQi(t,0,0,zyx,n)}}]); sq_res += (lduh[n+{{nVarPad}}*t] - lQi[{{idxLQi(t,0,0,zyx,n)}}]) * (lduh[n+{{nVarPad}}*t] - lQi[{{idxLQi(t,0,0,zyx,n)}}]);
lQi[{{idxLQi(t,0,0,zyx,n)}}] = new_lQi_slice[n+{{nVarPad}}*t]; lQi[{{idxLQi(t,0,0,zyx,n)}}] = lduh[n+{{nVarPad}}*t];
} }
} }
} }
......
...@@ -103,7 +103,6 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral( ...@@ -103,7 +103,6 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral(
// 0. Allocate local variable // 0. Allocate local variable
constexpr int MaxIterations = {{2*nDof+1}}; constexpr int MaxIterations = {{2*nDof+1}};
double new_lQi_slice[{{nDof*nVar*nDofPad}}] __attribute__((aligned(ALIGNMENT))) = {0.}; //for step 4 (computing new lQi value), doesn't update parameters
const double inverseDt = 1.0 / dt; const double inverseDt = 1.0 / dt;
const double dtBydx = inverseDx * dt; //Assume dx[0] == dx[1] == dx[2] const double dtBydx = inverseDx * dt; //Assume dx[0] == dx[1] == dx[2]
{% if useNCP or (useFlux and useCERKGuess) or useViscousFlux %} {% if useNCP or (useFlux and useCERKGuess) or useViscousFlux %}
...@@ -381,11 +380,12 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral( ...@@ -381,11 +380,12 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral(
// 3. Multiply with (K1)^(-1) to get the discrete time integral of the discrete Picard iteration. Rhs missing weight and dt included in coeff matrix // 3. Multiply with (K1)^(-1) to get the discrete time integral of the discrete Picard iteration. Rhs missing weight and dt included in coeff matrix
double sq_res = 0.0; double sq_res = 0.0;
for (int zy = 0; zy < {{nDof3D*nDof}}; zy++) { for (int zy = 0; zy < {{nDof3D*nDof}}; zy++) {
{{ m.matmul('lqi', 'rhs', 'iK1_T_wt_dt', 'new_lQi_slice',idxRhs(0,0,zy,0,0), '0', '0') | indent(6) }}{##} // use lduh as scratch to store new slice
{{ m.matmul('lqi', 'rhs', 'iK1_T_wt_dt', 'lduh',idxRhs(0,0,zy,0,0), '0', '0') | indent(6) }}{##}
for(int t = 0; t < {{nDof}}; t++) { for(int t = 0; t < {{nDof}}; t++) {
for(int nx=0; nx<{{nVar*nDofPad}}; nx++) { //only copy and change the variables, skip parameters for(int nx=0; nx<{{nVar*nDofPad}}; nx++) { //only copy and change the variables, skip parameters
sq_res += (new_lQi_slice[nx+{{nVar*nDofPad}}*t] - lQi[{{idxLQi(t,0,zy,0,nx)}}]) * (new_lQi_slice[nx+{{nVar*nDofPad}}*t] - lQi[{{idxLQi(t,0,zy,0,nx)}}]); sq_res += (lduh[nx+{{nVar*nDofPad}}*t] - lQi[{{idxLQi(t,0,zy,0,nx)}}]) * (lduh[nx+{{nVar*nDofPad}}*t] - lQi[{{idxLQi(t,0,zy,0,nx)}}]);
lQi[{{idxLQi(t,0,zy,0,nx)}}] = new_lQi_slice[nx+{{nVar*nDofPad}}*t]; lQi[{{idxLQi(t,0,zy,0,nx)}}] = lduh[nx+{{nVar*nDofPad}}*t];
} }
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment