Commit b096afb2 authored by Jean-Matthieu Gallard

KernelGen - add prefetching for the Eigen and linear AoSoA kernels + experimental test of prefetching around user functions

parent 43378557
......@@ -71,10 +71,11 @@ class Configuration:
# prefetching settings
# Experimental, not supported by all kernels
# Will use prefetching to optimize tensor operation (prefetch the next slice of an LoG)
prefetching = "None" # "None", "Inputs", "Outputs", "All"
prefetchLevel = "_MM_HINT_T0" # intrinsic _mm_prefetch locality hint (_MM_HINT_T0 = all levels of cache), see compiler header xmmintrin.h
cachelineSize = {
# Will use prefetching to optimize tensor operations (prefetch the next slices of an LoG)
prefetchingLoG = "None" # "None" (disables the feature), "Inputs" (prefetch only the moving input slices of the next LoG), "Outputs" (same for the output slices), "All" (both inputs and outputs)
prefetchingPDE = "None" # "None" (disables the feature), "Inputs" (prefetch only the next PDE input chunks), "Outputs" (same for the output chunks), "All" (both inputs and outputs)
prefetchLevel = "_MM_HINT_T0" # intrinsic _mm_prefetch locality hint (_MM_HINT_T0 = all levels of cache), see compiler header xmmintrin.h
cachelineSize = {
"noarch" : 8,
"wsm" : 8,
"snb" : 8,
......
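For context on the new settings above: `prefetchLevel` is passed straight to `_mm_prefetch`, and the per-architecture `cachelineSize` values appear to be expressed in doubles (8 doubles = one 64-byte line), which is why the single-precision path later doubles them. A minimal sketch, not generated code, of what a single emitted hint looks like:

#include <xmmintrin.h>

// Minimal sketch, not generated code: one emitted hint. _MM_HINT_T0 pulls the
// line into all cache levels; _MM_HINT_T1/_MM_HINT_T2 target outer levels only
// and _MM_HINT_NTA marks the data non-temporal (see xmmintrin.h).
inline void prefetchLine(const double* p) {
  _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
}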
......@@ -70,8 +70,10 @@ class Controller:
"useEigen" : Configuration.matmulLib == "Eigen",
"pathToLibxsmmGemmGenerator" : Configuration.pathToLibxsmmGemmGenerator,
"runtimeDebug" : Configuration.runtimeDebug, #for debug
"prefetchInputs" : Configuration.prefetching in ["Inputs", "All"],
"prefetchOutputs" : Configuration.prefetching in ["Outputs", "All"],
"prefetchLoGInputs" : Configuration.prefetchingLoG in ["Inputs", "All"],
"prefetchLoGOutputs" : Configuration.prefetchingLoG in ["Outputs", "All"],
"prefetchPDEInputs" : Configuration.prefetchingPDE in ["Inputs", "All"],
"prefetchPDEOutputs" : Configuration.prefetchingPDE in ["Outputs", "All"],
"prefetchLevel" : Configuration.prefetchLevel
}
......@@ -102,10 +104,10 @@ class Controller:
"useVectPDE" : args["useVectPDE"],
"useAoSoA2" : args["useAoSoA2"],
"predictorRecompute" : args["predictorRecompute"],
"advancedStopCriterion" : False, #TODO JMG put as proper toolkit arg
#"initialGuess" : "mixedPicard", #TODO JMG put as proper toolkit arg
"initialGuess" : "default", #TODO JMG put as proper toolkit arg
"singlePrecisionSTP" : args["singlePrecisionSTP"], # experiment, not supported by every kernel
"advancedStopCriterion" : False, #TODO Experimental WiP
#"initialGuess" : "mixedPicard", #TODO Experimental WiP
"initialGuess" : "default", #TODO Experimental WiP
"singlePrecisionSTP" : args["singlePrecisionSTP"], # experiment, only supported by linear AoSoA2
"useSinglePrecision" : args["singlePrecisionSTP"] # should be enabled if single precision coeff matrices are required
})
self.config["useSourceOrNCP"] = self.config["useSource"] or self.config["useNCP"]
......@@ -146,7 +148,7 @@ class Controller:
self.validateConfig(Configuration.simdWidth.keys())
self.config["vectSize"] = Configuration.simdWidth[self.config["architecture"]] #only initialize once architecture has been validated
self.config["cachelineSize"] = Configuration.cachelineSize[self.config["architecture"]] #only initialize once architecture has been validated
# if single precision is used, multiply SIMD and cache values by 2 (TODO JMG: WiP, this affects all the code instead of only the single precision kernels)
# if single precision is used, multiply the SIMD width and cacheline size by 2 (TODO Experimental: WiP, this affects all generated code instead of only the single-precision kernels)
if self.config["useSinglePrecision"]:
self.config["vectSize"] *= 2
self.config["cachelineSize"] *= 2
......
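A short aside on the doubling just above, assuming the usual 64-byte cacheline (the constants below are illustrative, not part of the toolkit):

// Why vectSize and cachelineSize are doubled for single precision:
// the same 64-byte line (and the same SIMD register) holds twice as many floats.
constexpr int cachelineBytes   = 64;
constexpr int cachelineDoubles = cachelineBytes / sizeof(double); // 8
constexpr int cachelineFloats  = cachelineBytes / sizeof(float);  // 16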
......@@ -139,16 +139,16 @@ class FusedSpaceTimePredictorVolumeIntegralModel(AbstractModelBaseClass):
if self.context["useFlux"]:
if self.context["useMaterialParam"]:
self.context["matmulConfigs"]["flux_x_sck_vect"] = MatmulConfig(nDofPad, nVar, nDof, nDofPad , nDofPad, nDofPad , 1, 0, 1, 1, 1, "flux_x_sck_vect") # beta, 0 => overwrite C
self.context["matmulConfigs"]["flux_y_or_z_sck_vect"] = MatmulConfig(nDofPad*nVar, nVar, nDof, nDofPad*nVar , nDofPad, nDofPad*nVar , 1, 0, 1, 1, 1, "flux_y_or_z_sck_vect") # beta, 0 => overwrite C
self.context["matmulConfigs"]["flux_y_or_z_sck_vect"] = MatmulConfig(nDofPad*nVar, nVar, nDof, nDofPad*nVar , nDofPad, nDofPad*nVar , 1, 0, 1, 1, 1, "flux_y_or_z_sck_vect") # no moving slices, beta, 0 => overwrite C
else:
self.context["matmulConfigs"]["flux_x_sck_vect"] = MatmulConfig(nDofPad, nVar, nDof, nDofPad , nDofPad, nDofPad , 1, 1, 1, 1, 1, "flux_x_sck_vect")
self.context["matmulConfigs"]["flux_y_sck_vect"] = MatmulConfig(nDofPad*nVar, nDof, nDof, nDofPad*nVar , nDofPad, nDofPad*nVar , 1, 1, 1, 1, 1, "flux_y_sck_vect")
self.context["matmulConfigs"]["flux_x_sck_vect"] = MatmulConfig(nDofPad, nVar, nDof, nDofPad , nDofPad, nDofPad , 1, 1, 1, 1, 1, "flux_x_sck_vect", prefetchOutput="C")
self.context["matmulConfigs"]["flux_y_sck_vect"] = MatmulConfig(nDofPad*nVar, nDof, nDof, nDofPad*nVar , nDofPad, nDofPad*nVar , 1, 1, 1, 1, 1, "flux_y_sck_vect", prefetchOutput="C")
if self.context["nDim"]>=3:
self.context["matmulConfigs"]["flux_z_sck_vect"] = MatmulConfig(nDofPad*nVar, nDof, nDof, nDofPad*nVar , nDofPad, nDofPad*nVar*nDof, 1, 1, 1, 1, 1, "flux_z_sck_vect")
self.context["matmulConfigs"]["gradQ_x_sck_vect"] = MatmulConfig(nDofPad, nVar*nDof*nDof3D, nDof, nDofPad , nDofPad, nDofPad , 1, 0, 1, 1, 1, "gradQ_x_sck_vect") # beta, 0 => overwrite C
self.context["matmulConfigs"]["gradQ_y_sck_vect"] = MatmulConfig(nDofPad*nVar, nDof, nDof, nDofPad*nVar , nDofPad, nDofPad*nVar , 1, 0, 1, 1, 1, "gradQ_y_sck_vect") # beta, 0 => overwrite C
self.context["matmulConfigs"]["flux_z_sck_vect"] = MatmulConfig(nDofPad*nVar, nDof, nDof, nDofPad*nVar , nDofPad, nDofPad*nVar*nDof, 1, 1, 1, 1, 1, "flux_z_sck_vect", prefetchOutput="C")
self.context["matmulConfigs"]["gradQ_x_sck_vect"] = MatmulConfig(nDofPad, nVar*nDof*nDof3D, nDof, nDofPad , nDofPad, nDofPad , 1, 0, 1, 1, 1, "gradQ_x_sck_vect") # no moving slices, beta, 0 => overwrite C
self.context["matmulConfigs"]["gradQ_y_sck_vect"] = MatmulConfig(nDofPad*nVar, nDof, nDof, nDofPad*nVar , nDofPad, nDofPad*nVar , 1, 0, 1, 1, 1, "gradQ_y_sck_vect", prefetchInput="A", prefetchOutput="C") # beta, 0 => overwrite C
if self.context["nDim"]>=3:
self.context["matmulConfigs"]["gradQ_z_sck_vect"] = MatmulConfig(nDofPad*nVar*nDof, nDof, nDof, nDofPad*nVar*nDof, nDofPad, nDofPad*nVar*nDof, 1, 0, 1, 1, 1, "gradQ_z_sck_vect") # beta, 0 => overwrite C
self.context["matmulConfigs"]["gradQ_z_sck_vect"] = MatmulConfig(nDofPad*nVar*nDof, nDof, nDof, nDofPad*nVar*nDof, nDofPad, nDofPad*nVar*nDof, 1, 0, 1, 1, 1, "gradQ_z_sck_vect") # no moving slices, beta, 0 => overwrite C
# Linear SplitCK scalar
......@@ -187,12 +187,12 @@ class FusedSpaceTimePredictorVolumeIntegralModel(AbstractModelBaseClass):
# Nonlinear PredictorRecompute AoSoA2
if self.context["useFlux"]:
self.context["matmulConfigs"]["rhs_x"] = MatmulConfig(nDof, nDof, nDof, nDofPad, nDof, nDof, 1, 1, 1, 1, 1, "rhs_x", prefetchInput="B", prefetchOutput="C")
self.context["matmulConfigs"]["rhs_y"] = MatmulConfig(nDof, nDof, nDof, nDof, nDofPad, nDof, 1, 1, 1, 1, 1, "rhs_y", prefetchInput="A", prefetchOutput="C")
self.context["matmulConfigs"]["rhs_x"] = MatmulConfig(nDof, nDof, nDof, nDofPad, nDof, nDof, 1, 1, 1, 1, 1, "rhs_x") #, prefetchInput="B", prefetchOutput="C")
self.context["matmulConfigs"]["rhs_y"] = MatmulConfig(nDof, nDof, nDof, nDof, nDofPad, nDof, 1, 1, 1, 1, 1, "rhs_y") #, prefetchInput="A", prefetchOutput="C")
if self.context["nDim"]>=3:
self.context["matmulConfigs"]["rhs_z"] = MatmulConfig(nDof2Pad*nVar, nDof, nDof, nDof2Pad*nVar, nDofPad, nDof2Pad*nVar, 1, 1, 1, 1, 1, "rhs_z")
self.context["matmulConfigs"]["lduh_x"] = MatmulConfig(nDof, nDof, nDof, nDofPad, nDof, nDof, 1, 1, 1, 1, 1, "lduh_x", prefetchInput="B", prefetchOutput="C")
self.context["matmulConfigs"]["lduh_y"] = MatmulConfig(nDof, nDof, nDof, nDof, nDofPad, nDof, 1, 1, 1, 1, 1, "lduh_y", prefetchInput="A", prefetchOutput="C")
self.context["matmulConfigs"]["lduh_x"] = MatmulConfig(nDof, nDof, nDof, nDofPad, nDof, nDof, 1, 1, 1, 1, 1, "lduh_x") #, prefetchInput="B", prefetchOutput="C")
self.context["matmulConfigs"]["lduh_y"] = MatmulConfig(nDof, nDof, nDof, nDof, nDofPad, nDof, 1, 1, 1, 1, 1, "lduh_y") #, prefetchInput="A", prefetchOutput="C")
if self.context["nDim"]>=3:
self.context["matmulConfigs"]["lduh_z"] = MatmulConfig(nDof2Pad*nVar, nDof, nDof, nDof2Pad*nVar, nDofPad, nDof2Pad*nVar, 1, 1, 1, 1, 1, "lduh_z")
if self.context["useNCP"] or self.context['useViscousFlux']:
......
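To make the new `prefetchInput`/`prefetchOutput` arguments above concrete, here is an illustrative sketch of the pattern they enable in the generated loops (function and variable names are placeholders, not the generated API): before the GEMM of the current slice runs, the moving operands of the next slice are hinted into cache.

#include <xmmintrin.h>

// Placeholder sketch of e.g. gradQ_y_sck_vect with prefetchInput="A",
// prefetchOutput="C": hint the next slice's A and C, then compute the current
// slice (B, the small constant coefficient matrix, likely stays cached anyway).
inline void prefetchSlice(const double* p, int sizeInDoubles) {
  for (int i = 0; i < sizeInDoubles; i += 8)     // 8 doubles per 64-byte line
    _mm_prefetch(reinterpret_cast<const char*>(p + i), _MM_HINT_T0);
}

template <class Gemm>
void logLoop(const double* A, const double* B, double* C,
             int nSlices, int sliceA, int sliceC, Gemm gemm) {
  for (int s = 0; s < nSlices; s++) {
    prefetchSlice(A + (s + 1) * sliceA, sliceA); // A of slice s+1
    prefetchSlice(C + (s + 1) * sliceC, sliceC); // C of slice s+1
    gemm(A + s * sliceA, B, C + s * sliceC);     // compute the current slice
  }
}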
......@@ -11,7 +11,7 @@
* For the full license text, see LICENSE.txt
**/
{% import 'subtemplates/macros.template' as m with context %}{# get template macros #}
{% import "subtemplates/index.template" as i with context %}
{% import 'subtemplates/index.template' as i with context %}
{% macro idxW3(z,y,x) %}{{i.index_3(z,y,x,nDof,nDof)}}{% endmacro %}
{% macro idxLuh(z,y,x,n) %}{{i.index_4(z,y,x,n,nDof,nDof,nData)}}{% endmacro %}
{% macro idxLPi(z,y,n,x) %}{{i.index_4(z,y,n,x,nDof,nPar,nDofPad)}}{% endmacro %}
......@@ -20,8 +20,10 @@
{% macro idxLduh(z,y,x,n) %}{{i.index_4(z,y,x,n,nDof,nDof,nVarPad)}}{% endmacro %}{# lduh is not transposed #}
{% macro idxLQhbnd(f,z_y,y_x,n) %}{{i.index_4(f,z_y,y_x,n,nDof3D,nDof,nDataPad)}}{% endmacro %}{# f = face | x face: z_y = z, y_x = y | y face: z_y = z, y_x = x | z face: z_y = y, y_x = x #}
{% macro idxLFhbnd(f,z_y,y_x,n) %}{{i.index_4(f,z_y,y_x,n,nDof3D,nDof,nVarPad)}}{% endmacro %}{# f = face | x face: z_y = z, y_x = y | y face: z_y = z, y_x = x | z face: z_y = y, y_x = x #}
{% set x,y,z,n,t,xyz,it,it_t="x","y","z","n","t","xyz","it","it_t" %}{# shortcut for the idx #}
{% set xy,yz,nx="xy","yz","nx" %}{# shortcut for the idx #}
{% set x,y,z,n,t,xyz,it,it_t='x','y','z','n','t','xyz','it','it_t' %}{# shortcut for the idx #}
{% set xy,yz,nx='xy','yz','nx' %}{# shortcut for the idx #}
{% set PDEChunkSize=nDofPad*nVar %}
{% set PDEChunkSize_Par=nDofPad*nPar %}
#include <cstring>
#include <algorithm>
......@@ -194,10 +196,10 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral{{nameSuffix}}(
{% if nPar != 0 %}
for (int n = 0; n < {{nPar}}; n++) {
for (int x = 0; x < {{nDof}}; x++) {
lPi[{{idxLPi(0,yz,n,x)}}] = luh[{{idxLuh(0,yz,x,"n+"~nVar)}}];
lPi[{{idxLPi(0,yz,n,x)}}] = luh[{{idxLuh(0,yz,x,'n+'~nVar)}}];
}
for (int x = {{nDof}}; x < {{nDofPad}}; x++) {
lPi[{{idxLPi(0,yz,n,x)}}] = 0;//luh[{{idxLuh(0,yz,0,"n+"~nVar)}}];
lPi[{{idxLPi(0,yz,n,x)}}] = 0;//luh[{{idxLuh(0,yz,0,'n+'~nVar)}}];
}
}
{% endif %}
......@@ -229,17 +231,20 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral{{nameSuffix}}(
{% if useFlux %}
//call flux in x
for (int yz = 0; yz < {{nDof*nDof3D}}; yz++) {
{{m.prefetchPDEInput('lQi', PDEChunkSize, idx(0,'(yz+1)',0,0)) | indent(6) }}
{% if nPar != 0 %}{{m.prefetchPDEInput('lPi', PDEChunkSize_Par, idxLPi(0,'(yz+1)',0,0)) | indent(6) }}{%endif%}
solver.{{solverName}}::flux_x_vect(lQi+{{idx(0,yz,0,0)}}, {% if nPar != 0 %}lPi+{{idxLPi(0,yz,0,0)}}{% else %}nullptr{%endif%}, lFhi); //lFhi[N][X]
{% if useMaterialParam %}
// store M*rhs in gradQ [n][x]
{{ m.matmul('flux_x_sck_vect', 'negativeDudx_by_dx', 'lFhi', 'gradQ', '0', '0', '0') | indent(6) }}{##}
{{m.prefetchPDEInput('lPi', PDEChunkSize_Par, idxLPi(0,'(yz+1)',0,0)) | indent(6) }}
solver.{{solverName}}::multiplyMaterialParameterMatrix_vect(lPi+{{idxLPi(0,yz,0,0)}}, gradQ);
#pragma omp simd aligned(lQi_next,gradQ:ALIGNMENT)
for (int nx = 0; nx < {{nVar*nDofPad}} ; nx++){
lQi_next[{{idx(0,yz,0,nx)}}] += gradQ[{{idx(0,0,0,nx)}}];
}
{% else %}
{{ m.matmul('flux_x_sck_vect', 'negativeDudx_by_dx', 'lFhi', 'lQi_next', '0', '0', idx(0,yz,0,0)) | indent(6) }}{##}
{{ m.matmul_prefetch('flux_x_sck_vect', 'negativeDudx_by_dx', 'lFhi', 'lQi_next', '0', '0', idx(0,yz,0,0), '0', '0', idx(0,'(yz+1)',0,0)) | indent(6) }}{##}
{% endif %}{# useMaterialParam #}
}
{% endif %}{# useFlux #}
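The PDE-side counterpart above (prefetchPDEInput ahead of each flux_x_vect call) follows the same idea; a minimal sketch under the assumption of contiguous per-yz chunks, with fluxX standing in for the user-supplied solver callback:

#include <xmmintrin.h>

// Minimal sketch, not the generated kernel: prefetch the lQi/lPi chunks of
// iteration yz+1 before handing the current chunk to the user PDE function.
template <class FluxX>
void fluxLoop(const double* lQi, const double* lPi, double* lFhi,
              int nChunks, int qChunk, int pChunk, FluxX fluxX) {
  auto prefetch = [](const double* p, int len) {
    for (int i = 0; i < len; i += 8)             // 8 doubles per 64-byte line
      _mm_prefetch(reinterpret_cast<const char*>(p + i), _MM_HINT_T0);
  };
  for (int yz = 0; yz < nChunks; yz++) {
    prefetch(lQi + (yz + 1) * qChunk, qChunk);   // next input chunk
    prefetch(lPi + (yz + 1) * pChunk, pChunk);   // next material-parameter chunk
    fluxX(lQi + yz * qChunk, lPi + yz * pChunk, lFhi); // user flux for chunk yz
  }
}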
......@@ -251,6 +256,9 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral{{nameSuffix}}(
//call ncp in x
for (int yz = 0; yz < {{nDof*nDof3D}}; yz++) {
{{m.prefetchPDEInput('lQi', PDEChunkSize, idx(0,'(yz+1)',0,0)) | indent(6) }}
{% if nPar != 0 %}{{m.prefetchPDEInput('lPi', PDEChunkSize_Par, idxLPi(0,'(yz+1)',0,0)) | indent(6) }}{%endif%}
{{m.prefetchPDEOutput('gradQ', PDEChunkSize, idx(0,'(yz+1)',0,0)) | indent(6) }}
solver.{{solverName}}::nonConservativeProduct_x_vect(lQi+{{idx(0,yz,0,0)}}, {% if nPar != 0 %}lPi+{{idxLPi(0,yz,0,0)}}{% else %}nullptr{%endif%}, gradQ+{{idx(0,yz,0,0)}}, lFhi);
#pragma omp simd aligned(lQi_next,lFhi:ALIGNMENT)
for (int nx = 0; nx < {{nVar*nDofPad}}; nx++) {
......@@ -262,12 +270,17 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral{{nameSuffix}}(
//call flux in y
for (int z = 0; z < {{nDof3D}}; z++) {
for (int y = 0; y < {{nDof}} ; y++){
{{m.prefetchPDEInput('lQi', PDEChunkSize, idx(z,'(y+1)',0,0)) | indent(8) }}
{% if nPar != 0 %}{{m.prefetchPDEInput('lPi', PDEChunkSize_Par, idxLPi(z,'(y+1)',0,0)) | indent(8) }}{%endif%}
{{m.prefetchPDEOutput('lFhi', PDEChunkSize, '(y+1)*'~nDofPad*nVar) | indent(8) }}
solver.{{solverName}}::flux_y_vect(lQi+{{idx(z,y,0,0)}}, {% if nPar != 0 %}lPi+{{idxLPi(z,y,0,0)}}{% else %}nullptr{%endif%}, lFhi+y*{{nDofPad*nVar}}); //lFhi[Y][N][X]
}
{% if useMaterialParam %}
// store M*rhs in gradQ [y][n][x], fuse n and x
{{ m.matmul('flux_y_or_z_sck_vect', 'lFhi', 'negativeDudxT_by_dx', 'gradQ', '0', '0', '0') | indent(6) }}{##}
for (int y = 0; y < {{nDof}} ; y++){
{{m.prefetchPDEInput('lPi', PDEChunkSize_Par, idxLPi(z,'(y+1)',0,0)) | indent(8) }}
{{m.prefetchPDEOutput('gradQ', PDEChunkSize, '(y+1)*'~nDofPad*nVar) | indent(8) }}
solver.{{solverName}}::multiplyMaterialParameterMatrix_vect(lPi+{{idxLPi(z,y,0,0)}}, gradQ+y*{{nDofPad*nVar}});
#pragma omp simd aligned(lQi_next,gradQ:ALIGNMENT)
for (int nx = 0; nx < {{nVar*nDofPad}} ; nx++){
......@@ -276,7 +289,7 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral{{nameSuffix}}(
}
{% else %}
//fuse n and x
{{ m.matmul('flux_y_sck_vect', 'lFhi', 'negativeDudxT_by_dx', 'lQi_next', '0', '0', idx(z,0,0,0)) | indent(6) }}{##}
{{ m.matmul_prefetch('flux_y_sck_vect', 'lFhi', 'negativeDudxT_by_dx', 'lQi_next', '0', '0', idx(z,0,0,0), '0', '0', idx('(z+1)',0,0,0)) | indent(6) }}{##}
{% endif %}{# useMaterialParam #}
}
{% endif %}{# useFlux #}
......@@ -284,11 +297,14 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral{{nameSuffix}}(
//ncp in y
// y direction (independent from the x and z derivatives)
for (int z = 0; z < {{nDof3D}}; z++) { //fuse n and x
{{ m.matmul('gradQ_y_sck_vect', 'lQi', 'dudxT_by_dx', 'gradQ', idx(z,0,0,0), '0', idx(z,0,0,0)) | indent(6) }}{##}
{{ m.matmul_prefetch('gradQ_y_sck_vect', 'lQi', 'dudxT_by_dx', 'gradQ', idx(z,0,0,0), '0', idx(z,0,0,0), idx('(z+1)',0,0,0), '0', idx('(z+1)',0,0,0)) | indent(6) }}{##}
}
//call ncp in y
for (int yz = 0; yz < {{nDof*nDof3D}}; yz++) {
{{m.prefetchPDEInput('lQi', PDEChunkSize, idx(0,'(yz+1)',0,0)) | indent(6) }}
{% if nPar != 0 %}{{m.prefetchPDEInput('lPi', PDEChunkSize_Par, idxLPi(0,'(yz+1)',0,0)) | indent(6) }}{%endif%}
{{m.prefetchPDEOutput('gradQ', PDEChunkSize, idx(0,'(yz+1)',0,0)) | indent(6) }}
solver.{{solverName}}::nonConservativeProduct_y_vect(lQi+{{idx(0,yz,0,0)}}, {% if nPar != 0 %}lPi+{{idxLPi(0,yz,0,0)}}{% else %}nullptr{%endif%}, gradQ+{{idx(0,yz,0,0)}}, lFhi);
#pragma omp simd aligned(lQi_next,lFhi:ALIGNMENT)
for (int nx = 0; nx < {{nVar*nDofPad}}; nx++) {
......@@ -301,12 +317,17 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral{{nameSuffix}}(
//call flux in z
for (int y = 0; y < {{nDof}}; y++){
for (int z = 0; z < {{nDof}}; z++) {
{{m.prefetchPDEInput('lQi', PDEChunkSize, idx('(z+1)',y,0,0)) | indent(8) }}
{% if nPar != 0 %}{{m.prefetchPDEInput('lPi', PDEChunkSize_Par, idxLPi('(z+1)',y,0,0)) | indent(8) }}{%endif%}
{{m.prefetchPDEOutput('lFhi', PDEChunkSize, '(z+1)*'~nDofPad*nVar) | indent(8) }}
solver.{{solverName}}::flux_z_vect(lQi+{{idx(z,y,0,0)}}, {% if nPar != 0 %}lPi+{{idxLPi(z,y,0,0)}}{% else %}nullptr{%endif%}, lFhi+z*{{nDofPad*nVar}}); //lFhi[Z][N][X]
}
{% if useMaterialParam %}
// store M*rhs in gradQ [z][n][x], fuse n and x
{{ m.matmul('flux_y_or_z_sck_vect', 'lFhi', 'negativeDudxT_by_dx', 'gradQ', '0', '0', '0') | indent(6) }}{##}
for (int z = 0; z < {{nDof}} ; z++){
{{m.prefetchPDEInput('lPi', PDEChunkSize_Par, idxLPi('(z+1)',y,0,0)) | indent(8) }}
{{m.prefetchPDEOutput('gradQ', PDEChunkSize, '(z+1)*'~nDofPad*nVar) | indent(8) }}
solver.{{solverName}}::multiplyMaterialParameterMatrix_vect(lPi+{{idxLPi(z,y,0,0)}}, gradQ+z*{{nDofPad*nVar}});
#pragma omp simd aligned(lQi_next,gradQ:ALIGNMENT)
for (int nx = 0; nx < {{nVar*nDofPad}} ; nx++){
......@@ -315,7 +336,7 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral{{nameSuffix}}(
}
{% else %}
//fuse n and x
{{ m.matmul('flux_z_sck_vect', 'lFhi', 'negativeDudxT_by_dx', 'lQi_next', '0', '0', idx(0,y,0,0)) | indent(6) }}{##}
{{ m.matmul_prefetch('flux_z_sck_vect', 'lFhi', 'negativeDudxT_by_dx', 'lQi_next', '0', '0', idx(0,y,0,0), '0', '0', idx(0,'(y+1)',0,0)) | indent(6) }}{##}
{% endif %}{# useMaterialParam #}
}
{% endif %}{# useFlux #}
......@@ -327,6 +348,9 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral{{nameSuffix}}(
//call ncp in z
for (int yz = 0; yz < {{nDof*nDof3D}}; yz++) {
{{m.prefetchPDEInput('lQi', PDEChunkSize, idx(0,'(yz+1)',0,0)) | indent(6) }}
{% if nPar != 0 %}{{m.prefetchPDEInput('lPi', PDEChunkSize_Par, idxLPi(0,'(yz+1)',0,0)) | indent(6) }}{%endif%}
{{m.prefetchPDEOutput('gradQ', PDEChunkSize, idx(0,'(yz+1)',0,0)) | indent(6) }}
solver.{{solverName}}::nonConservativeProduct_z_vect(lQi+{{idx(0,yz,0,0)}}, {% if nPar != 0 %}lPi+{{idxLPi(0,yz,0,0)}}{% else %}nullptr{%endif%}, gradQ+{{idx(0,yz,0,0)}}, lFhi);
#pragma omp simd aligned(lQi_next,lFhi:ALIGNMENT)
for (int nx = 0; nx < {{nVar*nDofPad}}; nx++) {
......@@ -337,6 +361,8 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral{{nameSuffix}}(
{% if useSource %}
//call source
for (int yz = 0; yz < {{nDof*nDof3D}}; yz++) {
{{m.prefetchPDEInput('lQi', PDEChunkSize, idx(0,'(yz+1)',0,0)) | indent(6) }}
{% if nPar != 0 %}{{m.prefetchPDEInput('lPi', PDEChunkSize_Par, idxLPi(0,'(yz+1)',0,0)) | indent(6) }}{%endif%}
solver.{{solverName}}::algebraicSource_vect(lQi+{{idx(0,yz,0,0)}}, {% if nPar != 0 %}lPi+{{idxLPi(0,yz,0,0)}}{% else %}nullptr{%endif%}, lFhi, center, tStep);
#pragma omp simd aligned(lQi_next,lFhi:ALIGNMENT)
for (int nx = 0; nx < {{nVar*nDofPad}}; nx++) {
......@@ -399,9 +425,9 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral{{nameSuffix}}(
#pragma omp simd aligned(lQhbnd,lPi:ALIGNMENT)
for (int n = 0; n < {{nPar}}; n++) {
// left
lQhbnd[{{idxLQhbnd(0,0,yz,"n+"~nVar)}}] += lPi[{{idxLPi(0,yz,n,x)}}] * FLCoeff[x];
lQhbnd[{{idxLQhbnd(0,0,yz,'n+'~nVar)}}] += lPi[{{idxLPi(0,yz,n,x)}}] * FLCoeff[x];
// right
lQhbnd[{{idxLQhbnd(1,0,yz,"n+"~nVar)}}] += lPi[{{idxLPi(0,yz,n,x)}}] * FRCoeff[x];
lQhbnd[{{idxLQhbnd(1,0,yz,'n+'~nVar)}}] += lPi[{{idxLPi(0,yz,n,x)}}] * FRCoeff[x];
}
{% endif %}
}
......@@ -435,8 +461,8 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral{{nameSuffix}}(
// right
tmpR += lPi[{{idxLPi(0,yz,n,x)}}] * FRCoeff[x];
}
lQhbnd[{{idxLQhbnd(0,0,yz,"n+"~nVar)}}] = tmpL;
lQhbnd[{{idxLQhbnd(1,0,yz,"n+"~nVar)}}] = tmpR;
lQhbnd[{{idxLQhbnd(0,0,yz,'n+'~nVar)}}] = tmpL;
lQhbnd[{{idxLQhbnd(1,0,yz,'n+'~nVar)}}] = tmpR;
{% endif %}
}
}
......@@ -457,9 +483,9 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral{{nameSuffix}}(
#pragma omp simd aligned(lQhbnd,lPi:ALIGNMENT)
for (int n = 0; n < {{nPar}}; n++) {
// left
lQhbnd[{{idxLQhbnd(2,z,x,"n+"~nVar)}}] += lPi[{{idxLPi(z,y,n,x)}}] * FLCoeff[y];
lQhbnd[{{idxLQhbnd(2,z,x,'n+'~nVar)}}] += lPi[{{idxLPi(z,y,n,x)}}] * FLCoeff[y];
// right
lQhbnd[{{idxLQhbnd(3,z,x,"n+"~nVar)}}] += lPi[{{idxLPi(z,y,n,x)}}] * FRCoeff[y];
lQhbnd[{{idxLQhbnd(3,z,x,'n+'~nVar)}}] += lPi[{{idxLPi(z,y,n,x)}}] * FRCoeff[y];
}
{% endif %}
}
......@@ -482,9 +508,9 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral{{nameSuffix}}(
#pragma omp simd aligned(lQhbnd,lPi:ALIGNMENT)
for (int n = 0; n <{{nPar}}; n++) {
// left
lQhbnd[{{idxLQhbnd(4,y,x,"n+"~nVar)}}] += lPi[{{idxLPi(z,y,n,x)}}] * FLCoeff[z];
lQhbnd[{{idxLQhbnd(4,y,x,'n+'~nVar)}}] += lPi[{{idxLPi(z,y,n,x)}}] * FLCoeff[z];
// right
lQhbnd[{{idxLQhbnd(5,y,x,"n+"~nVar)}}] += lPi[{{idxLPi(z,y,n,x)}}] * FRCoeff[z];
lQhbnd[{{idxLQhbnd(5,y,x,'n+'~nVar)}}] += lPi[{{idxLPi(z,y,n,x)}}] * FRCoeff[z];
}
{% endif %}
}
......@@ -536,7 +562,7 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral{{nameSuffix}}(
lFhi[{{idx(0,yz,0,nx)}}] += gradQ[{{idx(0,0,0,nx)}}];
}
{% else %}
{{ m.matmul('flux_x_sck_vect', 'dudx_by_dx', 'lQi_next', 'lFhi', '0', '0', idx(0,yz,0,0)) | indent(4) }}{##}
{{ m.matmul_prefetch('flux_x_sck_vect', 'dudx_by_dx', 'lQi_next', 'lFhi', '0', '0', idx(0,yz,0,0),'0', '0', idx(0,'(yz+1)',0,0)) | indent(4) }}{##}
{% endif %}{# useMaterialParam #}
}
{% endif %}{# useFlux #}
......@@ -619,13 +645,13 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral{{nameSuffix}}(
}
{% else %}
//fuse n and x
{{ m.matmul('flux_y_sck_vect', 'lQi_next', 'dudxT_by_dx', 'lFhi', '0', '0', idx(z,0,0,0)) | indent(4) }}{##}
{{ m.matmul_prefetch('flux_y_sck_vect', 'lQi_next', 'dudxT_by_dx', 'lFhi', '0', '0', idx(z,0,0,0), '0', '0', idx('(z+1)',0,0,0)) | indent(4) }}{##}
{% endif %}{# useMaterialParam #}
}
{% endif %}{# useFlux #}
// ncp in y
for (int z = 0; z < {{nDof3D}}; z++) { //fuse n and x
{{ m.matmul('gradQ_y_sck_vect', 'lQhi', 'dudxT_by_dx', 'gradQ', idx(z,0,0,0), '0', idx(z,0,0,0)) | indent(6) }}{##}
{{ m.matmul_prefetch('gradQ_y_sck_vect', 'lQhi', 'dudxT_by_dx', 'gradQ', idx(z,0,0,0), '0', idx(z,0,0,0), idx('(z+1)',0,0,0), '0', idx('(z+1)',0,0,0)) | indent(6) }}{##}
}
//call ncp in y
......@@ -680,7 +706,7 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral{{nameSuffix}}(
}
{% else %}
//fuse n and x
{{ m.matmul('flux_z_sck_vect', 'lQi_next', 'dudxT_by_dx', 'lFhi', '0', '0', idx(0,y,0,0)) | indent(6) }}{##}
{{ m.matmul_prefetch('flux_z_sck_vect', 'lQi_next', 'dudxT_by_dx', 'lFhi', '0', '0', idx(0,y,0,0), '0', '0', idx(0,'(y+1)',0,0)) | indent(6) }}{##}
{% endif %}{# useMaterialParam #}
}
{% endif %}{# useFlux #}
......
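A note on the (yz+1)/(y+1)/(z+1) prefetches used throughout this template: on the last loop iteration they point one chunk past the region actually read or written. This is presumably intentional, since _mm_prefetch is only a hint and does not fault on out-of-range addresses, so no bound check needs to be generated.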
......@@ -60,7 +60,7 @@ _mm_free({{name}});
*/
#}
{% macro matmulInclude() %}
{% if prefetchInputs or prefetchOutputs%}
{% if prefetchLoGInputs or prefetchLoGOutputs or prefetchPDEInputs or prefetchPDEOutputs %}
// intrinsics for prefetching
#include <xmmintrin.h>
{% endif %}
......@@ -75,6 +75,24 @@ _mm_free({{name}});
{% endmacro %}
{#
/**
Prefetching macros
*/
#}
{% macro prefetchArray(array, size, offset) %}
{% for offsetLine in range(0, size, cachelineSize) %}
_mm_prefetch({{array}}+({{offset}}){% if offsetLine != 0 %}+{{offsetLine}}{% endif %}, {{prefetchLevel}});
{% endfor %}
{% endmacro %}
{% macro prefetchPDEInput(array, size, offset) %}
{% if prefetchPDEInputs %}{{prefetchArray(array, size, offset)}}{% endif %}
{% endmacro %}
{% macro prefetchPDEOutput(array, size, offset) %}
{% if prefetchPDEOutputs %}{{prefetchArray(array, size, offset)}}{% endif %}
{% endmacro %}
{#
/**
Setup matmul
*/
......
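For reference, the prefetchArray macro above unrolls one hint per cacheline; a hypothetical expansion for a chunk of 24 doubles with cachelineSize = 8 (illustrative values, wrapped in a function so the fragment is self-contained):

#include <xmmintrin.h>

// Hypothetical expansion of prefetchArray(lQi, 24, offset) with cachelineSize = 8:
// offsetLine runs over 0, 8, 16, so three hints cover the whole chunk.
inline void prefetchChunk24(const double* lQi, int offset) {
  _mm_prefetch(reinterpret_cast<const char*>(lQi + offset),      _MM_HINT_T0);
  _mm_prefetch(reinterpret_cast<const char*>(lQi + offset + 8),  _MM_HINT_T0);
  _mm_prefetch(reinterpret_cast<const char*>(lQi + offset + 16), _MM_HINT_T0);
}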
......@@ -42,9 +42,9 @@
{% else %}
{% set fpFormat = "float" %}
{% endif %}
{% set prefetchA = (prefetchInputs and (conf.prefetchInput == "A" or conf.prefetchInput == "AB")) %}
{% set prefetchB = (prefetchInputs and (conf.prefetchInput == "B" or conf.prefetchInput == "AB")) %}
{% set prefetchC = (prefetchOutputs and conf.prefetchOutput == "C") %}
{% set prefetchA = (prefetchLoGInputs and (conf.prefetchInput == "A" or conf.prefetchInput == "AB")) %}
{% set prefetchB = (prefetchLoGInputs and (conf.prefetchInput == "B" or conf.prefetchInput == "AB")) %}
{% set prefetchC = (prefetchLoGOutputs and conf.prefetchOutput == "C") %}
{# /*******************
**** Sub macros ****
********************/ #}
......@@ -98,6 +98,15 @@ _mm_prefetch({{array}}+{{offset}}{% if offsetLine != 0 %}+{{offsetLine}}{% endif
{{Cp}}_m.noalias() {{ '+' if conf.beta == 1 }}= {{ '-' if conf.alpha == -1 }}{{Ap}}_m * {{Bp}}_m {% if (useTrueB and not forceCoeffMatrix) %}* {{trueAlpha}}{% endif %};
}
#}
{% if prefetchA %}{# check global config then local config#}
{{prefetchMatrix(A, conf.M, conf.LDA, conf.K, A_next)}}{##}
{% endif %}{# prefetch A #}
{% if prefetchB %}
{{prefetchMatrix(B, conf.K, conf.LDB, conf.N, B_next)}}{##}
{% endif %}{# prefetch B #}
{% if prefetchC %}
{{prefetchMatrix(C, conf.M, conf.LDC, conf.N, C_next)}}{##}
{% endif %}{# prefetch C #}
#pragma forceinline recursive
{
new (&{{conf.baseroutinename}}_A_map) Eigen::Map<Eigen::Matrix<{{fpFormat}},{{conf.M}},{{conf.K}}>, Eigen::{{"Aligned"if conf.alignment_A == 1 else "Unaligned"}}, Eigen::OuterStride<{{conf.LDA}}> >({{A}}{% if A_shift != '0' %}+{{A_shift}}{% endif %}); //{{conf.baseroutinename}}_A_map = {{A}}
......
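The Eigen path mirrors this: when prefetchA/prefetchB/prefetchC are set, prefetchMatrix walks the next slice's operand before the current Eigen-mapped product is evaluated. A sketch of the assumed semantics of prefetchMatrix(A, conf.M, conf.LDA, conf.K, A_next) for a column-major block (names are illustrative, not the macro's internals):

#include <xmmintrin.h>

// Assumed behaviour: hint every cacheline of the next slice's nRowsUsed x nCols
// block, column by column, respecting the leading dimension ld.
inline void prefetchMatrixBlock(const double* base, int nRowsUsed, int ld,
                                int nCols, int nextOffset) {
  for (int c = 0; c < nCols; c++) {
    const double* col = base + nextOffset + c * ld;
    for (int r = 0; r < nRowsUsed; r += 8)       // 8 doubles per 64-byte line
      _mm_prefetch(reinterpret_cast<const char*>(col + r), _MM_HINT_T0);
  }
}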