Commit d40c7c0f authored by Jean-Matthieu Gallard

KernelGen - prefetch experimental

parent a8a865d9
@@ -62,12 +62,27 @@ class Configuration:
     }
     # choose the BLAS library for the matmul: "None" (= C++ loops), "Libxsmm" or "Eigen"
-    matmulLib = "Libxsmm";
-    #matmulLib = "Eigen";
-    #matmulLib = "None";
+    #matmulLib = "Libxsmm"
+    #matmulLib = "Eigen"
+    matmulLib = "None"
     # set to True to print model runtimes
-    runtimeDebug = False;
+    runtimeDebug = False
+    # prefetching settings
+    # Experimental, not supported by all kernels
+    # Uses prefetching to optimize tensor operations (prefetch the next slice of a LoG)
+    prefetching = "All" # "Inputs", "Outputs", "All"
+    prefetchLevel = "_MM_HINT_T0" # intrinsic _mm_prefetch locality hint (_MM_HINT_T0 = all levels of cache), see compiler header xmmintrin.h
+    cachelineSize = {
+        "noarch" : 8,
+        "wsm"    : 8,
+        "snb"    : 8,
+        "hsw"    : 8,
+        "knc"    : 8,
+        "knl"    : 8,
+        "skx"    : 8
+    } # sizes in doubles; CPUs usually have 64B L1 cache lines
......
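The cacheline sizes above are given in doubles: every listed architecture has 64-byte L1 cache lines, i.e. 8 doubles per line, hence the uniform value of 8. As a minimal sketch of the primitive the generated kernels build on (the helper prefetchDoubles is illustrative, not part of the generator):

    #include <xmmintrin.h> // _mm_prefetch and the _MM_HINT_* locality hints

    // Hint the hardware to pull sizeInDoubles doubles into the cache,
    // issuing one prefetch per 64-byte cache line (8 doubles).
    inline void prefetchDoubles(const double* p, int sizeInDoubles) {
      constexpr int cachelineSize = 8; // doubles per 64B cache line
      for (int i = 0; i < sizeInDoubles; i += cachelineSize) {
        _mm_prefetch(reinterpret_cast<const char*>(p + i), _MM_HINT_T0);
      }
    }

_MM_HINT_T0 requests the line in all cache levels; the weaker hints _MM_HINT_T1, _MM_HINT_T2 and _MM_HINT_NTA are available through the same header.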
@@ -70,7 +70,10 @@ class Controller:
     "useLibxsmm" : Configuration.matmulLib == "Libxsmm",
     "useEigen" : Configuration.matmulLib == "Eigen",
     "pathToLibxsmmGemmGenerator" : Configuration.pathToLibxsmmGemmGenerator,
-    "runtimeDebug" : Configuration.runtimeDebug #for debug
+    "runtimeDebug" : Configuration.runtimeDebug, #for debug
+    "prefetchInputs" : Configuration.prefetching in ["Inputs", "All"],
+    "prefetchOutputs" : Configuration.prefetching in ["Outputs", "All"],
+    "prefetchLevel" : Configuration.prefetchLevel
 }
 if self.config["kernelType"] == "aderdg":
@@ -140,6 +143,7 @@ class Controller:
     self.validateConfig(Configuration.simdWidth.keys())
     self.config["vectSize"] = Configuration.simdWidth[self.config["architecture"]] #only initialize once architecture has been validated
+    self.config["cachelineSize"] = Configuration.cachelineSize[self.config["architecture"]] #only initialize once architecture has been validated
     self.baseContext = self.generateBaseContext() # default context built from config
     self.gemmList = [] #list to store the names of all generated gemms (used for gemmsCPPModel)
......
@@ -156,12 +156,12 @@ class FusedSpaceTimePredictorVolumeIntegralModel(AbstractModelBaseClass):
 if self.context["predictorRecompute"]: # TODO JMG matmuls for gradQ, rhs and lduh are exactly the same...
     if self.context["useVectPDE"]:
         if self.context["useFlux"]:
-            self.context["matmulConfigs"]["rhs_x"] = MatmulConfig(nDofPad, nVar, nDof, nDofPad, nDofPad, nDofPad, 1, 1, 1, 1, 1, "rhs_x")
-            self.context["matmulConfigs"]["rhs_y"] = MatmulConfig(nDofPad*nVar, nDof, nDof, nDofPad*nVar, nDofPad, nDofPad*nVar, 1, 1, 1, 1, 1, "rhs_y")
+            self.context["matmulConfigs"]["rhs_x"] = MatmulConfig(nDofPad, nVar, nDof, nDofPad, nDofPad, nDofPad, 1, 1, 1, 1, 1, "rhs_x", prefetchInput="B", prefetchOutput="C")
+            self.context["matmulConfigs"]["rhs_y"] = MatmulConfig(nDofPad*nVar, nDof, nDof, nDofPad*nVar, nDofPad, nDofPad*nVar, 1, 1, 1, 1, 1, "rhs_y", prefetchInput="A", prefetchOutput="C")
             if self.context["nDim"]>=3:
                 self.context["matmulConfigs"]["rhs_z"] = MatmulConfig(nDofPad*nVar*nDof, nDof, nDof, nDofPad*nVar*nDof, nDofPad, nDofPad*nVar*nDof, 1, 1, 1, 1, 1, "rhs_z")
-            self.context["matmulConfigs"]["lduh_x"] = MatmulConfig(nDofPad, nVar, nDof, nDofPad, nDofPad, nDofPad, 1, 1, 1, 1, 1, "lduh_x")
-            self.context["matmulConfigs"]["lduh_y"] = MatmulConfig(nDofPad*nVar, nDof, nDof, nDofPad*nVar, nDofPad, nDofPad*nVar, 1, 1, 1, 1, 1, "lduh_y")
+            self.context["matmulConfigs"]["lduh_x"] = MatmulConfig(nDofPad, nVar, nDof, nDofPad, nDofPad, nDofPad, 1, 1, 1, 1, 1, "lduh_x", prefetchInput="B", prefetchOutput="C")
+            self.context["matmulConfigs"]["lduh_y"] = MatmulConfig(nDofPad*nVar, nDof, nDof, nDofPad*nVar, nDofPad, nDofPad*nVar, 1, 1, 1, 1, 1, "lduh_y", prefetchInput="A", prefetchOutput="C")
             if self.context["nDim"]>=3:
                 self.context["matmulConfigs"]["lduh_z"] = MatmulConfig(nDofPad*nVar*nDof, nDof, nDof, nDofPad*nVar*nDof, nDofPad, nDofPad*nVar*nDof, 1, 1, 1, 1, 1, "lduh_z")
         if self.context["useNCP"] or self.context['useViscousFlux']:
......
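The new keyword arguments select which GEMM operands the generated kernel prefetches for the next loop iteration. Reading the template calls further down, matmul is invoked as matmul(config, A, B, C, shifts...), so for rhs_x the operands are A = rhsCoeff_T, B = lFhi, C = rhs. An inferred summary of the choices above (presumably A is skipped when it is the small coefficient matrix that every iteration reuses, while the streamed slices are prefetched):

    // rhs_x: rhs slice += rhsCoeff_T * lFhi slice   (C = A * B)
    //   A = rhsCoeff_T : same coefficients every zy iteration, stays cached
    //   B = lFhi slice : new slice each iteration -> prefetchInput="B"
    //   C = rhs slice  : new slice each iteration -> prefetchOutput="C"
    // rhs_y: rhs slice += lFhi slice * rhsCoeff     (C = A * B)
    //   the streamed lFhi slice is now operand A    -> prefetchInput="A"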
@@ -309,12 +309,12 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral(
     // Compute the "derivatives" (contributions of the stiffness matrix)
     // x direction (independent from the y and z derivatives), note transposed n and x
     for (int zy = 0; zy < {{nDof3D*nDof}}; zy++) {
-        {{ m.matmul('rhs_x', 'rhsCoeff_T', 'lFhi', 'rhs', '0', idxLFhi(0,0,zy,0,0), idxRhs(t,0,zy,0,0)) | indent(8) }}{##}
+        {{ m.matmul_prefetch('rhs_x', 'rhsCoeff_T', 'lFhi', 'rhs', '0', idxLFhi(0,0,zy,0,0), idxRhs(t,0,zy,0,0), '0', idxLFhi(0,0,'(zy+1)',0,0), idxRhs(t,0,'(zy+1)',0,0)) | indent(8) }}{##}
     }
     // y direction (independent from the x and z derivatives), fuse nx
     for (int z = 0; z < {{nDof3D}}; z++) {
-        {{ m.matmul('rhs_y', 'lFhi', 'rhsCoeff', 'rhs', idxLFhi(1,z,0,0,0), '0', idxRhs(t,z,0,0,0)) | indent(8) }}{##}
+        {{ m.matmul_prefetch('rhs_y', 'lFhi', 'rhsCoeff', 'rhs', idxLFhi(1,z,0,0,0), '0', idxRhs(t,z,0,0,0), idxLFhi(1,'(z+1)',0,0,0), '0', idxRhs(t,'(z+1)',0,0,0)) | indent(8) }}{##}
     }
 {% if nDim==3 %}
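Compared to m.matmul, m.matmul_prefetch takes the same config, operands and shifts plus a second triple of shifts addressing the next iteration's slices ('(zy+1)', '(z+1)'), so each GEMM can fetch the data of iteration zy+1 while computing on iteration zy. A sketch of the resulting software pipeline, using the hypothetical prefetchDoubles helper from above, with gemm_rhs_x and sliceSize as stand-ins for the generated kernel and the per-slice element count:

    for (int zy = 0; zy < nDof3D * nDof; zy++) {
      // fetch iteration zy+1's slices while iteration zy computes
      prefetchDoubles(lFhi + idxLFhi(0, 0, zy + 1, 0, 0), sliceSize); // input  B
      prefetchDoubles(rhs  + idxRhs(t, 0, zy + 1, 0, 0),  sliceSize); // output C
      gemm_rhs_x(rhsCoeff_T,                      // A, reused every iteration
                 lFhi + idxLFhi(0, 0, zy, 0, 0),  // B, current slice
                 rhs  + idxRhs(t, 0, zy, 0, 0));  // C, current slice
    }

Note that the final iteration prefetches one slice past the end of the data; x86 prefetch hints never fault, so the worst case is one useless prefetch.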
@@ -792,7 +792,7 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral(
   // Assume equispaced mesh, dx[0] == dx[1] == dx[2]
   //x, note transposed n and x
   for (int zy = 0; zy < {{nDof3D*nDof}}; zy++) {
-    {{ m.matmul('lduh_x', 'coeffVolume_T', 'lFhi', 'lQi', '0', idxLFhi(0,0,zy,0,0), idxLQhi(0,zy,0,0)) | indent(4) }}{##}
+    {{ m.matmul_prefetch('lduh_x', 'coeffVolume_T', 'lFhi', 'lQi', '0', idxLFhi(0,0,zy,0,0), idxLQhi(0,zy,0,0), '0', idxLFhi(0,0,'(zy+1)',0,0), idxLQhi(0,'(zy+1)',0,0)) | indent(4) }}{##}
   }
   double coeffVolume[{{nDof*nDofPad}}] __attribute__((aligned(ALIGNMENT)));
@@ -810,7 +810,7 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral(
   //y, fuse nx
   for (int z = 0; z < {{nDof3D}}; z++) {
-    {{ m.matmul('lduh_y', 'lFhi', 'coeffVolume', 'lQi', idxLFhi(1,z,0,0,0), '0', idxLQhi(z,0,0,0)) | indent(4) }}{##}
+    {{ m.matmul_prefetch('lduh_y', 'lFhi', 'coeffVolume', 'lQi', idxLFhi(1,z,0,0,0), '0', idxLQhi(z,0,0,0), idxLFhi(1,'(z+1)',0,0,0), '0', idxLQhi('(z+1)',0,0,0)) | indent(4) }}{##}
   }
 {% if nDim == 3 %}
......
@@ -60,6 +60,10 @@ _mm_free({{name}});
 */
 #}
 {% macro matmulInclude() %}
+{% if prefetchInputs or prefetchOutputs %}
+// intrinsics for prefetching
+#include <xmmintrin.h>
+{% endif %}
 {% if useEigen %}
 // include Eigen for matmul
 #include <{{pathToOptKernel}}/Eigen/Dense>
......
@@ -42,6 +42,23 @@
 {% else %}
 {% set fpFormat = "float" %}
 {% endif %}
+{% set prefetchA = (prefetchInputs and (conf.prefetchInput == "A" or conf.prefetchInput == "AB")) %}
+{% set prefetchB = (prefetchInputs and (conf.prefetchInput == "B" or conf.prefetchInput == "AB")) %}
+{% set prefetchC = (prefetchOutputs and conf.prefetchOutput == "C") %}
+{# /*******************
+**** Sub macros ****
+********************/ #}
+{% macro prefetchArray(array, size, offset=0) %}
+{% for offsetLine in range(0, size, cachelineSize) %}
+_mm_prefetch({{array}}+{{offset}}{% if offsetLine != 0 %}+{{offsetLine}}{% endif %}, {{prefetchLevel}});
+{% endfor %}
+{% endmacro %}{##}
+{% macro prefetchMatrix(matrix, fastDim, fastDimPadded, slowDim, offset) %}
+{# fastDimPadded = fastDim plus any padding (e.g. LDA vs M for A in a matmul) #}
+{% for slowDimTraversal in range(slowDim) %}
+{{prefetchArray(matrix, fastDim, offset=slowDimTraversal~'*'~fastDimPadded~'+'~offset)}}{##}
+{% endfor %}
+{% endmacro %}{##}
 {# /********************
 **** Subtemplate ****
 *********************/ #}
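To make the two macros concrete: with cachelineSize = 8 and prefetchLevel = "_MM_HINT_T0" as configured above, a call like prefetchMatrix('B', 16, 24, 2, 'b_next') — a matrix of 2 columns of 16 doubles each, padded to a leading dimension of 24, with b_next an illustrative offset expression — would expand to:

    _mm_prefetch(B+0*24+b_next, _MM_HINT_T0);
    _mm_prefetch(B+0*24+b_next+8, _MM_HINT_T0);
    _mm_prefetch(B+1*24+b_next, _MM_HINT_T0);
    _mm_prefetch(B+1*24+b_next+8, _MM_HINT_T0);

i.e. two cache lines per column, skipping the padding doubles between columns.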
@@ -52,6 +69,15 @@
 #}
 {% if useLibxsmm %}
+{% if prefetchA %}{# check the global config, then the local config #}
+{{prefetchMatrix(A, conf.M, conf.LDA, conf.K, A_next)}}{##}
+{% endif %}{# prefetch A #}
+{% if prefetchB %}
+{{prefetchMatrix(B, conf.K, conf.LDB, conf.N, B_next)}}{##}
+{% endif %}{# prefetch B #}
+{% if prefetchC %}
+{{prefetchMatrix(C, conf.M, conf.LDC, conf.N, C_next)}}{##}
+{% endif %}{# prefetch C #}
 #ifdef USE_IPO
 #pragma forceinline
 #endif
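In the Libxsmm path the GEMM is one opaque call into generated assembly, so the prefetches cannot be interleaved with the multiplication; the template instead emits them all before the call (the macro loops are unrolled at generation time). Conceptually, with the illustrative prefetchDoubles helper and gemm_generated standing in for the libxsmm kernel:

    // prefetchMatrix(B, K, LDB, N, B_next): for each of the N padded
    // columns of the next B slice, prefetch its first K doubles
    for (int n = 0; n < N; n++) {
      prefetchDoubles(B + B_next + n * LDB, K);
    }
    gemm_generated(A + A_shift, B + B_shift, C + C_shift);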
@@ -84,6 +110,8 @@
 // No BLAS case
 //-------------
+// for prefetching: when possible, weave the prefetches into the loop
 #}
 {% else %}{# no BLAS library #}
 {% if conf.beta == 0 %}
@@ -95,9 +123,21 @@ for (int it_1 = 0; it_1 < {{conf.N}}; it_1++) {
   }
 }
 {% endif %}
+{% if prefetchA and conf.K != conf.N %}{# check the global config, then the local config #}
+{{prefetchMatrix(A, conf.M, conf.LDA, conf.K, A_next)}}{##}
+{% endif %}{# prefetch all of A up front if K != N #}
 for (int it_1 = 0; it_1 < {{conf.N}}; it_1++) {
+{% if prefetchA and conf.K == conf.N %}
+  {{prefetchArray(A, conf.M, offset='it_1*'~conf.LDA~'+'~A_next) | indent(2) }}{##}
+{% endif %}{# prefetch one line of A per iteration if K == N (usually K == N in these matmuls) #}
+{% if prefetchB %}
+  {{prefetchArray(B, conf.K, offset='it_1*'~conf.LDB~'+'~B_next) | indent(2) }}{##}
+{% endif %}{# prefetch B #}
+{% if prefetchC %}
+  {{prefetchArray(C, conf.M, offset='it_1*'~conf.LDC~'+'~C_next) | indent(2) }}{##}
+{% endif %}{# prefetch C #}
   for (int it_2 = 0; it_2 < {{conf.K}}; it_2++) {
-#pragma omp simd aligned({{Cp}},{{Ap}},{{trueBp}}:ALIGNMENT)
+#pragma omp simd aligned({{Cp}},{{Ap}},{{Bp}}:ALIGNMENT)
     for (int it_3 = 0; it_3 < {{conf.M}}; it_3++) {
       {{C}}[{% if C_shift != '0' %}{{C_shift}}+{% endif %}it_1*{{conf.LDC}}+it_3] {{ '+' if conf.alpha == 1 else '-' }}= {{A}}[{% if A_shift != '0' %}{{A_shift}}+{% endif %}it_2*{{conf.LDA}}+it_3] * {{B}}[{% if B_shift != '0' %}{{B_shift}}+{% endif %}it_1*{{conf.LDB}}+it_2];
     }
......
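Expanded, the no-BLAS path with prefetching enabled looks roughly as follows — a sketch assuming alpha = 1, beta = 1, K == N (so one line of A is fetched per it_1 iteration), zero shifts, and symbolic M/N/K where the generator emits literal constants:

    for (int it_1 = 0; it_1 < N; it_1++) {
      // fetch the it_1-th lines of the *next* slices while this one computes
      prefetchDoubles(A + A_next + it_1 * LDA, M);
      prefetchDoubles(B + B_next + it_1 * LDB, K);
      prefetchDoubles(C + C_next + it_1 * LDC, M);
      for (int it_2 = 0; it_2 < K; it_2++) {
        #pragma omp simd aligned(C, A, B : ALIGNMENT)
        for (int it_3 = 0; it_3 < M; it_3++) {
          C[it_1 * LDC + it_3] += A[it_2 * LDA + it_3] * B[it_1 * LDB + it_2];
        }
      }
    }

Spreading the prefetches over the it_1 iterations keeps each one close to the loads that will consume the line, instead of issuing them all up front as the Libxsmm path must.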