Commit a8a865d9 authored by Jean-Matthieu Gallard

KernelGen - update matmul macro

parent 83003c8a
......@@ -274,8 +274,9 @@ class Controller:
for matmul in matmulConfigList:
# add the gemm name to the list of generated gemms
self.gemmList.append((matmul.baseroutinename, matmul.precision))
# for plain assembly code (rather than inline assembly) choose dense_asm
commandLineArguments = " " + "dense" + \
prefetching = "nopf" # No native prefetching supported!
type = "dense" # for plain assembly code (rather than inline assembly) choose dense_asm
commandLineArguments = " " + type + \
" " + os.path.join(self.config["pathToOutputDirectory"], outputFileName) + \
" " + self.config["codeNamespace"] + "::" + matmul.baseroutinename + \
" " + str(matmul.M) + \
......@@ -289,8 +290,9 @@ class Controller:
" " + str(matmul.alignment_A) + \
" " + str(matmul.alignment_C) + \
" " + self.config["architecture"] + \
" " + matmul.prefetchStrategy + \
" " + prefecthing + \
" " + matmul.precision
bashCommand = self.config["pathToLibxsmmGemmGenerator"] + commandLineArguments
subprocess.call(bashCommand.split())
......
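For reference, a minimal Python sketch of the generator invocation this loop assembles. All values and paths below are illustrative assumptions, not taken from the repository, and the arguments elided in the excerpt above are elided here too; only the argument order mirrors the code:

import subprocess

# Hypothetical stand-ins for self.config[...] and one matmul config:
pathToLibxsmmGemmGenerator = "dependencies/libxsmm/bin/libxsmm_gemm_generator"
outputFile = "kernels/aderdg/gemmsCPP.cpp"        # pathToOutputDirectory + outputFileName
routine = "kernels::aderdg::gemm_8_8_8_face_Q_x"  # codeNamespace + "::" + matmul.baseroutinename

prefetching = "nopf"  # no native prefetching supported
gemmType = "dense"    # called "type" in the code; "dense_asm" for plain assembly
commandLineArguments = " ".join([
    gemmType, outputFile, routine,
    "8", "8", "8",        # M, N, K (illustrative sizes)
    # ... leading-dimension, alpha and beta arguments elided, as in the excerpt ...
    "1", "1",             # alignment_A, alignment_C
    "hsw",                # architecture (illustrative)
    prefetching,
    "DP",                 # precision
])
bashCommand = pathToLibxsmmGemmGenerator + " " + commandLineArguments
subprocess.call(bashCommand.split())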
......@@ -50,17 +50,17 @@ class AMRRoutinesModel(AbstractModelBaseClass):
# Always overwrite the input (no need to set it to 0), except for the *_add variants
# nDim-1 face projection, inputs are padded
self.context["matmulConfigs"]["face_Q_x"] = MatmulConfig(nDataPad, nDof, nDof, nDataPad, nDofPad, nDataPad, 1, 0, 1, 1, "face_Q_x", "nopf", "gemm")
self.context["matmulConfigs"]["face_F_x"] = MatmulConfig(nVarPad , nDof, nDof, nVarPad , nDofPad, nVarPad , 1, 0, 1, 1, "face_F_x", "nopf", "gemm")
self.context["matmulConfigs"]["face_Q_x"] = MatmulConfig(nDataPad, nDof, nDof, nDataPad, nDofPad, nDataPad, 1, 0, 1, 1, 1, "face_Q_x")
self.context["matmulConfigs"]["face_F_x"] = MatmulConfig(nVarPad , nDof, nDof, nVarPad , nDofPad, nVarPad , 1, 0, 1, 1, 1, "face_F_x")
if(nDim == 3):
self.context["matmulConfigs"]["face_Q_y"] = MatmulConfig(nDataPad, nDof, nDof, nDataPad*nDof, nDofPad, nDataPad*nDof, 1, 0, 1, 1, "face_Q_y", "nopf", "gemm")
self.context["matmulConfigs"]["face_F_y"] = MatmulConfig(nVarPad , nDof, nDof, nVarPad*nDof , nDofPad, nVarPad*nDof , 1, 0, 1, 1, "face_F_y", "nopf", "gemm")
self.context["matmulConfigs"]["face_Q_y"] = MatmulConfig(nDataPad, nDof, nDof, nDataPad*nDof, nDofPad, nDataPad*nDof, 1, 0, 1, 1, 1, "face_Q_y")
self.context["matmulConfigs"]["face_F_y"] = MatmulConfig(nVarPad , nDof, nDof, nVarPad*nDof , nDofPad, nVarPad*nDof , 1, 0, 1, 1, 1, "face_F_y")
# nDim volume projection, luh (input/output) is not padded
self.context["matmulConfigs"]["volume_x"] = MatmulConfig(nData , nDof, nDof, nData , nDofPad, nDataPad , 1, 0, 0, 1, "volume_x", "nopf", "gemm") # input slice not aligned
self.context["matmulConfigs"]["volume_x"] = MatmulConfig(nData , nDof, nDof, nData , nDofPad, nDataPad , 1, 0, 0, 1, 1, "volume_x") # input slice not aligned
if(nDim==3):
self.context["matmulConfigs"]["volume_y"] = MatmulConfig(nDataPad, nDof, nDof, nDataPad*nDof , nDofPad, nDataPad*nDof, 1, 0, 1, 1, "volume_y", "nopf", "gemm")
self.context["matmulConfigs"]["volume_z"] = MatmulConfig(nData , nDof, nDof, nDataPad*nDof2, nDofPad, nData*nDof2 , 1, 0, 1, 0, "volume_z", "nopf", "gemm") # output slice not aligned
self.context["matmulConfigs"]["volume_z_add"] = MatmulConfig(nData, nDof, nDof, nDataPad*nDof2, nDofPad, nData*nDof2 , 1, 1, 1, 0, "volume_z_add", "nopf", "gemm") # output slice not aligned, add to result
self.context["matmulConfigs"]["volume_y"] = MatmulConfig(nDataPad, nDof, nDof, nDataPad*nDof , nDofPad, nDataPad*nDof, 1, 0, 1, 1, 1, "volume_y")
self.context["matmulConfigs"]["volume_z"] = MatmulConfig(nData , nDof, nDof, nDataPad*nDof2, nDofPad, nData*nDof2 , 1, 0, 1, 1, 0, "volume_z") # output slice not aligned
self.context["matmulConfigs"]["volume_z_add"] = MatmulConfig(nData, nDof, nDof, nDataPad*nDof2, nDofPad, nData*nDof2 , 1, 1, 1, 1, 0, "volume_z_add") # output slice not aligned, add to result
else:
self.context["matmulConfigs"]["volume_y"] = MatmulConfig(nData , nDof, nDof, nDataPad*nDof , nDofPad, nData*nDof , 1, 0, 1, 0, "volume_y", "nopf", "gemm") # output slice not aligned
self.context["matmulConfigs"]["volume_y_add"] = MatmulConfig(nData, nDof, nDof, nDataPad*nDof , nDofPad, nData*nDof , 1, 1, 1, 0, "volume_y_add", "nopf", "gemm") # output slice not aligned, add to result
self.context["matmulConfigs"]["volume_y"] = MatmulConfig(nData , nDof, nDof, nDataPad*nDof , nDofPad, nData*nDof , 1, 0, 1, 1, 0, "volume_y") # output slice not aligned
self.context["matmulConfigs"]["volume_y_add"] = MatmulConfig(nData, nDof, nDof, nDataPad*nDof , nDofPad, nData*nDof , 1, 1, 1, 1, 0, "volume_y_add") # output slice not aligned, add to result
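For orientation, the updated constructor's positional layout, shown with keyword arguments for the face_Q_x configuration above (a sketch; the import path and concrete sizes are illustrative, the signature is the one from MatmulConfig below):

from matmulConfig import MatmulConfig  # hypothetical import path

nDof, nDofPad, nDataPad = 6, 8, 8  # illustrative sizes

face_Q_x = MatmulConfig(
    M=nDataPad, N=nDof, K=nDof,
    LDA=nDataPad, LDB=nDofPad, LDC=nDataPad,
    alpha=1, beta=0,
    alignment_A=1, alignment_B=1, alignment_C=1,  # alignment_B is the newly added flag
    name="face_Q_x",
)
# prefetchInput/prefetchOutput default to "" and precision to "DP";
# the old trailing "nopf"/"gemm" arguments are gone.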
......@@ -53,31 +53,31 @@ class LimiterModel(AbstractModelBaseClass):
# Always overwrite the input (no need to set it to 0)
# Project to FV
self.context["matmulConfigs"]["dg2fv_x"] = MatmulConfig(nData , nDofLim, nDof, nData , nDofPad, nDataPad , 1, 0, 0, 1, "dg2fv_x", "nopf", "gemm") # input slice not aligned
self.context["matmulConfigs"]["dg2fv_x"] = MatmulConfig(nData , nDofLim, nDof, nData , nDofPad, nDataPad , 1, 0, 0, 1, 1, "dg2fv_x") # input slice not aligned
if(nDim==3):
self.context["matmulConfigs"]["dg2fv_y"] = MatmulConfig(nDataPad, nDofLim, nDof, nDofLim*nDataPad , nDofPad, nDofLim*nDataPad , 1, 0, 1, 1, "dg2fv_y", "nopf", "gemm") #M is padded in both input and output
self.context["matmulConfigs"]["dg2fv_z"] = MatmulConfig(nData , nDofLim, nDof, nDofLim2*nDataPad, nDofPad, nDofLim2*nData , 1, 0, 1, 0, "dg2fv_z", "nopf", "gemm") # output slice not aligned
self.context["matmulConfigs"]["dg2fv_y"] = MatmulConfig(nDataPad, nDofLim, nDof, nDofLim*nDataPad , nDofPad, nDofLim*nDataPad , 1, 0, 1, 1, 1, "dg2fv_y") # M is padded in both input and output
self.context["matmulConfigs"]["dg2fv_z"] = MatmulConfig(nData , nDofLim, nDof, nDofLim2*nDataPad, nDofPad, nDofLim2*nData , 1, 0, 1, 1, 0, "dg2fv_z") # output slice not aligned
else:
self.context["matmulConfigs"]["dg2fv_y"] = MatmulConfig(nData , nDofLim, nDof, nDofLim*nDataPad , nDofPad, nDofLim*nData , 1, 0, 1, 0, "dg2fv_y", "nopf", "gemm") # output slice not aligned
self.context["matmulConfigs"]["dg2fv_y"] = MatmulConfig(nData , nDofLim, nDof, nDofLim*nDataPad , nDofPad, nDofLim*nData , 1, 0, 1, 1, 0, "dg2fv_y") # output slice not aligned
# Project to DG
self.context["matmulConfigs"]["fv2dg_x"] = MatmulConfig(nData , nDof, nDofLim, nData , nDofLimPad, nDataPad , 1, 0, 0, 1, "fv2dg_x", "nopf", "gemm") # input slice not aligned
self.context["matmulConfigs"]["fv2dg_x"] = MatmulConfig(nData , nDof, nDofLim, nData , nDofLimPad, nDataPad , 1, 0, 0, 1, 1, "fv2dg_x") # input slice not aligned
if(nDim==3):
self.context["matmulConfigs"]["fv2dg_y"] = MatmulConfig(nDataPad, nDof, nDofLim, nDof*nDataPad , nDofLimPad, nDof*nDataPad , 1, 0, 1, 1, "fv2dg_y", "nopf", "gemm") #M is padded in both input and output
self.context["matmulConfigs"]["fv2dg_z"] = MatmulConfig(nData , nDof, nDofLim, nDof2*nDataPad, nDofLimPad, nDof2*nData , 1, 0, 1, 0, "fv2dg_z", "nopf", "gemm") # output slice not aligned
self.context["matmulConfigs"]["fv2dg_y"] = MatmulConfig(nDataPad, nDof, nDofLim, nDof*nDataPad , nDofLimPad, nDof*nDataPad , 1, 0, 1, 1, 1, "fv2dg_y") # M is padded in both input and output
self.context["matmulConfigs"]["fv2dg_z"] = MatmulConfig(nData , nDof, nDofLim, nDof2*nDataPad, nDofLimPad, nDof2*nData , 1, 0, 1, 1, 0, "fv2dg_z") # output slice not aligned
else:
self.context["matmulConfigs"]["fv2dg_y"] = MatmulConfig(nData , nDof, nDofLim, nDof*nDataPad , nDofLimPad, nDof*nData , 1, 0, 1, 0, "fv2dg_y", "nopf", "gemm") # output slice not aligned
self.context["matmulConfigs"]["fv2dg_y"] = MatmulConfig(nData , nDof, nDofLim, nDof*nDataPad , nDofLimPad, nDof*nData , 1, 0, 1, 1, 0, "fv2dg_y") # output slice not aligned
# Project to Lobatto for Min/Max
self.context["matmulConfigs"]["uh2lob_x"] = MatmulConfig(nData , nDof, nDof, nData , nDofPad, nDataPad , 1, 0, 0, 1, "uh2lob_x", "nopf", "gemm") # input slice not aligned
self.context["matmulConfigs"]["uh2lob_x"] = MatmulConfig(nData , nDof, nDof, nData , nDofPad, nDataPad , 1, 0, 0, 1, 1, "uh2lob_x") # input slice not aligned
if(nDim==3):
self.context["matmulConfigs"]["uh2lob_y"] = MatmulConfig(nDataPad, nDof, nDof, nDof*nDataPad , nDofPad, nDof*nDataPad , 1, 0, 1, 1, "uh2lob_y", "nopf", "gemm") #M is padded in both input and output
self.context["matmulConfigs"]["uh2lob_z_slice"] = MatmulConfig(nDataPad, nDof, nDof, nDof2*nDataPad, nDofPad, nDataPad , 1, 0, 1, 1, "uh2lob_z_slice", "nopf", "gemm") # will only write a slice, overwrite
self.context["matmulConfigs"]["uh2lob_y"] = MatmulConfig(nDataPad, nDof, nDof, nDof*nDataPad , nDofPad, nDof*nDataPad , 1, 0, 1, 1, 1, "uh2lob_y") # M is padded in both input and output
self.context["matmulConfigs"]["uh2lob_z_slice"] = MatmulConfig(nDataPad, nDof, nDof, nDof2*nDataPad, nDofPad, nDataPad , 1, 0, 1, 1, 1, "uh2lob_z_slice") # will only write a slice, overwrite
else:
self.context["matmulConfigs"]["uh2lob_y_slice"] = MatmulConfig(nDataPad, nDof, nDof, nDof*nDataPad , nDofPad, nDataPad , 1, 0, 1, 1, "uh2lob_y_slice", "nopf", "gemm") # will only write a slice, overwrite
self.context["matmulConfigs"]["uh2lob_y_slice"] = MatmulConfig(nDataPad, nDof, nDof, nDof*nDataPad , nDofPad, nDataPad , 1, 0, 1, 1, 1, "uh2lob_y_slice") # will only write a slice, overwrite
# Project to FV for Min/Max, reuse the previous gemms except the last one for the slice
if(nDim==3):
self.context["matmulConfigs"]["dg2fv_z_slice"] = MatmulConfig(nDataPad , nDofLim, nDof, nDofLim2*nDataPad, nDofPad, nDataPad , 1, 0, 1, 1, "dg2fv_z_slice", "nopf", "gemm") # will only write a slice, overwrite
self.context["matmulConfigs"]["dg2fv_z_slice"] = MatmulConfig(nDataPad , nDofLim, nDof, nDofLim2*nDataPad, nDofPad, nDataPad , 1, 0, 1, 1, 1, "dg2fv_z_slice") # will only write a slice, overwrite
else:
self.context["matmulConfigs"]["dg2fv_y_slice"] = MatmulConfig(nDataPad , nDofLim, nDof, nDofLim*nDataPad , nDofPad, nDataPad , 1, 0, 1, 1, "dg2fv_y_slice", "nopf", "gemm") # will only write a slice, overwrite
self.context["matmulConfigs"]["dg2fv_y_slice"] = MatmulConfig(nDataPad , nDofLim, nDof, nDofLim*nDataPad , nDofPad, nDataPad , 1, 0, 1, 1, 1, "dg2fv_y_slice") # will only write a slice, overwrite
......@@ -269,7 +269,7 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral(
for (int z = 0; z < {{nDof3D}}; z++) {
for (int y = 0; y < {{nDof}}; y++) {
{% if useFlux %}
{{ m.matmul('rhs_x', 'lFi', 'coeffRhsX', 'rhs', '((t*'~nDof3D~'+z)*'~nDof~'+y)*'~nVarPad*nDof, '0', '((t*'~nDof3D~'+z)*'~nDof~'+y)*'~nVarPad*nDof, trueB='Kxi', trueAlpha='- weights3[t*'~nDof*nDof3D~'+z*'~nDof~'+y] * dtBydx') | indent(10) }}{##}
{{ m.matmul_legacy('rhs_x', 'lFi', 'coeffRhsX', 'rhs', '((t*'~nDof3D~'+z)*'~nDof~'+y)*'~nVarPad*nDof, '0', '((t*'~nDof3D~'+z)*'~nDof~'+y)*'~nVarPad*nDof, trueB='Kxi', trueAlpha='- weights3[t*'~nDof*nDof3D~'+z*'~nDof~'+y] * dtBydx') | indent(10) }}{##}
{% endif %}{# useFlux #}
}
}
......@@ -278,7 +278,7 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral(
for (int z = 0; z < {{nDof3D}}; z++) {
for (int x = 0; x < {{nDof}}; x++) {
{% if useFlux %}
{{ m.matmul('rhs_y', 'lFi', 'coeffRhsY', 'rhs', '((t*'~nDof3D~'+z)*'~nDof*nDof~'+x)*'~nVarPad~'+'~1*(nDof**nDim)*nDof*nVarPad, '0', '((t*'~nDof3D~'+z)*'~nDof*nDof~'+x)*'~nVarPad, trueB='Kxi', trueAlpha='- weights3[t*'~nDof*nDof3D~'+z*'~nDof~'+x] * dtBydx') | indent(10) }}{##}
{{ m.matmul_legacy('rhs_y', 'lFi', 'coeffRhsY', 'rhs', '((t*'~nDof3D~'+z)*'~nDof*nDof~'+x)*'~nVarPad~'+'~1*(nDof**nDim)*nDof*nVarPad, '0', '((t*'~nDof3D~'+z)*'~nDof*nDof~'+x)*'~nVarPad, trueB='Kxi', trueAlpha='- weights3[t*'~nDof*nDof3D~'+z*'~nDof~'+x] * dtBydx') | indent(10) }}{##}
{% endif %}{# useFlux #}
}
}
......@@ -288,7 +288,7 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral(
for (int y = 0; y < {{nDof}}; y++) {
for (int x = 0; x < {{nDof}}; x++) {
{% if useFlux %}
{{ m.matmul('rhs_z', 'lFi','coeffRhsZ', 'rhs', '((t*'~nDof*nDof~'+y)*'~nDof~'+x)*'~nVarPad~'+'~2*(nDof**nDim)*nDof*nVarPad, '0', '((t*'~nDof*nDof~'+y)*'~nDof~'+x)*'~nVarPad, trueB='Kxi', trueAlpha='- weights3[t*'~nDof*nDof3D~'+y*'~nDof~'+x] * dtBydx') | indent(10) }}{##}
{{ m.matmul_legacy('rhs_z', 'lFi','coeffRhsZ', 'rhs', '((t*'~nDof*nDof~'+y)*'~nDof~'+x)*'~nVarPad~'+'~2*(nDof**nDim)*nDof*nVarPad, '0', '((t*'~nDof*nDof~'+y)*'~nDof~'+x)*'~nVarPad, trueB='Kxi', trueAlpha='- weights3[t*'~nDof*nDof3D~'+y*'~nDof~'+x] * dtBydx') | indent(10) }}{##}
{% endif %}{# useFlux #}
}
}
......@@ -309,7 +309,7 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral(
// discrete Picard iteration
double sq_res = 0.0;
for (int xyz = 0; xyz < {{nDof**nDim}}; xyz++) {
{{ m.matmul('lqi', 'rhs', 's_m_QSlice', 'new_lQi_slice',nVarPad~'*xyz', '0', '0', trueB='iK1_T', trueAlpha='iweights3[xyz]') | indent(6) }}{##}
{{ m.matmul_legacy('lqi', 'rhs', 's_m_QSlice', 'new_lQi_slice',nVarPad~'*xyz', '0', '0', trueB='iK1_T', trueAlpha='iweights3[xyz]') | indent(6) }}{##}
for(int t = 0; t < {{nDof}}; t++) {
for(int n=0; n<{{nVar}}; n++) { //only copy and change the variables, skip parameters
sq_res += (new_lQi_slice[n+{{nVarPad}}*t] - lQi[n+{{nDataPad}}*(xyz+{{nDof**nDim}}*t)]) * (new_lQi_slice[n+{{nVarPad}}*t] - lQi[n+{{nDataPad}}*(xyz+{{nDof**nDim}}*t)]);
......@@ -595,14 +595,14 @@ int {{codeNamespace}}::fusedSpaceTimePredictorVolumeIntegral(
for (int i=0; i<{{nDof}}; i++) {
//x, also define coefficient matrix coeffVolume
{{ m.matmul('lduh_x', 'lFhi', 'coeffVolume', 'lduh', '(j*'~nDof~'+i)*'~(nVarPad*nDof)~'+'~(0*nVarPad*(nDof**nDim)), '0', '(j*'~nDof~'+i)*'~(nVarPad*nDof), trueB='Kxi_T', trueAlpha='weights2[i+j*'~nDof~'] * inverseDx', forceCoeffMatrix=True) | indent(6) }}{##}
{{ m.matmul_legacy('lduh_x', 'lFhi', 'coeffVolume', 'lduh', '(j*'~nDof~'+i)*'~(nVarPad*nDof)~'+'~(0*nVarPad*(nDof**nDim)), '0', '(j*'~nDof~'+i)*'~(nVarPad*nDof), trueB='Kxi_T', trueAlpha='weights2[i+j*'~nDof~'] * inverseDx', forceCoeffMatrix=True) | indent(6) }}{##}
//y, reuse coeffVolume
{{ m.matmul('lduh_y', 'lFhi', 'coeffVolume', 'lduh', '(j*'~nDof~'+i)*'~(nVarPad*nDof)~'+'~(1*nVarPad*(nDof**nDim)), '0', '(j*'~(nDof*nDof)~'+i)*'~nVarPad) | indent(6) }}{##}
{{ m.matmul_legacy('lduh_y', 'lFhi', 'coeffVolume', 'lduh', '(j*'~nDof~'+i)*'~(nVarPad*nDof)~'+'~(1*nVarPad*(nDof**nDim)), '0', '(j*'~(nDof*nDof)~'+i)*'~nVarPad) | indent(6) }}{##}
{% if nDim == 3 %}
//z, reuse coeffVolume
{{ m.matmul('lduh_z', 'lFhi', 'coeffVolume', 'lduh', '(j*'~nDof~'+i)*'~(nVarPad*nDof)~'+'~(2*nVarPad*(nDof**nDim)), '0', '(j*'~nDof~'+i)*'~nVarPad) | indent(6) }}{##}
{{ m.matmul_legacy('lduh_z', 'lFhi', 'coeffVolume', 'lduh', '(j*'~nDof~'+i)*'~(nVarPad*nDof)~'+'~(2*nVarPad*(nDof**nDim)), '0', '(j*'~nDof~'+i)*'~nVarPad) | indent(6) }}{##}
{% endif %}
}
......
......@@ -23,7 +23,7 @@
std::memset({{name}}, 0, sizeof({{precision}})*{{size}});
{% endif %}
{% endif %}
{% endmacro %} {##}
{% endmacro %}{##}
{#
/**
......@@ -34,7 +34,7 @@ std::memset({{name}}, 0, sizeof({{precision}})*{{size}});
{% if not tempVarsOnStack %}
_mm_free({{name}});
{% endif %}
{% endmacro %}
{% endmacro %}{##}
{#
/**
......@@ -44,7 +44,13 @@ _mm_free({{name}});
The gemm config (fetched through matmulKey) contains M, N, K, LDA, LDB, LDC, alpha and beta
*/
#}
{% macro matmul(matmulKey, A, B, C, A_shift, B_shift, C_shift, trueAlpha="", trueB="", forceCoeffMatrix=False) %}
{% macro matmul_legacy(matmulKey, A, B, C, A_shift, B_shift, C_shift, trueAlpha="", trueB="", forceCoeffMatrix=False) %}
{% include "subtemplates/matmul_legacy.template" %}
{% endmacro %}
{% macro matmul(matmulKey, A, B, C, A_shift, B_shift, C_shift) %}
{% include "subtemplates/matmul.template" %}
{% endmacro %}
{% macro matmul_prefetch(matmulKey, A, B, C, A_shift, B_shift, C_shift, A_next, B_next, C_next) %}
{% include "subtemplates/matmul.template" %}
{% endmacro %}
{#
......
......@@ -13,11 +13,7 @@
String C_shift : shift to the zero of C
optional
String trueB : true array B; B must be a true matrix, not a tensor slice
String trueAlpha : true value of the coefficient alpha (note: it is multiplied by the configuration alpha, /!\ beware of sign errors)
bool forceCoeffMatrix : only when using trueB and trueAlpha, force the no-libxsmm case to also generate the coefficient matrix
If trueB is used, a temporary array trueAlpha*trueB is generated
#}
{% with %}
{# /**************************************
......@@ -37,20 +33,10 @@
{% if overrideUseLibxsmm is not defined or overrideUseLibxsmm == "BoolNotDefined" %}
{% set overrideUseLibxsmm = useLibxsmm %}{# if no override then take the current value #}
{% endif %}
{% if trueB is not defined or trueB == "" %}
{% set trueB = B %}
{% set useTrueB = False %}
{% else %}
{% set useTrueB = True %}
{% endif %}
{% if forceCoeffMatrix is not defined %}
{% set forceCoeffMatrix = False %}
{% endif %}
{# set the arrays' names for the pragma and Eigen map by removing any trailing index #}
{% set Ap = (A.split("["))[0] %}
{% set Bp = (B.split("["))[0] %}
{% set Cp = (C.split("["))[0] %}
{% set trueBp = (trueB.split("["))[0] %}
{% if conf.precision == "DP" %}
{% set fpFormat = "double" %}
{% else %}
......@@ -66,16 +52,6 @@
#}
{% if useLibxsmm %}
{% if useTrueB %}{# will set B[it] to be trueAlpha*trueB[it] #}
{{fpFormat}} {{B}}[{{conf.LDB*conf.K}}] __attribute__((aligned(ALIGNMENT)));
#pragma omp simd aligned({{Bp}},{{trueBp}}:ALIGNMENT)
for (int it = 0; it < {{conf.LDB*conf.K}}; it++) {
{{B}}[it] = {{trueAlpha}} * {{trueB}}[it];
}
#if defined(USE_IPO) && !defined(UNSAFE_IPO)
volatile {{fpFormat}} doNotOptimizeAway_{{B}} = {{B}}[0]; //used to prevent the compiler from optimizing temp array away. Needs to be volatile
#endif
{% endif %}{# useTrueB #}
#ifdef USE_IPO
#pragma forceinline
#endif
......@@ -91,7 +67,7 @@ volatile {{fpFormat}} doNotOptimizeAway_{{B}} = {{B}}[0]; //used to prevent the
{#
{
Eigen::Map<Eigen::Matrix<{{fpFormat}},{{conf.M}},{{conf.K}}>, Eigen::{{"Aligned"if conf.alignment_A == 1 else "Unaligned"}}, Eigen::OuterStride<{{conf.LDA}}> > {{Ap}}_m({{A}}{% if A_shift != '0' %}+{{A_shift}}{% endif %}); //A = {{A}}
Eigen::Map<Eigen::Matrix<{{fpFormat}},{{conf.K}},{{conf.N}}>, Eigen::Aligned, Eigen::OuterStride<{{conf.LDB}}> > {{Bp}}_m({{trueB}}{% if B_shift != '0' %}+{{B_shift}}{% endif %}); //B = {{B}}, assume it is aligned
Eigen::Map<Eigen::Matrix<{{fpFormat}},{{conf.K}},{{conf.N}}>, Eigen::{{"Aligned"if conf.alignment_B == 1 else "Unaligned"}}, Eigen::OuterStride<{{conf.LDB}}> > {{Bp}}_m({{B}}{% if B_shift != '0' %}+{{B_shift}}{% endif %}); //B = {{B}}
Eigen::Map<Eigen::Matrix<{{fpFormat}},{{conf.M}},{{conf.N}}>, Eigen::{{"Aligned"if conf.alignment_C == 1 else "Unaligned"}}, Eigen::OuterStride<{{conf.LDC}}> > {{Cp}}_m({{C}}{% if C_shift != '0' %}+{{C_shift}}{% endif %}); //C = {{C}}
{{Cp}}_m.noalias() {{ '+' if conf.beta == 1 }}= {{ '-' if conf.alpha == -1 }}{{Ap}}_m * {{Bp}}_m {% if (useTrueB and not forceCoeffMatrix) %}* {{trueAlpha}}{% endif %};
}
......@@ -99,9 +75,9 @@ volatile {{fpFormat}} doNotOptimizeAway_{{B}} = {{B}}[0]; //used to prevent the
#pragma forceinline recursive
{
new (&{{conf.baseroutinename}}_A_map) Eigen::Map<Eigen::Matrix<{{fpFormat}},{{conf.M}},{{conf.K}}>, Eigen::{{"Aligned"if conf.alignment_A == 1 else "Unaligned"}}, Eigen::OuterStride<{{conf.LDA}}> >({{A}}{% if A_shift != '0' %}+{{A_shift}}{% endif %}); //{{conf.baseroutinename}}_A_map = {{A}}
new (&{{conf.baseroutinename}}_B_map) Eigen::Map<Eigen::Matrix<{{fpFormat}},{{conf.K}},{{conf.N}}>, Eigen::Aligned, Eigen::OuterStride<{{conf.LDB}}> >({{trueB}}{% if B_shift != '0' %}+{{B_shift}}{% endif %}); //{{conf.baseroutinename}}_B_map = {{trueB}}, assume it is aligned
new (&{{conf.baseroutinename}}_B_map) Eigen::Map<Eigen::Matrix<{{fpFormat}},{{conf.K}},{{conf.N}}>, Eigen::{{"Aligned"if conf.alignment_B == 1 else "Unaligned"}}, Eigen::OuterStride<{{conf.LDB}}> >({{B}}{% if B_shift != '0' %}+{{B_shift}}{% endif %}); //{{conf.baseroutinename}}_B_map = {{B}}, assume it is aligned
new (&{{conf.baseroutinename}}_C_map) Eigen::Map<Eigen::Matrix<{{fpFormat}},{{conf.M}},{{conf.N}}>, Eigen::{{"Aligned"if conf.alignment_C == 1 else "Unaligned"}}, Eigen::OuterStride<{{conf.LDC}}> >({{C}}{% if C_shift != '0' %}+{{C_shift}}{% endif %}); //{{conf.baseroutinename}}_C_map = {{C}}
{{conf.baseroutinename}}_C_map.noalias() {{ '+' if conf.beta == 1 }}= {{ '-1. * ' if conf.alpha == -1 }}{{conf.baseroutinename}}_A_map * {{conf.baseroutinename}}_B_map{% if (useTrueB and not forceCoeffMatrix) %} * {{trueAlpha}}{% endif %};
{{conf.baseroutinename}}_C_map.noalias() {{ '+' if conf.beta == 1 }}= {{ '-1. * ' if conf.alpha == -1 }}{{conf.baseroutinename}}_A_map * {{conf.baseroutinename}}_B_map;
}
{#
......@@ -110,14 +86,6 @@ volatile {{fpFormat}} doNotOptimizeAway_{{B}} = {{B}}[0]; //used to prevent the
#}
{% else %}{# no BLAS library #}
{% if forceCoeffMatrix %}
{{fpFormat}} {{B}}[{{conf.LDB*conf.K}}] __attribute__((aligned(ALIGNMENT)));
#pragma omp simd aligned({{Bp}},{{trueBp}}:ALIGNMENT)
for (int it = 0; it < {{conf.LDB*conf.K}}; it++) {
{{B}}[it] = {{trueAlpha}} * {{trueB}}[it];
}
{% set trueB = B %}
{% endif %}
{% if conf.beta == 0 %}
// reset {{C}}
for (int it_1 = 0; it_1 < {{conf.N}}; it_1++) {
......@@ -131,7 +99,7 @@ for (int it_1 = 0; it_1 < {{conf.N}}; it_1++) {
for (int it_2 = 0; it_2 < {{conf.K}}; it_2++) {
#pragma omp simd aligned({{Cp}},{{Ap}},{{Bp}}:ALIGNMENT)
for (int it_3 = 0; it_3 < {{conf.M}}; it_3++) {
{{C}}[{% if C_shift != '0' %}{{C_shift}}+{% endif %}it_1*{{conf.LDC}}+it_3] {{ '+' if conf.alpha == 1 else '-' }}= {{A}}[{% if A_shift != '0' %}{{A_shift}}+{% endif %}it_2*{{conf.LDA}}+it_3] * {% if (useTrueB and not forceCoeffMatrix) %}{{trueAlpha}}* {% endif %}{{trueB}}[{% if B_shift != '0' %}{{B_shift}}+{% endif %}it_1*{{conf.LDB}}+it_2];
{{C}}[{% if C_shift != '0' %}{{C_shift}}+{% endif %}it_1*{{conf.LDC}}+it_3] {{ '+' if conf.alpha == 1 else '-' }}= {{A}}[{% if A_shift != '0' %}{{A_shift}}+{% endif %}it_2*{{conf.LDA}}+it_3] * {{B}}[{% if B_shift != '0' %}{{B_shift}}+{% endif %}it_1*{{conf.LDB}}+it_2];
}
}
}
......
{#
C = alpha * A * B + beta * C
(M x N) (M x K) (K x N)
The gemm config (fetched through matmulKey) contains M, N, K, LDA, LDB, LDC, alpha and beta
See matmulConfig
String matmulKey : name of the associated config
String A : name of A
String B : name of B
String C : name of C
String A_shift : shift to the zero of A
String B_shift : shift to the zero of B
String C_shift : shift to the zero of C
optional
String trueB : true array B; B must be a true matrix, not a tensor slice
String trueAlpha : true value of the coefficient alpha (note: it is multiplied by the configuration alpha, /!\ beware of sign errors)
bool forceCoeffMatrix : only when using trueB and trueAlpha, force the no-libxsmm case to also generate the coefficient matrix
If trueB is used, a temporary array trueAlpha*trueB is generated
#}
{% with %}
{# /**************************************
**** Set up helper template values ****
**************************************/ #}
{# replace 0 with '0' #}
{% if A_shift == 0 %}
{% set A_shift = '0' %}
{% endif %}
{% if B_shift == 0 %}
{% set B_shift = '0' %}
{% endif %}
{% if C_shift == 0 %}
{% set C_shift = '0' %}
{% endif %}
{% set conf = matmulConfigs[matmulKey] %}
{% if overrideUseLibxsmm is not defined or overrideUseLibxsmm == "BoolNotDefined" %}
{% set overrideUseLibxsmm = useLibxsmm %}{# if no override then take the current value #}
{% endif %}
{% if trueB is not defined or trueB == "" %}
{% set trueB = B %}
{% set useTrueB = False %}
{% else %}
{% set useTrueB = True %}
{% endif %}
{% if forceCoeffMatrix is not defined %}
{% set forceCoeffMatrix = False %}
{% endif %}
{# set the arrays' names for the pragma and Eigen map by removing any trailing index #}
{% set Ap = (A.split("["))[0] %}
{% set Bp = (B.split("["))[0] %}
{% set Cp = (C.split("["))[0] %}
{% set trueBp = (trueB.split("["))[0] %}
{% if conf.precision == "DP" %}
{% set fpFormat = "double" %}
{% else %}
{% set fpFormat = "float" %}
{% endif %}
{# /********************
**** Subtemplate ****
*********************/ #}
{#
// LIBXSMM case
//-------------
#}
{% if useLibxsmm %}
{% if useTrueB %}{# will set B[it] to be trueAlpha*trueB[it] #}
{{fpFormat}} {{B}}[{{conf.LDB*conf.K}}] __attribute__((aligned(ALIGNMENT)));
#pragma omp simd aligned({{Bp}},{{trueBp}}:ALIGNMENT)
for (int it = 0; it < {{conf.LDB*conf.K}}; it++) {
{{B}}[it] = {{trueAlpha}} * {{trueB}}[it];
}
#if defined(USE_IPO) && !defined(UNSAFE_IPO)
volatile {{fpFormat}} doNotOptimizeAway_{{B}} = {{B}}[0]; //used to prevent the compiler from optimizing temp array away. Needs to be volatile
#endif
{% endif %}{# useTrueB #}
#ifdef USE_IPO
#pragma forceinline
#endif
{{conf.baseroutinename}}({{A}}{% if A_shift != '0' %}+{{A_shift}}{% endif %}, {{B}}{% if B_shift != '0' %}+{{B_shift}}{% endif %}, {{C}}{% if C_shift != '0' %}+{{C_shift}}{% endif %});
{#
// Eigen case
//-----------
#}
{% elif useEigen %}
{# old direct mapper #}
{#
{
Eigen::Map<Eigen::Matrix<{{fpFormat}},{{conf.M}},{{conf.K}}>, Eigen::{{"Aligned"if conf.alignment_A == 1 else "Unaligned"}}, Eigen::OuterStride<{{conf.LDA}}> > {{Ap}}_m({{A}}{% if A_shift != '0' %}+{{A_shift}}{% endif %}); //A = {{A}}
Eigen::Map<Eigen::Matrix<{{fpFormat}},{{conf.K}},{{conf.N}}>, Eigen::Aligned, Eigen::OuterStride<{{conf.LDB}}> > {{Bp}}_m({{trueB}}{% if B_shift != '0' %}+{{B_shift}}{% endif %}); //B = {{B}}, assume it is aligned
Eigen::Map<Eigen::Matrix<{{fpFormat}},{{conf.M}},{{conf.N}}>, Eigen::{{"Aligned"if conf.alignment_C == 1 else "Unaligned"}}, Eigen::OuterStride<{{conf.LDC}}> > {{Cp}}_m({{C}}{% if C_shift != '0' %}+{{C_shift}}{% endif %}); //C = {{C}}
{{Cp}}_m.noalias() {{ '+' if conf.beta == 1 }}= {{ '-' if conf.alpha == -1 }}{{Ap}}_m * {{Bp}}_m {% if (useTrueB and not forceCoeffMatrix) %}* {{trueAlpha}}{% endif %};
}
#}
#pragma forceinline recursive
{
new (&{{conf.baseroutinename}}_A_map) Eigen::Map<Eigen::Matrix<{{fpFormat}},{{conf.M}},{{conf.K}}>, Eigen::{{"Aligned"if conf.alignment_A == 1 else "Unaligned"}}, Eigen::OuterStride<{{conf.LDA}}> >({{A}}{% if A_shift != '0' %}+{{A_shift}}{% endif %}); //{{conf.baseroutinename}}_A_map = {{A}}
new (&{{conf.baseroutinename}}_B_map) Eigen::Map<Eigen::Matrix<{{fpFormat}},{{conf.K}},{{conf.N}}>, Eigen::Aligned, Eigen::OuterStride<{{conf.LDB}}> >({{trueB}}{% if B_shift != '0' %}+{{B_shift}}{% endif %}); //{{conf.baseroutinename}}_B_map = {{trueB}}, assume it is aligned
new (&{{conf.baseroutinename}}_C_map) Eigen::Map<Eigen::Matrix<{{fpFormat}},{{conf.M}},{{conf.N}}>, Eigen::{{"Aligned"if conf.alignment_C == 1 else "Unaligned"}}, Eigen::OuterStride<{{conf.LDC}}> >({{C}}{% if C_shift != '0' %}+{{C_shift}}{% endif %}); //{{conf.baseroutinename}}_C_map = {{C}}
{{conf.baseroutinename}}_C_map.noalias() {{ '+' if conf.beta == 1 }}= {{ '-1. * ' if conf.alpha == -1 }}{{conf.baseroutinename}}_A_map * {{conf.baseroutinename}}_B_map{% if (useTrueB and not forceCoeffMatrix) %} * {{trueAlpha}}{% endif %};
}
{#
// No BLAS case
//-------------
#}
{% else %}{# no BLAS library #}
{% if forceCoeffMatrix %}
{{fpFormat}} {{B}}[{{conf.LDB*conf.K}}] __attribute__((aligned(ALIGNMENT)));
#pragma omp simd aligned({{Bp}},{{trueBp}}:ALIGNMENT)
for (int it = 0; it < {{conf.LDB*conf.K}}; it++) {
{{B}}[it] = {{trueAlpha}} * {{trueB}}[it];
}
{% set trueB = B %}
{% endif %}
{% if conf.beta == 0 %}
// reset {{C}}
for (int it_1 = 0; it_1 < {{conf.N}}; it_1++) {
#pragma omp simd aligned({{Cp}}:ALIGNMENT)
for (int it_3 = 0; it_3 < {{conf.M}}; it_3++) {
{{C}}[{% if C_shift != '0' %}{{C_shift}}+{% endif %}it_1*{{conf.LDC}}+it_3] = 0.;
}
}
{% endif %}
for (int it_1 = 0; it_1 < {{conf.N}}; it_1++) {
for (int it_2 = 0; it_2 < {{conf.K}}; it_2++) {
#pragma omp simd aligned({{Cp}},{{Ap}},{{trueBp}}:ALIGNMENT)
for (int it_3 = 0; it_3 < {{conf.M}}; it_3++) {
{{C}}[{% if C_shift != '0' %}{{C_shift}}+{% endif %}it_1*{{conf.LDC}}+it_3] {{ '+' if conf.alpha == 1 else '-' }}= {{A}}[{% if A_shift != '0' %}{{A_shift}}+{% endif %}it_2*{{conf.LDA}}+it_3] * {% if (useTrueB and not forceCoeffMatrix) %}{{trueAlpha}}* {% endif %}{{trueB}}[{% if B_shift != '0' %}{{B_shift}}+{% endif %}it_1*{{conf.LDB}}+it_2];
}
}
}
{% endif %}{# end choice of BLAS lib #}
{% endwith %}
\ No newline at end of file
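As a cross-check of the no-BLAS fallback loop above, a plain-Python reference (a sketch: arrays are flat lists in column-major order with the given leading dimensions, and alpha is restricted to +/-1 and beta to 0/1, as in MatmulConfig):

def gemm_fallback(conf, A, B, C, A_shift=0, B_shift=0, C_shift=0):
    """C = alpha * A * B + beta * C, mirroring the generated triple loop."""
    if conf.beta == 0:
        # reset C, as the template does before accumulating
        for it_1 in range(conf.N):
            for it_3 in range(conf.M):
                C[C_shift + it_1 * conf.LDC + it_3] = 0.0
    for it_1 in range(conf.N):
        for it_2 in range(conf.K):
            for it_3 in range(conf.M):
                # the template emits '+=' for alpha == 1 and '-=' for alpha == -1;
                # multiplying by alpha is equivalent for alpha in {1, -1}
                C[C_shift + it_1 * conf.LDC + it_3] += (
                    conf.alpha
                    * A[A_shift + it_2 * conf.LDA + it_3]
                    * B[B_shift + it_1 * conf.LDB + it_2]
                )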
......@@ -41,9 +41,6 @@ class MatmulConfig:
}
"""
# dgemm, dgemv, ....
operationType = ""
baseroutinename = ""
name = ""
......@@ -64,16 +61,19 @@ class MatmulConfig:
# alignment flags
alignment_A = 0 # 1 aligned, 0 unaligned
alignment_B = 0 # 1 aligned, 0 unaligned
alignment_C = 0 # 1 aligned, 0 unaligned
# prefetching
prefetchStrategy = ""
# prefetching, only if enabled by global config
prefetchInput = "" # "" = no input to prefetch, "A": only A_next, "B": only B_next, "AB": A and B
prefetchOutput = "" # "" = no output prefetch, "C": prefetch C
# precision
precision="DP" # "DP" = double, "SP" = float
# Constructor
def __init__(self, M, N, K, LDA, LDB, LDC, alpha, beta, alignment_A, alignment_C, name, prefetchStrategy, operationType="gemm", precision="DP"):
def __init__(self, M, N, K, LDA, LDB, LDC, alpha, beta, alignment_A, alignment_B, alignment_C, name, prefetchInput="", prefetchOutput="", precision="DP"):
if precision not in ["DP", "SP"]:
print("MatmulConfig: Unknown precision")
exit()
......@@ -97,13 +97,15 @@ class MatmulConfig:
self.alpha = alpha
self.beta = beta
self.alignment_A = alignment_A
self.alignment_B = alignment_B
self.alignment_C = alignment_C
self.name = name
self.prefetchStrategy = prefetchStrategy
self.baseroutinename = operationType+"_"+str(M)+"_"+str(N)+"_"+str(K)+"_"+name
self.prefetchInput = prefetchInput
self.prefetchOutput = prefetchOutput
self.baseroutinename = "gemm"+"_"+str(M)+"_"+str(N)+"_"+str(K)+"_"+name
self.precision = precision
def __repr__(self):
return "<%s: %s LDA=%s, LDB=%s, LDC=%s, alpha=%s, beta=%s, alignment_A=%s, alignment_C=%s>" \
% (self.name, self.baseroutinename, self.LDA, self.LDB, self.LDC, self.alpha, self.beta, self.alignment_A, self.alignment_C)
return "<%s: %s LDA=%s, LDB=%s, LDC=%s, alpha=%s, beta=%s, alignment_A=%s, alignment_B=%s, alignment_C=%s>" \
% (self.name, self.baseroutinename, self.LDA, self.LDB, self.LDC, self.alpha, self.beta, self.alignment_A, self.alignment_B, self.alignment_C)
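A quick usage sketch of the updated class (constructed positionally as in the model files above; the sizes are illustrative):

cfg = MatmulConfig(8, 8, 8, 8, 8, 8, 1, 0, 1, 1, 1, "face_Q_x")
print(cfg.baseroutinename)  # gemm_8_8_8_face_Q_x (the operation type is now always "gemm")
print(cfg)                  # repr now also reports alignment_B
# Prefetching is requested per input/output instead of a single strategy string:
cfg_pf = MatmulConfig(8, 8, 8, 8, 8, 8, 1, 0, 1, 1, 1, "rhs_x",
                      prefetchInput="AB", prefetchOutput="C")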