
Commit a765df39 authored by Eckhart Arnold

- more hints for AST-transformation from EBNFCompiler

parent 9958d09b
......@@ -82,7 +82,7 @@ from DHParser import logging, is_filename, load_if_file, \
last_value, counterpart, accumulate, PreprocessorFunc, \
Node, TransformationFunc, TRUE_CONDITION, \
traverse, remove_children_if, merge_children, is_anonymous, \
reduce_single_child, replace_by_single_child, remove_whitespace, \
reduce_single_child, replace_by_single_child, replace_or_reduce, remove_whitespace, \
remove_expendables, remove_empty, remove_tokens, flatten, is_whitespace, \
is_empty, is_expendable, collapse, replace_content, WHITESPACE_PTYPE, TOKEN_PTYPE, \
remove_parser, remove_content, remove_brackets, replace_parser, \
......
......@@ -330,6 +330,9 @@ class EBNFCompiler(Compiler):
recursive - A set of symbols that are used recursively and
therefore require a `Forward`-operator.
definitions - A dictionary of definitions. Unlike `rules`,
this maps the symbols to their compiled definienda.
deferred_tasks - A list of callables that is filled during
compilation, but that will be executed only after
compilation has finished. Typically, it contains
......@@ -367,6 +370,7 @@ class EBNFCompiler(Compiler):
self.symbols = {} # type: Dict[str, Node]
self.variables = set() # type: Set[str]
self.recursive = set() # type: Set[str]
self.definitions = {} # type: Dict[str, str]
self.deferred_tasks = [] # type: List[Callable]
self.root = "" # type: str
self.directives = {'whitespace': self.WHITESPACE['horizontal'],
......@@ -407,9 +411,15 @@ class EBNFCompiler(Compiler):
self.grammar_name + '-grammar']
transtable.append(' "+": remove_empty,')
for name in self.rules:
transtable.append(' "' + name + '": [],')
tf = '[]'
rule = self.definitions[name]
if rule.startswith('Alternative'):
tf = '[replace_or_reduce]'
elif rule.startswith('Synonym'):
tf = '[replace_by_single_child]'
transtable.append(' "' + name + '": %s,' % tf)
transtable.append(' ":Token, :RE": reduce_single_child,')
transtable += [' # "*": replace_by_single_child', '}', '', tf_name +
transtable += [' "*": replace_by_single_child', '}', '', tf_name +
' = partial(traverse, processing_table=%s)' % tt_name, '']
transtable += [TRANSFORMER_FACTORY.format(NAME=self.grammar_name)]
return '\n'.join(transtable)
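For illustration only (grammar and rule names below are made up): for a hypothetical grammar "Lang" whose rule "expr" compiles to an Alternative and whose rule "term" compiles to a Synonym, the skeleton produced by gen_transformer_skeleton() would now contain hints along these lines:

Lang_AST_transformation_table = {
    "+": remove_empty,
    "expr": [replace_or_reduce],        # definiendum starts with 'Alternative'
    "term": [replace_by_single_child],  # definiendum starts with 'Synonym'
    "factor": [],                       # all other rules keep an empty placeholder
    ":Token, :RE": reduce_single_child,
    "*": replace_by_single_child
}

LangTransform = partial(traverse, processing_table=Lang_AST_transformation_table)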
......@@ -560,6 +570,7 @@ class EBNFCompiler(Compiler):
assert nd.parser.name == "directive", nd.as_sxpr()
self.compile(nd)
node.error_flag = node.error_flag or nd.error_flag
self.definitions.update(definitions)
return self.assemble_parser(definitions, node)
......
......@@ -39,8 +39,10 @@ __all__ = ('transformation_factory',
'key_parser_name',
'key_tag_name',
'traverse',
'is_named',
'replace_by_single_child',
'reduce_single_child',
'replace_or_reduce',
'replace_parser',
'collapse',
'merge_children',
......@@ -235,6 +237,21 @@ def TRUE_CONDITION(node):
return True
def replace_child(node):
assert len(node.children) == 1
if not node.children[0].parser.name:
node.children[0].parser.name = node.parser.name
node.parser = node.children[0].parser
node._errors.extend(node.children[0]._errors)
node.result = node.result[0].result
def reduce_child(node):
assert len(node.children) == 1
node._errors.extend(node.children[0]._errors)
node.result = node.result[0].result
@transformation_factory(Callable)
def replace_by_single_child(node, condition=TRUE_CONDITION):
"""Remove single branch node, replacing it by its immediate descendant
......@@ -242,12 +259,8 @@ def replace_by_single_child(node, condition=TRUE_CONDITION):
(In case the descendant's name is empty (i.e. anonymous) the
name of this node's parser is kept.)
"""
if node.children and len(node.result) == 1 and condition(node.children[0]):
if not node.result[0].parser.name:
node.result[0].parser.name = node.parser.name
node.parser = node.result[0].parser
node._errors.extend(node.result[0]._errors)
node.result = node.result[0].result
if len(node.children) == 1 and condition(node.children[0]):
replace_child(node)
@transformation_factory(Callable)
......@@ -257,9 +270,23 @@ def reduce_single_child(node, condition=TRUE_CONDITION):
If the condition evaluates to false on the descendant, it will not
be reduced.
"""
if node.children and len(node.result) == 1 and condition(node.children[0]):
node._errors.extend(node.result[0]._errors)
node.result = node.result[0].result
if len(node.children) == 1 and condition(node.children[0]):
reduce_child(node)
def is_named(node):
return node.parser.name
@transformation_factory(Callable)
def replace_or_reduce(node, condition=is_named):
"""Replaces node by a single child, if condition is met on child,
otherwise (i.e. if the child is anonymous) reduces the child.
"""
if len(node.children) == 1 and condition(node.children[0]):
replace_child(node)
else:
reduce_child(node)
@transformation_factory
......
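To make the difference between the three helpers concrete, here is a minimal standalone sketch that mimics their effect on toy (name, content) pairs instead of DHParser nodes (purely illustrative, not part of the library):

def _replace(parent, child):
    # replace_by_single_child: the child takes the parent's place;
    # an anonymous child (empty name) inherits the parent's name.
    return (child[0] or parent[0], child[1])

def _reduce(parent, child):
    # reduce_single_child: the parent keeps its own name but adopts the child's content.
    return (parent[0], child[1])

def _replace_or_reduce(parent, child):
    # replace_or_reduce: replace if the child is named, otherwise reduce.
    return _replace(parent, child) if child[0] else _reduce(parent, child)

print(_replace_or_reduce(("expression", None), ("term", "x")))  # -> ('term', 'x')
print(_replace_or_reduce(("expression", None), ("", "x")))      # -> ('expression', 'x')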
......@@ -77,6 +77,7 @@ def selftest() -> bool:
print("\n\nSTAGE 2: Selfhosting-test: Trying to compile EBNF-Grammar with generated parser...\n")
selfhosted_ebnf_parser = compileDSL(ebnf_src, None, generated_ebnf_parser,
ebnf_transformer, ebnf_compiler)
ebnf_compiler.gen_transformer_skeleton()
print(selfhosted_ebnf_parser)
print("\n\n Selftest SUCCEEDED :-)\n\n")
return True
......
......@@ -64,12 +64,12 @@ itemize = "\begin{itemize}" [PARSEP] { item } §"\end{itemize}"
enumerate = "\begin{enumerate}" [PARSEP] {item } §"\end{enumerate}"
item = "\item" [PARSEP] sequence
figure = "\begin{figure}" sequence "\end{figure}"
quotation = ("\begin{quotation}" sequence "\end{quotation}")
| ("\begin{quote}" sequence "\end{quote}")
verbatim = "\begin{verbatim}" sequence "\end{verbatim}"
table = "\begin{tabular}" table_config sequence "\end{tabular}"
table_config = "{" /[lcr|]+/~ "}"
figure = "\begin{figure}" sequence §"\end{figure}"
quotation = ("\begin{quotation}" sequence §"\end{quotation}")
| ("\begin{quote}" sequence §"\end{quote}")
verbatim = "\begin{verbatim}" sequence §"\end{verbatim}"
table = "\begin{tabular}" table_config sequence §"\end{tabular}"
table_config = "{" /[lcr|]+/~ §"}"
#### paragraphs and sequences of paragraphs ####
......@@ -92,7 +92,7 @@ end_inline_env = end_environment
begin_environment = "\begin{" §NAME §"}"
end_environment = "\end{" §::NAME §"}"
inline_math = "$" /[^$]*/ "$"
inline_math = "$" /[^$]*/ §"$"
#### commands ####
......
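In DHParser's EBNF notation the '§' sign marks the elements that follow it as mandatory: the EBNF compiler wraps them in Required(...), so a missing closing token is reported as a parsing error instead of merely letting the rule fail silently. The itemize rule, whose closing token was already mandatory before this change, shows the pattern in the generated LaTeXGrammar further down:

itemize = Series(Token("\\begin{itemize}"), Optional(PARSEP), ZeroOrMore(item), Required(Token("\\end{itemize}")))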
#!/usr/bin/python
#######################################################################
#
# SYMBOLS SECTION - Can be edited. Changes will be preserved.
#
#######################################################################
import os
import sys
from functools import partial
try:
import regex as re
except ImportError:
import re
from DHParser import logging, is_filename, Grammar, Compiler, Lookbehind, Alternative, Pop, \
Required, Token, Synonym, \
Optional, NegativeLookbehind, OneOrMore, RegExp, Series, RE, Capture, \
ZeroOrMore, Forward, NegativeLookahead, mixin_comment, compile_source, \
PreprocessorFunc, \
Node, TransformationFunc, \
traverse, merge_children, remove_whitespace, remove_parser, \
reduce_single_child, replace_by_single_child, remove_expendables, remove_empty, flatten, \
collapse, replace_content, remove_brackets, remove_first
#######################################################################
#
# PREPROCESSOR SECTION - Can be edited. Changes will be preserved.
#
#######################################################################
def LaTeXPreprocessor(text):
return text
def get_preprocessor() -> PreprocessorFunc:
return LaTeXPreprocessor
#######################################################################
#
# PARSER SECTION - Don't edit! CHANGES WILL BE OVERWRITTEN!
#
#######################################################################
class LaTeXGrammar(Grammar):
r"""Parser for a LaTeX source file, with this grammar:
# LaTeX-Grammar for DHParser
@ testing = True
@ whitespace = /[ \t]*(?:\n(?![ \t]*\n)[ \t]*)?/ # optional whitespace, including at most one linefeed
@ comment = /%.*(?:\n|$)/
latexdoc = preamble document
preamble = { command }+
document = [PARSEP] "\begin{document}" [PARSEP]
frontpages [PARSEP]
(Chapters | Sections) [PARSEP]
[Bibliography] [Index] [PARSEP]
"\end{document}" [PARSEP] §EOF
frontpages = sequence
#######################################################################
#
# document structure
#
#######################################################################
Chapters = { Chapter [PARSEP] }+
Chapter = "\Chapter" block [PARSEP] { sequence | Sections }
Sections = { Section [PARSEP] }+
Section = "\Section" block [PARSEP] { sequence | SubSections }
SubSections = { SubSection [PARSEP] }+
SubSection = "\SubSection" block [PARSEP] { sequence | SubSubSections }
SubSubSections = { SubSubSection [PARSEP] }+
SubSubSection = "\SubSubSection" block [PARSEP] { sequence | Paragraphs }
Paragraphs = { Paragraph [PARSEP] }+
Paragraph = "\paragraph" block [PARSEP] { sequence | SubParagraphs }
SubParagraphs = { SubParagraph [PARSEP] }+
SubParagraph = "\subparagpaph" block [PARSEP] { sequence }
Bibliography = "\bibliography" block [PARSEP]
Index = "\printindex" [PARSEP]
#######################################################################
#
# document content
#
#######################################################################
#### block environments ####
block_environment = known_environment | generic_block
known_environment = itemize | enumerate | figure | table | quotation
| verbatim
generic_block = begin_generic_block sequence §end_generic_block
begin_generic_block = -&LB begin_environment -&LB
end_generic_block = -&LB end_environment -&LB
itemize = "\begin{itemize}" [PARSEP] { item } §"\end{itemize}"
enumerate = "\begin{enumerate}" [PARSEP] {item } §"\end{enumerate}"
item = "\item" [PARSEP] sequence
figure = "\begin{figure}" sequence "\end{figure}"
quotation = ("\begin{quotation}" sequence "\end{quotation}")
| ("\begin{quote}" sequence "\end{quote}")
verbatim = "\begin{verbatim}" sequence "\end{verbatim}"
table = "\begin{tabular}" table_config sequence "\end{tabular}"
table_config = "{" /[lcr|]+/~ "}"
#### paragraphs and sequences of paragraphs ####
block_of_paragraphs = /{/ sequence §/}/
sequence = { (paragraph | block_environment ) [PARSEP] }+
paragraph = { !blockcmd text_elements //~ }+
text_elements = command | text | block | inline_environment
#### inline environments ####
inline_environment = known_inline_env | generic_inline_env
known_inline_env = inline_math
generic_inline_env = (begin_inline_env { text_elements }+ §end_environment)
begin_inline_env = (-!LB begin_environment) | (begin_environment -!LB)
# end_inline_env = (-!LB end_environment) | (end_environment -!LB) # ambiguity with generic_block at EOF
begin_environment = "\begin{" §NAME §"}"
end_environment = "\end{" §::NAME §"}"
inline_math = "$" /[^$]*/ "$"
#### commands ####
command = known_command | generic_command
known_command = footnote | includegraphics | caption
generic_command = !no_command CMDNAME [[ //~ config ] //~ block ]
footnote = "\footnote" block_of_paragraphs
includegraphics = "\includegraphics" config block
caption = "\caption" block
#######################################################################
#
# low-level text and character sequences
#
#######################################################################
config = "[" cfgtext §"]"
block = /{/ { text_elements } §/}/
text = { cfgtext | (BRACKETS //~) }+
cfgtext = { word_sequence | (ESCAPED //~) }+
word_sequence = { TEXTCHUNK //~ }+
no_command = "\begin{" | "\end" | BACKSLASH structural
blockcmd = BACKSLASH ( ( "begin{" | "end{" )
( "enumerate" | "itemize" | "figure" | "quote"
| "quotation" | "tabular") "}"
| structural | begin_generic_block | end_generic_block )
structural = "subsection" | "section" | "chapter" | "subsubsection"
| "paragraph" | "subparagraph" | "item"
#######################################################################
#
# Primitives
#
#######################################################################
CMDNAME = /\\(?:(?!_)\w)+/~
NAME = /\w+/~
ESCAPED = /\\[%$&_\/]/
BRACKETS = /[\[\]]/ # left or right square bracket: [ ]
TEXTCHUNK = /[^\\%$&\{\}\[\]\s\n]+/ # some piece of text excluding whitespace,
# linefeed and special characters
WSPC = /[ \t]+/ # (horizontal) whitespace
LF = !PARSEP /[ \t]*\n[ \t]*/ # linefeed but not an empty line
PARSEP = /[ \t]*(?:\n[ \t]*)+\n[ \t]*/ # at least one empty line, i.e.
# [whitespace] linefeed [whitespace] linefeed
LB = /\s*?\n|$/ # backwards line break for Lookbehind-Operator
# beginning of text marker '$' added for test code
BACKSLASH = /[\\]/
EOF = /(?!.)/ # End-Of-File
"""
begin_generic_block = Forward()
block_environment = Forward()
block_of_paragraphs = Forward()
end_generic_block = Forward()
text_elements = Forward()
source_hash__ = "06385bac4dd7cb009bd29712a8fc692c"
parser_initialization__ = "upon instantiation"
COMMENT__ = r'%.*(?:\n|$)'
WSP__ = mixin_comment(whitespace=r'[ \t]*(?:\n(?![ \t]*\n)[ \t]*)?', comment=r'%.*(?:\n|$)')
wspL__ = ''
wspR__ = WSP__
EOF = RegExp('(?!.)')
BACKSLASH = RegExp('[\\\\]')
LB = RegExp('\\s*?\\n|$')
PARSEP = RegExp('[ \\t]*(?:\\n[ \\t]*)+\\n[ \\t]*')
LF = Series(NegativeLookahead(PARSEP), RegExp('[ \\t]*\\n[ \\t]*'))
WSPC = RegExp('[ \\t]+')
TEXTCHUNK = RegExp('[^\\\\%$&\\{\\}\\[\\]\\s\\n]+')
BRACKETS = RegExp('[\\[\\]]')
ESCAPED = RegExp('\\\\[%$&_/]')
NAME = Capture(RE('\\w+'))
CMDNAME = RE('\\\\(?:(?!_)\\w)+')
structural = Alternative(Token("subsection"), Token("section"), Token("chapter"), Token("subsubsection"), Token("paragraph"), Token("subparagraph"), Token("item"))
blockcmd = Series(BACKSLASH, Alternative(Series(Alternative(Token("begin{"), Token("end{")), Alternative(Token("enumerate"), Token("itemize"), Token("figure"), Token("quote"), Token("quotation"), Token("tabular")), Token("}")), structural, begin_generic_block, end_generic_block))
no_command = Alternative(Token("\\begin{"), Token("\\end"), Series(BACKSLASH, structural))
word_sequence = OneOrMore(Series(TEXTCHUNK, RE('')))
cfgtext = OneOrMore(Alternative(word_sequence, Series(ESCAPED, RE(''))))
text = OneOrMore(Alternative(cfgtext, Series(BRACKETS, RE(''))))
block = Series(RegExp('{'), ZeroOrMore(text_elements), Required(RegExp('}')))
config = Series(Token("["), cfgtext, Required(Token("]")))
caption = Series(Token("\\caption"), block)
includegraphics = Series(Token("\\includegraphics"), config, block)
footnote = Series(Token("\\footnote"), block_of_paragraphs)
generic_command = Series(NegativeLookahead(no_command), CMDNAME, Optional(Series(Optional(Series(RE(''), config)), RE(''), block)))
known_command = Alternative(footnote, includegraphics, caption)
command = Alternative(known_command, generic_command)
inline_math = Series(Token("$"), RegExp('[^$]*'), Token("$"))
end_environment = Series(Token("\\end{"), Required(Pop(NAME)), Required(Token("}")))
begin_environment = Series(Token("\\begin{"), Required(NAME), Required(Token("}")))
begin_inline_env = Alternative(Series(NegativeLookbehind(LB), begin_environment), Series(begin_environment, NegativeLookbehind(LB)))
generic_inline_env = Series(begin_inline_env, OneOrMore(text_elements), Required(end_environment))
known_inline_env = Synonym(inline_math)
inline_environment = Alternative(known_inline_env, generic_inline_env)
text_elements.set(Alternative(command, text, block, inline_environment))
paragraph = OneOrMore(Series(NegativeLookahead(blockcmd), text_elements, RE('')))
sequence = OneOrMore(Series(Alternative(paragraph, block_environment), Optional(PARSEP)))
block_of_paragraphs.set(Series(RegExp('{'), sequence, Required(RegExp('}'))))
table_config = Series(Token("{"), RE('[lcr|]+'), Token("}"))
table = Series(Token("\\begin{tabular}"), table_config, sequence, Token("\\end{tabular}"))
verbatim = Series(Token("\\begin{verbatim}"), sequence, Token("\\end{verbatim}"))
quotation = Alternative(Series(Token("\\begin{quotation}"), sequence, Token("\\end{quotation}")), Series(Token("\\begin{quote}"), sequence, Token("\\end{quote}")))
figure = Series(Token("\\begin{figure}"), sequence, Token("\\end{figure}"))
item = Series(Token("\\item"), Optional(PARSEP), sequence)
enumerate = Series(Token("\\begin{enumerate}"), Optional(PARSEP), ZeroOrMore(item), Required(Token("\\end{enumerate}")))
itemize = Series(Token("\\begin{itemize}"), Optional(PARSEP), ZeroOrMore(item), Required(Token("\\end{itemize}")))
end_generic_block.set(Series(Lookbehind(LB), end_environment, Lookbehind(LB)))
begin_generic_block.set(Series(Lookbehind(LB), begin_environment, Lookbehind(LB)))
generic_block = Series(begin_generic_block, sequence, Required(end_generic_block))
known_environment = Alternative(itemize, enumerate, figure, table, quotation, verbatim)
block_environment.set(Alternative(known_environment, generic_block))
Index = Series(Token("\\printindex"), Optional(PARSEP))
Bibliography = Series(Token("\\bibliography"), block, Optional(PARSEP))
SubParagraph = Series(Token("\\subparagraph"), block, Optional(PARSEP), ZeroOrMore(sequence))
SubParagraphs = OneOrMore(Series(SubParagraph, Optional(PARSEP)))
Paragraph = Series(Token("\\paragraph"), block, Optional(PARSEP), ZeroOrMore(Alternative(sequence, SubParagraphs)))
Paragraphs = OneOrMore(Series(Paragraph, Optional(PARSEP)))
SubSubSection = Series(Token("\\SubSubSection"), block, Optional(PARSEP), ZeroOrMore(Alternative(sequence, Paragraphs)))
SubSubSections = OneOrMore(Series(SubSubSection, Optional(PARSEP)))
SubSection = Series(Token("\\SubSection"), block, Optional(PARSEP), ZeroOrMore(Alternative(sequence, SubSubSections)))
SubSections = OneOrMore(Series(SubSection, Optional(PARSEP)))
Section = Series(Token("\\Section"), block, Optional(PARSEP), ZeroOrMore(Alternative(sequence, SubSections)))
Sections = OneOrMore(Series(Section, Optional(PARSEP)))
Chapter = Series(Token("\\Chapter"), block, Optional(PARSEP), ZeroOrMore(Alternative(sequence, Sections)))
Chapters = OneOrMore(Series(Chapter, Optional(PARSEP)))
frontpages = Synonym(sequence)
document = Series(Optional(PARSEP), Token("\\begin{document}"), Optional(PARSEP), frontpages, Optional(PARSEP), Alternative(Chapters, Sections), Optional(PARSEP), Optional(Bibliography), Optional(Index), Optional(PARSEP), Token("\\end{document}"), Optional(PARSEP), Required(EOF))
preamble = OneOrMore(command)
latexdoc = Series(preamble, document)
root__ = latexdoc
def get_grammar() -> LaTeXGrammar:
global thread_local_LaTeX_grammar_singleton
try:
grammar = thread_local_LaTeX_grammar_singleton
return grammar
except NameError:
thread_local_LaTeX_grammar_singleton = LaTeXGrammar()
return thread_local_LaTeX_grammar_singleton
#######################################################################
#
# AST SECTION - Can be edited. Changes will be preserved.
#
#######################################################################
def streamline_whitespace(node):
assert node.tag_name in ['WSPC', ':Whitespace']
s = str(node)
c = s.find('%')
n = s.find('\n')
if c >= 0:
node.result = (' ' if (n >= c) or (n < 0) else '\n') + s[c:].rstrip(' \t')
elif s.find('\n') >= 0:
node.result = '\n'
else:
node.result = ' '
LaTeX_AST_transformation_table = {
# AST Transformations for the LaTeX-grammar
"+":
remove_empty,
"latexdoc": [],
"preamble": [],
"document": [],
"block_environment": [],
"begin_generic_block":
[reduce_single_child],
"end_generic_block":
[reduce_single_child],
"parblock": [],
"sequence":
[flatten, remove_parser('PARSEP')],
"enumerate, itemize":
[remove_brackets, remove_parser('PARSEP'), reduce_single_child],
"item":
[remove_first, remove_parser('PARSEP')],
"paragraph":
[flatten(lambda node: not node.parser.name or node.parser.name == "text"),
merge_children('text', ':Whitespace')],
"quotation, generic_bloc, generic_inline_env, inline_math":
[reduce_single_child, remove_brackets],
"inline_environment": [],
"begin_environment": [remove_brackets, reduce_single_child],
"end_environment": [remove_brackets, reduce_single_child],
# "command": [],
"generic_command": [],
"config, block": [remove_brackets, reduce_single_child],
"text":
[reduce_single_child, merge_children('text', 'word_sequence', ':Whitespace')],
"cfgtext": [flatten, reduce_single_child],
"word_sequence":
[collapse],
"blockcmd": [],
"CMDNAME":
[remove_expendables, reduce_single_child],
"NAME": [reduce_single_child],
"ESCAPED": [reduce_single_child],
"BRACKETS": [],
"TEXTCHUNK": [],
"WSPC, :Whitespace":
[], # streamline_whitespace, # whitespace will be removed anyway
"LF":
replace_content(lambda node: '\n'),
"PARSEP":
[], # replace_content(lambda node: '\n\n'),
"EOF": [],
":RE":
[reduce_single_child],
":Token":
[], # [remove_whitespace, reduce_single_child], # Tokens will be removed anyway?
# "*": [] # replace_by_single_child
}
LaTeXTransform = partial(traverse, processing_table=LaTeX_AST_transformation_table)
# LaTeXTransform = lambda tree : 1
def get_transformer() -> TransformationFunc:
return LaTeXTransform
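# A hypothetical helper, added for illustration only, showing how the grammar
# and the transformation table are typically wired together:
def _transform_demo(latex_source: str) -> str:
    """Parse `latex_source` with the thread-local LaTeXGrammar, apply the
    AST transformations in place and return the result as an S-expression."""
    syntax_tree = get_grammar()(latex_source)  # concrete syntax tree (a Node)
    get_transformer()(syntax_tree)             # applies LaTeX_AST_transformation_table in place
    return syntax_tree.as_sxpr()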
#######################################################################
#
# COMPILER SECTION - Can be edited. Changes will be preserved.
#
#######################################################################
class LaTeXCompiler(Compiler):
"""Compiler for the abstract-syntax-tree of a LaTeX source file.
"""
def __init__(self, grammar_name="LaTeX", grammar_source=""):
super(LaTeXCompiler, self).__init__(grammar_name, grammar_source)
assert re.match(r'\w+\Z', grammar_name)
def on_latexdoc(self, node):
return node
def on_preamble(self, node):
pass
def on_document(self, node):
pass
def on_frontpages(self, node):
pass
def on_Chapters(self, node):
pass
def on_Chapter(self, node):
pass
def on_Sections(self, node):
pass
def on_Section(self, node):
pass
def on_SubSections(self, node):
pass
def on_SubSection(self, node):
pass
def on_SubSubSections(self, node):
pass
def on_SubSubSection(self, node):
pass
def on_Paragraphs(self, node):
pass
def on_Paragraph(self, node):
pass
def on_SubParagraphs(self, node):
pass
def on_SubParagraph(self, node):
pass
def on_Bibliography(self, node):
pass