Commit db9e1654 authored by Eckhart Arnold

- Extended LaTeX Grammar and tests

parent e8c626df
@@ -85,7 +85,7 @@ from DHParser.syntaxtree import Node, traverse, remove_children_if, \\
remove_expendables, remove_empty, remove_tokens, flatten, is_whitespace, \\
is_empty, is_expendable, collapse, replace_content, WHITESPACE_PTYPE, TOKEN_PTYPE, \\
TransformationFunc, remove_parser, remove_content, remove_brackets, \\
keep_children, has_name, has_content
keep_children, has_name, has_content, apply_if
'''
...
@@ -71,8 +71,7 @@ def get_ebnf_scanner() -> ScannerFunc:
#
########################################################################
# TODO: Introduce dummy/rename-parser, for simple assignments (e.g. jahr = JAHRESZAHL) or substitution!
# TODO: Raise Error for unconnected parsers!
class EBNFGrammar(Grammar):
r"""Parser for an EBNF source file, with this grammar:
@@ -194,8 +193,6 @@ def get_ebnf_grammar() -> EBNFGrammar:
########################################################################
#TODO: Add Capture and Retrieve Validation: A variable mustn't be captured twice before retrieval?!?
EBNF_transformation_table = {
# AST Transformations for EBNF-grammar
"+":
@@ -300,6 +297,8 @@ class EBNFCompilerError(Exception):
pass
#TODO: Add Capture and Retrieve Validation: A variable mustn't be captured twice before retrieval?!? Is this possible at compile time?
class EBNFCompiler(Compiler):
"""Generates a Parser from an abstract syntax tree of a grammar specified
in EBNF-Notation.
...
@@ -188,7 +188,7 @@ def add_parser_guard(parser_func):
# if location has already been visited by the current parser,
# return saved result
if location in parser.visited:
return parser.visited[location] # TODO: might not work with Capture-Retrieve-Pop-Parsers!!!
return parser.visited[location]
# break left recursion at the maximum allowed depth
if parser.recursion_counter.setdefault(location, 0) > LEFT_RECURSION_DEPTH:
return None, text
@@ -200,7 +200,8 @@ def add_parser_guard(parser_func):
if node is not None:
# in case of a recursive call saves the result of the first
# (or left-most) call that matches; but not for variable manipulating parsers,
# (or left-most) call that matches
# variable manipulating parsers will be excluded, though,
# because caching would interfere with changes of variable state
if grammar.last_rb__loc__ > location:
parser.visited[location] = (node, rest)
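The caching rule in this hunk is subtle enough to deserve a sketch. A minimal, self-contained version (not DHParser's actual code; ``visited`` and ``last_rb__loc__`` mirror the names above, everything else is hypothetical):

def memoizing_guard(parser_func):
    def guarded_call(parser, text):
        location = len(text)            # DHParser counts locations from the end of the text
        if location in parser.visited:  # result already known for this position
            return parser.visited[location]
        node, rest = parser_func(parser, text)
        # cache only if no variable-manipulating parser (Capture/Retrieve/Pop)
        # has rolled back state at or after this location; a cached result
        # would otherwise freeze a stale variable state
        if node is not None and parser.grammar.last_rb__loc__ > location:
            parser.visited[location] = (node, rest)
        return node, rest
    return guarded_call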
@@ -970,13 +971,10 @@ class Alternative(NaryOperator):
self.been_here = dict() # type: Dict[int, int]
def __call__(self, text: str) -> Tuple[Node, str]:
location = len(text)
pindex = self.been_here.setdefault(location, 0)
for parser in self.parsers[pindex:]:
for parser in self.parsers:
node, text_ = parser(text)
if node:
return Node(self, node), text_
# self.been_here[location] += 1
return None, text
def __repr__(self):
...
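For contrast, the behaviour that ``Alternative`` reverts to above is plain PEG ordered choice. A minimal sketch, assuming each parser is a callable returning a ``(node, rest)`` pair:

def ordered_choice(parsers, text):
    # the first alternative that matches wins; no bookkeeping across calls
    for parser in parsers:
        node, rest = parser(text)
        if node:
            return node, rest
    return None, text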
@@ -52,6 +52,7 @@ __all__ = ['WHITESPACE_PTYPE',
'collapse',
'join',
'replace_content',
'apply_if',
'is_whitespace',
'is_empty',
'is_expendable',
@@ -758,6 +759,14 @@ def join(node, tag_names: List[str]):
# ------------------------------------------------
@transformation_factory
def replace_content(node, func: Callable): # Callable[[Node], ResultType]
"""Replaces the content of the node. ``func`` takes the node
as an argument an returns the mapped result.
"""
node.result = func(node.result)
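Because of the ``@transformation_factory`` decorator, calling ``replace_content`` with just the callable yields a partially applied transformation that can be dropped into a processing table; the LaTeX table further down uses exactly this pattern:

# hypothetical table-ready partial; the callable receives node.result
normalize_linefeed = replace_content(lambda result: '\n')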
def is_whitespace(node):
"""Removes whitespace and comments defined with the
``@comment``-directive."""
@@ -787,6 +796,14 @@ def has_content(node, contents: AbstractSet[str]) -> bool:
return str(node) in contents
@transformation_factory
def apply_if(node, transformation: Callable, condition: Callable):
"""Applies a transformation only if a certain condition is met.
"""
if condition(node):
transformation(node)
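A hypothetical combination with another factory-made transformation (the condition is made up for illustration):

# collapse a node's children to a single string, but only for nodes
# whose content does not contain a '%' comment sign
collapse_non_comments = apply_if(collapse, lambda node: '%' not in str(node))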
@transformation_factory
def keep_children(node, section: slice=slice(None, None, None), condition=lambda node: True):
"""Keeps only the nodes which fall into a slice of the result field
@@ -809,23 +826,6 @@ remove_empty = remove_children_if(is_empty)
remove_expendables = remove_children_if(is_expendable) # partial(remove_children_if, condition=is_expendable)
remove_brackets = keep_children(slice(1,-1))
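``remove_brackets`` shows the intended idiom: specializing ``keep_children`` with a slice. Hypothetical variants in the same style, roughly unconditional versions of the commented-out ``remove_first``/``remove_last`` helpers deleted below:

remove_head = keep_children(slice(1, None))    # drop the first child
remove_tail = keep_children(slice(None, -1))   # drop the last child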
# @transformation_factory(Callable)
# def remove_first(node, condition=lambda node: True):
# """Removes the first child if the condition is met.
# Otherwise does nothing."""
# if node.children:
# if condition(node.children[0]):
# node.result = node.result[1:]
#
#
# @transformation_factory(Callable)
# def remove_last(node, condition=lambda node: True):
# """Removes the last child if the condition is met.
# Otherwise does nothing."""
# if node.children:
# if condition(node.children[-1]):
# node.result = node.result[:-1]
@transformation_factory
def remove_tokens(node, tokens: AbstractSet[str] = frozenset()):
@@ -847,14 +847,6 @@ def remove_content(node, contents: AbstractSet[str]):
remove_children_if(node, partial(has_content, contents=contents))
@transformation_factory
def replace_content(node, func: Callable): # Callable[[Node], ResultType]
"""Replaces the content of the node. ``func`` takes the node
as an argument an returns the mapped result.
"""
node.result = func(node.result)
########################################################################
#
# AST semantic validation functions
...
@@ -38,11 +38,7 @@ def mock_syntax_tree(sexpr):
Example:
>>> mock_syntax_tree("(a (b c))").as_sxpr()
(a
(b
"c"
)
)
'(a\\n (b\\n "c"\\n )\\n)'
"""
def next_block(s):
s = s.strip()
@@ -109,6 +105,7 @@ def recompile_grammar(ebnf_filename, force=False) -> bool:
base, ext = os.path.splitext(ebnf_filename)
compiler_name = base + 'Compiler.py'
error_file_name = base + '_ebnf_ERRORS.txt'
errors = []
if (not os.path.exists(compiler_name) or force or
grammar_changed(compiler_name, ebnf_filename)):
@@ -116,14 +113,14 @@ def recompile_grammar(ebnf_filename, force=False) -> bool:
errors = compile_on_disk(ebnf_filename)
if errors:
# print("Errors while compiling: " + ebnf_filename + '!')
with open(base + '_errors.txt', 'w') as f:
with open(error_file_name, 'w') as f:
for e in errors:
f.write(e)
f.write('\n')
return False
if not errors and os.path.exists(base + '_errors.txt'):
os.remove(base + '_errors.txt')
if not errors and os.path.exists(error_file_name):
os.remove(error_file_name)
return True
@@ -133,7 +130,7 @@ UNIT_STAGES = {'match', 'fail', 'ast', 'cst', '__ast__', '__cst__'}
def unit_from_configfile(config_filename):
"""Reads a grammar unit test from a config file.
"""
cfg = configparser.ConfigParser()
cfg = configparser.ConfigParser(interpolation=None)
cfg.read(config_filename)
OD = collections.OrderedDict
unit = OD()
...
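``interpolation=None`` matters here because ``configparser``'s default interpolation treats a bare '%' as the start of an interpolation expression, and LaTeX test cases are full of '%' signs. A minimal demonstration (section and key names are made up):

import configparser

cfg = configparser.ConfigParser(interpolation=None)
cfg.read_string("[match:text]\n1: % a LaTeX comment\n")
print(cfg['match:text']['1'])   # '% a LaTeX comment'; with the default
                                # interpolation this access would raise
                                # an InterpolationSyntaxError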
@@ -154,9 +154,7 @@ def is_logging() -> bool:
def repr_call(f, parameter_list) -> str:
"""Turns a list of items into a string resembling the parameter
list of a function call by omitting default values at the end:
>>> def(a, b=1):
print(a, b)
>>> def f(a, b=1): print(a, b)
>>> repr_call(f, (5,1))
'f(5)'
>>> repr_call(f, (5,2))
@@ -206,8 +204,8 @@ def compact_sexpr(s) -> str:
whitespace.
Example:
>>> compact_sexpr("(a\n (b\n c\n )\n)\n")
(a (b c))
>>> compact_sexpr('(a\\n (b\\n c\\n )\\n)\\n')
'(a (b c))'
"""
return re.sub('\s(?=\))', '', re.sub('\s+', ' ', s)).strip()
@@ -306,26 +304,29 @@ def smart_list(arg) -> list:
If the argument is a string, it will be interpreted as a list of
comma separated values, trying ';', ',', ' ' as possible delimiters
in this order, e.g.
>>> smart_list("1; 2, 3; 4")
["1", "2, 3", "4"]
>>> smart_list("2, 3")
["2", "3"]
>>> smart_list("a b cd")
["a", "b", "cd"]
>>> smart_list('1; 2, 3; 4')
['1', '2, 3', '4']
>>> smart_list('2, 3')
['2', '3']
>>> smart_list('a b cd')
['a', 'b', 'cd']
If the argument is a collection other than a string, it will be
returned as is, e.g.
>>> smart_list((1, 2, 3))
(1, 2, 3)
>>> smart_list({1, 2, 3})
{1, 2, 3}
If the argument is another iterable than a collection, it will
be converted into a list, e.g.
>>> smart_list(i for i in {1,2,3})
[1, 2, 3]
Finally, if none of the above is true, the argument will be
wrapped in a list and returned, e.g.
>>> smart_list(125)
[125]
"""
if isinstance(arg, str):
for delimiter in (';', ','):
@@ -333,8 +334,8 @@ def smart_list(arg) -> list:
if len(lst) > 1:
return [s.strip() for s in lst]
return [s.strip() for s in arg.strip().split(' ')]
# elif isinstance(arg, collections.abc.Sequence): # python 3.6: collections.abc.Collection
# return arg
elif isinstance(arg, collections.abc.Container):
return arg
elif isinstance(arg, collections.abc.Iterable):
return list(arg)
else:
@@ -346,8 +347,8 @@ def expand_table(compact_table):
containing comma separated words into single keyword entries with
the same values. Returns the expanded table.
Example:
>>> expand_table({"a, b": 1, "b": 1, ('d','e','f'):5, "c":3})
{'a': 1, 'b': 1, 'c': 3, 'd': 5, 'e': 5, 'f': 5}
>>> expand_table({"a, b": 1, ('d','e','f'):5, "c":3})
{'a': 1, 'b': 1, 'd': 5, 'e': 5, 'f': 5, 'c': 3}
"""
expanded_table = {}
keys = list(compact_table.keys())
@@ -359,24 +360,6 @@ def expand_table(compact_table):
expanded_table[k] = value
return expanded_table
# # commented, because this approach is too error-prone in connection with smart_list
# def as_partial(partial_ellipsis) -> functools.partial:
# """Transforms ``partial_ellipsis`` into a partial function
# application, i.e. string "remove_tokens({'(', ')'})" will be
# transformed into the partial "partial(remove_tokens, {'(', ')'})".
# Partial ellipsises can be considered as a short hand notation for
# partials, which look like function, calls but aren't. Plain
# function names are returned as is. Also, if ``partial_ellipsis``
# already is a callable, it will be returned as is.
# """
# if callable(partial_ellipsis):
# return partial_ellipsis
# m = re.match('\s*(\w+)(?:\(([^)]*)\))?\s*$', partial_ellipsis)
# if m:
# fname, fargs = m.groups()
# return eval("functools.partial(%s, %s)" % (fname, fargs)) if fargs else eval(fname)
# raise SyntaxError(partial_ellipsis + " does not resemble a partial function ellipsis!")
def sane_parser_name(name) -> bool:
"""Checks whether given name is an acceptable parser name. Parser names
...
# latex Grammar
@ testing = True
@ whitespace = /[ \t]*(?:\n(?![ \t]*\n)[ \t]*)?/ # optional whitespace, including at most one linefeed
@ comment = /%.*(?:\n|$)/
latexdoc = preamble document
...
#!/usr/bin/python
#######################################################################
#
# SYMBOLS SECTION - Can be edited. Changes will be preserved.
#
#######################################################################
from functools import partial
import os
import sys
try:
import regex as re
except ImportError:
import re
from DHParser.toolkit import logging, is_filename, load_if_file
from DHParser.parsers import Grammar, Compiler, nil_scanner, \
Lookbehind, Lookahead, Alternative, Pop, Required, Token, Synonym, \
Optional, NegativeLookbehind, OneOrMore, RegExp, Retrieve, Series, RE, Capture, \
ZeroOrMore, Forward, NegativeLookahead, mixin_comment, compile_source, \
last_value, counterpart, accumulate, ScannerFunc
from DHParser.syntaxtree import Node, traverse, remove_brackets, keep_children, \
remove_children_if, reduce_single_child, replace_by_single_child, remove_whitespace, \
remove_expendables, remove_tokens, flatten, is_whitespace, is_expendable, join, \
collapse, replace_content, WHITESPACE_PTYPE, TOKEN_PTYPE, TransformationFunc, \
remove_empty, replace_parser, apply_if
#######################################################################
#
# SCANNER SECTION - Can be edited. Changes will be preserved.
#
#######################################################################
def LaTeXScanner(text):
return text
def get_scanner() -> ScannerFunc:
return LaTeXScanner
#######################################################################
#
# PARSER SECTION - Don't edit! CHANGES WILL BE OVERWRITTEN!
#
#######################################################################
class LaTeXGrammar(Grammar):
r"""Parser for a LaTeX source file, with this grammar:
# latex Grammar
@ testing = True
@ whitespace = /[ \t]*(?:\n(?![ \t]*\n)[ \t]*)?/ # optional whitespace, including at most one linefeed
@ comment = /%.*(?:\n|$)/
latexdoc = preamble document
preamble = { command }+
document = [PARSEP] { [PARSEP] paragraph } §EOF
blockenv = beginenv sequence §endenv
parblock = "{" sequence §"}"
sequence = { paragraph [PARSEP] }+
paragraph = { !blockcmd (command | block | text) //~ }+
inlineenv = beginenv { command | block | text }+ endenv
beginenv = "\begin{" §NAME §"}"
endenv = "\end{" §::NAME §"}"
command = CMDNAME [[ //~ config ] //~ block ]
config = "[" cfgtext §"]"
block = /{/ { command | text | block } §/}/
text = { cfgtext | (BRACKETS //~) }+
cfgtext = { word_sequence | (ESCAPED //~) }+
word_sequence = { TEXTCHUNK //~ }+
blockcmd = "\subsection" | "\section" | "\chapter" | "\subsubsection"
| "\paragraph" | "\subparagraph" | "\begin{enumerate}"
| "\begin{itemize}" | "\item" | "\begin{figure}"
CMDNAME = /\\(?:(?!_)\w)+/~
NAME = /\w+/~
ESCAPED = /\\[%$&_\/]/
BRACKETS = /[\[\]]/ # left or right square bracket: [ ]
TEXTCHUNK = /[^\\%$&\{\}\[\]\s\n]+/ # some piece of text excluding whitespace,
# linefeed and special characters
WSPC = /[ \t]+/ # (horizontal) whitespace
LF = !PARSEP /[ \t]*\n[ \t]*/ # LF but not an empty line
PARSEP = /[ \t]*(?:\n[ \t]*)+\n[ \t]*/ # at least one empty line, i.e.
# [whitespace] linefeed [whitespace] linefeed
EOF = !/./
"""
block = Forward()
command = Forward()
source_hash__ = "936e76e84dd027b0af532abfad617d15"
parser_initialization__ = "upon instantiation"
COMMENT__ = r'%.*(?:\n|$)'
WSP__ = mixin_comment(whitespace=r'[ \t]*(?:\n(?![ \t]*\n)[ \t]*)?', comment=r'%.*(?:\n|$)')
wspL__ = ''
wspR__ = WSP__
EOF = NegativeLookahead(RE('.', wR=''))
PARSEP = RE('[ \\t]*(?:\\n[ \\t]*)+\\n[ \\t]*', wR='')
LF = Series(NegativeLookahead(PARSEP), RE('[ \\t]*\\n[ \\t]*', wR=''))
WSPC = RE('[ \\t]+', wR='')
TEXTCHUNK = RE('[^\\\\%$&\\{\\}\\[\\]\\s\\n]+', wR='')
BRACKETS = RE('[\\[\\]]', wR='')
ESCAPED = RE('\\\\[%$&_/]', wR='')
NAME = Capture(RE('\\w+'))
CMDNAME = RE('\\\\(?:(?!_)\\w)+')
blockcmd = Alternative(Token("\\subsection"), Token("\\section"), Token("\\chapter"), Token("\\subsubsection"), Token("\\paragraph"), Token("\\subparagraph"), Token("\\begin{enumerate}"), Token("\\begin{itemize}"), Token("\\item"), Token("\\begin{figure}"))
word_sequence = OneOrMore(Series(TEXTCHUNK, RE('')))
cfgtext = OneOrMore(Alternative(word_sequence, Series(ESCAPED, RE(''))))
text = OneOrMore(Alternative(cfgtext, Series(BRACKETS, RE(''))))
block.set(Series(RE('{', wR=''), ZeroOrMore(Alternative(command, text, block)), Required(RE('}', wR=''))))
config = Series(Token("["), cfgtext, Required(Token("]")))
command.set(Series(CMDNAME, Optional(Series(Optional(Series(RE(''), config)), RE(''), block))))
endenv = Series(Token("\\end{"), Required(Pop(NAME)), Required(Token("}")))
beginenv = Series(Token("\\begin{"), Required(NAME), Required(Token("}")))
inlineenv = Series(beginenv, OneOrMore(Alternative(command, block, text)), endenv)
paragraph = OneOrMore(Series(NegativeLookahead(blockcmd), Alternative(command, block, text), RE('')))
sequence = OneOrMore(Series(paragraph, Optional(PARSEP)))
parblock = Series(Token("{"), sequence, Required(Token("}")))
blockenv = Series(beginenv, sequence, Required(endenv))
document = Series(Optional(PARSEP), ZeroOrMore(Series(Optional(PARSEP), paragraph)), Required(EOF))
preamble = OneOrMore(command)
latexdoc = Series(preamble, document)
root__ = latexdoc
def get_grammar() -> LaTeXGrammar:
global thread_local_LaTeX_grammar_singleton
try:
grammar = thread_local_LaTeX_grammar_singleton
return grammar
except NameError:
thread_local_LaTeX_grammar_singleton = LaTeXGrammar()
return thread_local_LaTeX_grammar_singleton
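A hypothetical usage sketch of the generated parser (whether this exact snippet parses cleanly depends on the grammar above):

grammar = get_grammar()   # thread-local singleton, created on first use
cst = grammar("\\documentclass{article}\n\nLorem ipsum dolor.\n")
print(cst.as_sxpr())      # concrete syntax tree as an S-expression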
#######################################################################
#
# AST SECTION - Can be edited. Changes will be preserved.
#
#######################################################################
def streamline_whitespace(node):
    assert node.tag_name in ['WSPC', ':Whitespace']
    s = str(node)
    c = s.find('%')   # first comment sign, if any
    n = s.find('\n')  # first linefeed, if any
    if c >= 0:
        # keep the comment; precede it with a linefeed only if one
        # occurred before the comment sign, and strip trailing blanks
        node.result = (' ' if (n >= c) or (n < 0) else '\n') + s[c:].rstrip(' \t')
    elif n >= 0:
        # whitespace containing a linefeed collapses to a single linefeed
        node.result = '\n'
    else:
        # plain horizontal whitespace collapses to a single blank
        node.result = ' '
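A quick illustration of the mapping with a stand-in node class (hypothetical; real nodes come from the parser):

class _FakeNode:
    tag_name = 'WSPC'
    def __init__(self, s):
        self.result = s
    def __str__(self):
        return self.result

for s in ["  \t ", "  \n\t", " \n  % note\n"]:
    node = _FakeNode(s)
    streamline_whitespace(node)
    print(repr(node.result))   # ' ', then '\n', then '\n% note\n'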
LaTeX_AST_transformation_table = {
# AST Transformations for the LaTeX-grammar
"+":
remove_empty,
"latexdoc": [],
"preamble": [],
"document": [],
"blockenv": [],
"parblock": [],
"sequence":
flatten,
"paragraph":
[flatten(lambda node: not node.parser.name or node.parser.name == "text"),
join('text', ':Whitespace')],
"inlineenv": [],
"beginenv": [],
"endenv": [],
"command": [],
"config": [],
"block": [remove_brackets, reduce_single_child],
"text":
[reduce_single_child, join('text', 'word_sequence', ':Whitespace')],
"cfgtext": [flatten, reduce_single_child],
"word_sequence":
[collapse],
"blockcmd": [],
"CMDNAME":
[remove_expendables, reduce_single_child],
"NAME": [],
"ESCAPED": [reduce_single_child],
"BRACKETS": [],
"TEXTCHUNK": [],
"WSPC, :Whitespace":
streamline_whitespace,
"LF":
replace_content(lambda node: '\n'),
"PARSEP":
replace_content(lambda node: '\n\n'),
"EOF": [],
"*":
replace_by_single_child,
}
LaTeXTransform = partial(traverse, processing_table=LaTeX_AST_transformation_table)
# LaTeXTransform = lambda tree : 1
def get_transformer() -> TransformationFunc:
return LaTeXTransform
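Continuing the parser sketch above: the transformation works in place, destructively turning the concrete syntax tree into the leaner abstract one (hypothetical usage):

transformer = get_transformer()
transformer(cst)        # cst is now the abstract syntax tree
print(cst.as_sxpr())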
#######################################################################
#
# COMPILER SECTION - Can be edited. Changes will be preserved.
#
#######################################################################
class LaTeXCompiler(Compiler):
"""Compiler for the abstract-syntax-tree of a LaTeX source file.
"""
def __init__(self, grammar_name="LaTeX", grammar_source=""):
super(LaTeXCompiler, self).__init__(grammar_name, grammar_source)
assert re.match('\w+\Z', grammar_name)
def on_latexdoc(self, node):
return node.as_sexpr()
def on_preamble(self, node):
pass
def on_document(self, node):
pass
def on_blockenv(self, node):
pass
def on_parblock(self, node):
pass
def on_sequence(self, node):
pass
def on_paragraph(self, node):
pass
def on_inlineenv(self, node):
pass
def on_beginenv(self, node):
pass
def on_endenv(self, node):
pass
def on_command(self, node):
pass
def on_config(self, node):
pass
def on_block(self, node):
pass
def on_text(self, node):
pass
def on_cfgtext(self, node):
pass
def on_word_sequence(self, node):
pass
def on_blockcmd(self, node):
pass
def on_CMDNAME(self, node):
pass
def on_NAME(self, node):
pass
def on_ESCAPED(self, node):
pass
def on_BRACKETS(self, node):
pass
def on_TEXTCHUNK(self, node):
pass
def on_WSPC(self, node):
pass
def on_LF(self, node):
pass
def on_PARSEP(self, node):
pass
def on_EOF(self, node):
pass
def get_compiler(grammar_name="LaTeX", grammar_source="") -> LaTeXCompiler:
global thread_local_LaTeX_compiler_singleton
try:
compiler = thread_local_LaTeX_compiler_singleton
compiler.set_grammar_name(grammar_name, grammar_source)
return compiler
except NameError:
thread_local_LaTeX_compiler_singleton = \
LaTeXCompiler(grammar_name, grammar_source)
return thread_local_LaTeX_compiler_singleton
#######################################################################
#
# END OF DHPARSER-SECTIONS
#
#######################################################################
def compile_src(source):
"""Compiles ``source`` and returns (result, errors, ast).
"""
with logging("LOGS"):
compiler = get_compiler()
cname = compiler.__class__.__name__
log_file_name = os.path.basename(os.path.splitext(source)[0]) \