
Commit 44c61528 authored by di68kap


- removed MetaParser._keep_node(). This kind of optimization is now done in place, which is considerably faster
parent 3d95076e
@@ -1348,6 +1348,17 @@ class DropWhitespace(Whitespace):
class MetaParser(Parser):
"""Class Meta-Parser contains functions for the optimization of
retrun values of parsers that call other parsers (i.e descendants
of classes UnaryParser and NaryParser).
The optimization consists in flattening the tree by eliminating
anonymous nodes. This is the same as what the function
DHParser.transform.flatten() does, only at an earlier stage.
The reasoning is that the earlier the tree is reduced, the less work
reamins to do at all the later processing stages.
"""
def _return_value(self, node: Optional[Node]) -> Node:
"""
Generates a return node if a single node has been returned from
@@ -1401,16 +1412,6 @@ class MetaParser(Parser):
return EMPTY_NODE # avoid creation of a node object for anonymous empty nodes
return Node(self.tag_name, results) # unoptimized code
def _keep_node(self, node):
"""
Returns True, if a node returned by a descendant parser should be kept,
and False, if it should be sorted out. A node is sorted out only if it
is empty and stems from an anonymous (unnamed) descendant parser.
"""
if self.grammar.flatten_tree__:
return node._result or node.tag_name[0:1] != ':'
return node != EMPTY_NODE # EMPTY_NODE will always be sorted out...
class UnaryParser(MetaParser):
"""
@@ -1548,7 +1549,7 @@ class ZeroOrMore(Option):
node, text = self.parser(text)
if not node:
break
if self._keep_node(node):
if node._result or node.tag_name[0:1] != ':': # drop anonymous empty nodes
results += (node,)
if len(text) == n:
break # avoid infinite loop
@@ -1599,7 +1600,7 @@ class OneOrMore(UnaryParser):
if not node:
break
match_flag = True
if self._keep_node(node):
if node._result or node.tag_name[0:1] != ':': # drop anonymous empty nodes
results += (node,)
if len(text_) == n:
break # avoid infinite loop
@@ -1758,7 +1759,7 @@ class Series(NaryParser):
else:
results += (node,)
break
if self._keep_node(node): # optimization
if node._result or node.tag_name[0:1] != ':': # drop anonymous empty nodes
results += (node,)
# assert len(results) <= len(self.parsers) \
# or len(self.parsers) >= len([p for p in results if p.tag_name != ZOMBIE_TAG])
@@ -1960,7 +1961,7 @@ class AllOf(NaryParser):
for i, parser in enumerate(parsers):
node, text__ = parser(text_)
if node:
if self._keep_node(node):
if node._result or node.tag_name[0:1] != ':': # drop anonymous empty nodes
results += (node,)
text_ = text__
del parsers[i]
@@ -2027,7 +2028,7 @@ class SomeOf(NaryParser):
for i, parser in enumerate(parsers):
node, text__ = parser(text_)
if node:
if self._keep_node(node):
if node._result or node.tag_name[0:1] != ':': # drop anonymous empty nodes
results += (node,)
text_ = text__
del parsers[i]
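The change itself is mechanical: each of the descendant-parser classes shown above now performs the keep-or-drop test inline instead of calling `_keep_node()`. A minimal before/after sketch of the pattern, taken from the `ZeroOrMore` hunk above:

    # before: indirection through the (now removed) helper method
    if self._keep_node(node):
        results += (node,)

    # after: the test is done in place, saving a method call per returned node
    if node._result or node.tag_name[0:1] != ':':  # drop anonymous empty nodes
        results += (node,)

Note that the inline check corresponds to the `flatten_tree__` branch of the old helper; the fallback comparison against `EMPTY_NODE` is no longer made at these call sites.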
# parse.py - parser combinators for DHParser
#
# Copyright 2016 by Eckhart Arnold (arnold@badw.de)
# Bavarian Academy of Sciences and Humanities (badw.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.
"""
Module ``parse`` contains the python classes and functions for
DHParser's packrat-parser. Its central class is the
``Grammar``-class, which is the base class for any concrete
Grammar. Grammar-objects are callable and parsing is done by
calling a Grammar-object with a source text as argument.
The different parsing functions are callable descendants of class
``Parser``. Usually, they are organized in a tree and defined
within the namespace of a grammar-class. See ``ebnf.EBNFGrammar``
for an example.
"""
from collections import defaultdict
import copy
from DHParser.error import Error, linebreaks, line_col
from DHParser.log import is_logging, HistoryRecord
from DHParser.preprocess import BEGIN_TOKEN, END_TOKEN, RX_TOKEN_NAME
from DHParser.stringview import StringView, EMPTY_STRING_VIEW
from DHParser.syntaxtree import Node, FrozenNode, RootNode, WHITESPACE_PTYPE, \
TOKEN_PTYPE, ZOMBIE_TAG, ResultType
from DHParser.toolkit import sane_parser_name, escape_control_characters, get_config_value, \
re, typing, cython
from DHParser.configuration import CONFIG_PRESET
from typing import Callable, cast, List, Tuple, Set, Dict, DefaultDict, Union, Optional, Any
__all__ = ('Parser',
'UnknownParserError',
'GrammarErrorType',
'GrammarError',
'Grammar',
'EMPTY_NODE',
'PreprocessorToken',
'Token',
'DropToken',
'RegExp',
'RE',
'TKN',
'Whitespace',
'DropWhitespace',
'mixin_comment',
'MetaParser',
'UnaryParser',
'NaryParser',
'Synonym',
'Option',
'ZeroOrMore',
'OneOrMore',
'Series',
'Alternative',
'AllOf',
'SomeOf',
'Unordered',
'Required',
'Lookahead',
'NegativeLookahead',
'Lookbehind',
'NegativeLookbehind',
'last_value',
'counterpart',
'accumulate',
'Capture',
'Retrieve',
'Pop',
'Forward')
########################################################################
#
# Parser base class
#
########################################################################
EMPTY_NODE = FrozenNode(':EMPTY__', '')
class ParserError(Exception):
"""
A `ParserError` is thrown for those parser errors that allow the
controlled re-entrance of the parsing process after the error occurred.
If a reentry-rule has been configured for the parser where the error
occurred, the parser guard can resume the parsing process.
Currently, the only case in which a `ParserError` is thrown (and not some
different kind of error like `UnknownParserError`) is when a `Series`-parser
detects a missing mandatory element.
"""
def __init__(self, node: Node, rest: StringView, first_throw: bool):
self.node = node # type: Node
self.rest = rest # type: StringView
self.first_throw = first_throw # type: bool
def __str__(self):
return "%i: %s %s" % (self.node.pos, str(self.rest[:25]), repr(self.node))
ResumeList = List[Union[str, Any]]  # list of strings or regular expressions
def reentry_point(rest: StringView, rules: ResumeList) -> int:
"""
Finds the point where parsing should resume after a ParserError has been caught.
Args:
rest: The rest of the parsed text or, in other words, the point where
a ParserError was thrown.
rules: A list of strings or regular expressions. The rest of the text is
searched for each of these. The closest match is the point where
parsing will be resumed.
Returns:
The integer index of the closest reentry point or -1 if no reentry-point
was found.
"""
upper_limit = len(rest) + 1
i = upper_limit
# find the closest match
for rule in rules:
if isinstance(rule, str):
k = rest.find(rule)
i = min(k if k >= 0 else upper_limit, i)
else:
m = rest.search(rule)
if m:
i = min(rest.index(m.start()), i)
# in case no rule matched return -1
if i == upper_limit:
i = -1
return i
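# Illustrative usage sketch (not part of the original module; the sample text
# and resume rules are made up for demonstration):
#
#     rest = StringView("bad token; next_statement")
#     reentry_point(rest, [';'])    # -> 9, the index of the closest match
#     reentry_point(rest, ['@'])    # -> -1, since no rule matches
#
# The caller (see Parser.__call__ below) then resumes parsing at rest[i:].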
ApplyFunc = Callable[['Parser'], None]
FlagFunc = Callable[[ApplyFunc, Set[ApplyFunc]], bool]
class Parser:
"""
(Abstract) Base class for Parser combinator parsers. Any parser
object that is actually used for parsing (i.e. no mock parsers)
should be derived from this class.
Since parsers can contain other parsers (see classes UnaryParser
and NaryParser), they form a cyclic directed graph. A root
parser is a parser from which all other parsers can be reached.
Usually, there is one root parser which serves as the starting
point of the parsing process. When speaking of "the root parser"
it is this root parser object that is meant.
There are two different types of parsers:
1. *Named parsers* for which a name is set in field `parser.pname`.
The results produced by these parsers can later be retrieved in
the AST by the parser name.
2. *Anonymous parsers* where the name-field just contains the empty
string. AST-transformation of Anonymous parsers can be hooked
only to their class name, and not to the individual parser.
Parser objects are callable and parsing is done by calling a parser
object with the text to parse.
If the parser matches it returns a tuple consisting of a node
representing the root of the concrete syntax tree resulting from the
match as well as the substring `text[i:]` where i is the length of
matched text (which can be zero in the case of parsers like
`ZeroOrMore` or `Option`). If `i > 0` then the parser has "moved
forward".
If the parser does not match it returns `(None, text)`. **Note** that
this is not the same as an empty match `("", text)`. An empty match
can, for example, be returned by the `ZeroOrMore`-parser in case the
contained parser is repeated zero times.
Attributes and Properties:
visited: Mapping of places this parser has already been to
during the current parsing process onto the results the
parser returned at the respective place. This dictionary
is used to implement memoizing.
recursion_counter: Mapping of places to how often the parser
has already been called recursively at this place. This
is needed to implement left recursion. The number of
calls becomes irrelevant once a result has been memoized.
cycle_detection: The apply()-method uses this variable to make
sure that one and the same function will not be applied
(recursively) a second time, if it has already been
applied to this parser.
_grammar: A reference to the Grammar object to which the parser
is attached.
"""
def __init__(self) -> None:
# assert isinstance(name, str), str(name)
self.pname = '' # type: str
self.tag_name = self.ptype # type: str
self.cycle_detection = set() # type: Set[ApplyFunc]
try:
self._grammar = GRAMMAR_PLACEHOLDER # type: Grammar
except NameError:
pass
self.reset()
def __deepcopy__(self, memo):
""" Deepcopy method of the parser. Upon instantiation of a Grammar-
object, parsers will be deep-copied to the Grammar object. If a
derived parser-class changes the signature of the constructor, the
`__deepcopy__`-method must be replaced (i.e. overridden without
calling the same method from the superclass) by the derived class.
"""
duplicate = self.__class__()
duplicate.pname = self.pname
duplicate.tag_name = self.tag_name
return duplicate
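# Illustrative sketch (an assumption, not part of the original source): a
# derived parser whose constructor takes an extra argument would override
# __deepcopy__ along these lines; `self.regexp` merely stands in for
# whatever argument the subclass actually takes:
#
#     def __deepcopy__(self, memo):
#         duplicate = self.__class__(copy.deepcopy(self.regexp, memo))
#         duplicate.pname = self.pname
#         duplicate.tag_name = self.tag_name
#         return duplicate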
def __repr__(self):
return self.pname + self.ptype
def __str__(self):
return self.pname + (' = ' if self.pname else '') + repr(self)
@property
def ptype(self) -> str:
"""Returns a type name for the parser. By default this is the name of
the parser class with an added leading colon ':'. """
return ':' + self.__class__.__name__
@property
def repr(self) -> str:
"""Returns the parser's name if it has a name and self.__repr___() otherwise."""
return self.pname if self.pname else self.__repr__()
def reset(self):
"""Initializes or resets any parser variables. If overwritten,
the `reset()`-method of the parent class must be called from the
`reset()`-method of the derived class."""
self.visited = dict() # type: Dict[int, Tuple[Optional[Node], StringView]]
self.recursion_counter = defaultdict(int) # type: DefaultDict[int, int]
@cython.locals(location=cython.int, gap=cython.int, i=cython.int)
def __call__(self: 'Parser', text: StringView) -> Tuple[Optional[Node], StringView]:
"""Applies the parser to the given text. This is a wrapper method that adds
the bookkeeping logic that is common to all parsers. The actual parsing is
done in the overridden method `_parse()`.
"""
grammar = self._grammar
location = grammar.document_length__ - len(text)
try:
if grammar.last_rb__loc__ >= location:
grammar.rollback_to__(location)
# if location has already been visited by the current parser,
# return saved result
if location in self.visited:
# no history recording in case of memoized results
return self.visited[location]
# break left recursion at the maximum allowed depth
if grammar.left_recursion_depth__:
if self.recursion_counter[location] > grammar.left_recursion_depth__:
grammar.recursion_locations__.add(location)
return None, text
self.recursion_counter[location] += 1
if grammar.history_tracking__:
grammar.call_stack__.append(self.repr if self.tag_name in (':RegExp', ':Token')
else self.tag_name)
grammar.moving_forward__ = True
try:
# PARSER CALL: run _parse() method
node, rest = self._parse(text)
except ParserError as error:
# does this play well with variable setting? add rollback clause here? tests needed...
gap = len(text) - len(error.rest)
rules = grammar.resume_rules__.get(self.pname, [])
rest = error.rest[len(error.node):]
i = reentry_point(rest, rules)
if i >= 0 or self == grammar.start_parser__:
# apply reentry-rule or catch error at root-parser
if i < 0:
i = 1
nd = Node(ZOMBIE_TAG, rest[:i]).with_pos(location)
rest = rest[i:]
assert error.node.children or (not error.node.result)
if error.first_throw:
node = error.node
node.result = node.children + (nd,)
else:
# TODO: possibly attach an error message that says where parsing continues;
# however, it could probably only be attached to the next(!) node (how?)
node = Node(self.tag_name,
(Node(ZOMBIE_TAG, text[:gap]).with_pos(location),
error.node, nd))
elif error.first_throw:
raise ParserError(error.node, error.rest, first_throw=False)
else:
result = (Node(ZOMBIE_TAG, text[:gap]).with_pos(location), error.node) if gap \
else error.node # type: ResultType
if grammar.tree__.errors[-1].code == Error.MANDATORY_CONTINUATION_AT_EOF: # EXPERIMENTAL!!
node = error.node
else:
raise ParserError(Node(self.tag_name, result).with_pos(location),
text, first_throw=False)
if grammar.left_recursion_depth__:
self.recursion_counter[location] -= 1
# don't clear recursion_locations__ !!!
if node is None:
# retrieve an earlier match result (from left recursion) if it exists
if location in grammar.recursion_locations__:
if location in self.visited:
node, rest = self.visited[location]
if location != grammar.last_recursion_location__:
grammar.tree__.add_error(
node, Error("Left recursion encountered. "
"Refactor grammar to avoid slow parsing.",
node.pos if node else location,
Error.LEFT_RECURSION_WARNING))
grammar.last_recursion_location__ = location
# don't overwrite any positive match (i.e. node not None) in the cache
# and don't add empty entries for parsers returning from left recursive calls!
elif grammar.memoization__:
# otherwise also cache None-results
self.visited[location] = (None, rest)
else:
# assert node._pos < 0 or node == EMPTY_NODE
node._pos = location
# assert node._pos >= 0 or node == EMPTY_NODE, \
# str("%i < %i" % (grammar.document_length__, location))
if (grammar.last_rb__loc__ < location
and (grammar.memoization__ or location in grammar.recursion_locations__)):
# - variable manipulating parsers will not be entered into the cache,
# because caching would interfere with changes of variable state
# - in case of left recursion, the first recursive step that
# matches will store its result in the cache
# TODO: need a unit-test concerning interference of variable manipulation and left recursion algorithm?
self.visited[location] = (node, rest)
# Mind that memoized parser calls will not appear in the history record!
# Does this make sense? Or should it be changed?
if grammar.history_tracking__:
# don't track returning parsers except in case an error has occurred
# remaining = len(rest)
if grammar.moving_forward__:
record = HistoryRecord(grammar.call_stack__, node, text,
grammar.line_col__(text))
grammar.history__.append(record)
elif node:
nid = id(node) # type: int
if nid in grammar.tree__.error_nodes:
record = HistoryRecord(grammar.call_stack__, node, text,
grammar.line_col__(text),
grammar.tree__.error_nodes[nid])
grammar.history__.append(record)
grammar.moving_forward__ = False
grammar.call_stack__.pop()
except RecursionError:
node = Node(ZOMBIE_TAG, str(text[:min(10, max(1, text.find("\n")))]) + " ...")
node._pos = location
grammar.tree__.new_error(node, "maximum recursion depth of parser reached; "
"potentially due to too many errors!")
rest = EMPTY_STRING_VIEW
return node, rest
def __add__(self, other: 'Parser') -> 'Series':
"""The + operator generates a series-parser that applies two
parsers in sequence."""
return Series(self, other)
def __or__(self, other: 'Parser') -> 'Alternative':
"""The | operator generates an alternative parser that applies
the first parser and, if that does not match, the second parser.
"""
return Alternative(self, other)
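# Illustrative sketch (assumption; `RegExp` and `Whitespace` are concrete
# parser classes exported by this module): the operators build combinators
# without naming the classes explicitly:
#
#     word = RegExp(r'\w+')
#     ws = Whitespace(r'\s*')
#     word_then_ws = word + ws    # same as Series(word, ws)
#     word_or_ws = word | ws      # same as Alternative(word, ws)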
def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
"""Applies the parser to the given `text` and returns a node with
the results or None as well as the text at the position right behind
the matching string."""
raise NotImplementedError
@property
def grammar(self) -> 'Grammar':
try:
if self._grammar != GRAMMAR_PLACEHOLDER:
return self._grammar
else:
raise AssertionError('Grammar has not yet been set!')
except (AttributeError, NameError):
raise AttributeError('Parser placeholder does not have a grammar!')
@grammar.setter
def grammar(self, grammar: 'Grammar'):
try:
if self._grammar == GRAMMAR_PLACEHOLDER:
self._grammar = grammar
# self._grammar_assigned_notifier()
elif self._grammar != grammar:
raise AssertionError("Parser has already been assigned"
"to a different Grammar object!")
except AttributeError:
pass # ignore setting of grammar attribute for placeholder parser
except NameError:  # Cython: No access to GRAMMAR_PLACEHOLDER, yet :-(
self._grammar = grammar
def _apply(self, func: ApplyFunc, flip: FlagFunc) -> bool:
"""
Applies function `func(parser)` recursively to this parser and all
descendant parsers, if any exist.
In order to break cycles, function `flip` is called, which should
return `True`, if this parser has already been visited. If not, it
flips the cycle detection flag and returns `False`.
This is a protected function and should not be called from outside
class Parser or any of its descendants. The entry point for external
calls is the method `apply()` without underscore!
"""
if flip(func, self.cycle_detection):
return False
else:
func(self)
return True
def apply(self, func: ApplyFunc):
"""
Applies function `func(parser)` recursively to this parser and all
descendant parsers, if any exist. Traversal is pre-order.
"""
def positive_flip(f: ApplyFunc, flagged: Set[Callable]) -> bool:
"""Returns True, if function `f` has already been applied to this
parser and sets the flag accordingly. Interprets `f in flagged == True`
as meaning that `f` has already been applied."""
if f in flagged:
return True
else:
flagged.add(f)
return False
def negative_flip(f: ApplyFunc, flagged: Set[Callable]) -> bool:
"""Returns True, if function `f` has already been applied to this
parser and sets the flag accordingly. Interprets `f in flagged == False`
as meaning that `f` has already been applied."""
if f not in flagged:
return True
else:
flagged.remove(f)
return False
if func in self.cycle_detection:
self._apply(func, negative_flip)
else:
self._apply(func, positive_flip)
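# Illustrative note (an inference from the two flip-functions above, not a
# statement from the original source): because `apply` toggles the membership
# of `func` in `cycle_detection`, the same function can be applied to the
# parser graph repeatedly without clearing any flags in between:
#
#     root_parser.apply(some_func)   # first traversal sets the flags
#     root_parser.apply(some_func)   # second traversal clears them again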
PARSER_PLACEHOLDER = Parser()
########################################################################
#
# Grammar class, central administration of all parsers of a grammar
#
########################################################################
def mixin_comment(whitespace: str, comment: str) -> str:
"""
Returns a regular expression that merges comment and whitespace
regexps. Thus comments can occur wherever whitespace is allowed
and will be skipped just as implicit whitespace.
Note that, because this works on the level of regular expressions,
nesting comments is not possible. It also makes it much harder to
use directives inside comments (which isn't recommended, anyway).
"""
if comment:
return '(?:' + whitespace + '(?:' + comment + whitespace + ')*)'
return whitespace
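# Worked example (illustrative): merging a '#'-comment regexp into the
# implicit whitespace, so that comments are skipped wherever whitespace
# may occur:
#
#     mixin_comment(whitespace=r'\s*', comment=r'#.*')
#     # -> r'(?:\s*(?:#.*\s*)*)'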
class UnknownParserError(KeyError):
"""UnknownParserError is raised if a Grammar object is called with a
parser that does not exist or if in the course of parsing a parser
is referred to that does not exist."""
GrammarErrorType = Tuple[str, Parser, Error] # TODO: replace with a named tuple?
class GrammarError(Exception):
"""GrammarError will be raised if static analysis reveals errors
in the grammar.
"""
def __init__(self, static_analysis_result: List[GrammarErrorType]):
assert static_analysis_result # must not be empty
self.errors = static_analysis_result
def __str__(self):
if len(self.errors) == 1:
return str(self.errors[0][2])
return '\n' + '\n'.join(("%i. " % (i + 1) + str(err_tuple[2]))
for i, err_tuple in enumerate(self.errors))
class Grammar:
r"""
Class Grammar directs the parsing process and stores global state
information of the parsers, i.e. state information that is shared
across parsers.
Grammars are basically collections of parser objects, which are
connected to an instance object of class Grammar. There exist two
ways of connecting parsers to grammar objects: Either by passing
the root parser object to the constructor of a Grammar object
("direct instantiation"), or by assigning the root parser to the
class variable "root__" of a descendant class of class Grammar.
Example for direct instantiation of a grammar::