Commit bb4c86c8 authored by Eckhart Arnold's avatar Eckhart Arnold
Browse files

memoization optimization begun

parent 388c95f4
...@@ -996,7 +996,7 @@ expressions, for example: ...@@ -996,7 +996,7 @@ expressions, for example:
... expression = ~ term { ("+" | "-") term } ... expression = ~ term { ("+" | "-") term }
... term = factor { ("*" | "/") factor } ... term = factor { ("*" | "/") factor }
... factor = number | group ... factor = number | group
... group = "(" §expression ")" ... group = "(" expression ")"
... number = /\\\\d+/~ ... number = /\\\\d+/~
... EOF = /$/''' ... EOF = /$/'''
>>> arithmetic = create_parser(arithmetic_grammar, "arithmetic") >>> arithmetic = create_parser(arithmetic_grammar, "arithmetic")
......
...@@ -33,7 +33,7 @@ for an example. ...@@ -33,7 +33,7 @@ for an example.
from collections import defaultdict from collections import defaultdict
import copy import copy
from typing import Callable, cast, List, Tuple, Set, AbstractSet, Dict, \ from typing import Callable, cast, List, Tuple, Set, AbstractSet, Dict, \
DefaultDict, Sequence, Union, Optional, Iterator DefaultDict, Sequence, Union, Optional, Iterator, Hashable, NamedTuple
from DHParser.configuration import get_config_value from DHParser.configuration import get_config_value
from DHParser.error import Error, ErrorCode, MANDATORY_CONTINUATION, \ from DHParser.error import Error, ErrorCode, MANDATORY_CONTINUATION, \
...@@ -384,6 +384,13 @@ class Parser: ...@@ -384,6 +384,13 @@ class Parser:
`pname`, otherwise it is the name of the parser's type `pname`, otherwise it is the name of the parser's type
prefixed with a colon ":". prefixed with a colon ":".
eq_class: A unique number for the class of functionally
equivalent parsers that this parser belongs to.
(This serves the purpose of optimizing memoization,
by tying memoization dictionaries to the classes
of functionally equivalent parsers, rather than to
the individual parsers themselves.)
visited: Mapping of places this parser has already been to visited: Mapping of places this parser has already been to
during the current parsing process onto the results the during the current parsing process onto the results the
parser returned at the respective place. This dictionary parser returned at the respective place. This dictionary
...@@ -413,6 +420,7 @@ class Parser: ...@@ -413,6 +420,7 @@ class Parser:
self.disposable = True # type: bool self.disposable = True # type: bool
self.drop_content = False # type: bool self.drop_content = False # type: bool
self.tag_name = self.ptype # type: str self.tag_name = self.ptype # type: str
self.eq_class = id(self) # type: int
self.cycle_detection = set() # type: Set[ApplyFunc] self.cycle_detection = set() # type: Set[ApplyFunc]
# this indirection is required for Cython-compatibility # this indirection is required for Cython-compatibility
self._parse_proxy = self._parse # type: ParseFunc self._parse_proxy = self._parse # type: ParseFunc
...@@ -726,8 +734,50 @@ class Parser: ...@@ -726,8 +734,50 @@ class Parser:
else: else:
return self._apply(func, [], positive_flip) return self._apply(func, [], positive_flip)
def _signature(self) -> Hashable:
"""This method should be implemented by any non-abstract descendant
parser class. The implementation must make sure that all instances
that have the same signature always yield the same parsing result
for the same piece of text.
It does not hurt, but only wastes an opportunity for optimization,
if two functionally equivalent parsers provide different signatures.
It is a serious mistake, though, if two functionally non-equivalent
parsers have the same signature.
By returning `id(self)` this mistake will become impossible, but
it will turn signature-based memoization-optimizatiopn off for
this parser.
"""
return id(self)
def signature(self) -> Hashable:
"""Returns a value that is identical for two different
parser objects if they are functionally equivalent, i.e.
yield the same return value for the same call parameters:
>>> a = Text('[')
>>> b = Text('[')
>>> c = Text(']')
>>> a is b
False
>>> a.signature() == b.signature()
True
>>> a.signature() == c.signature()
False
The purpose of parser-signatures is to optimize better
memoization in cases of code repetition in the grammar.
DO NOT OVERRIDE THIS METHOD. In order to implement a
signature function, the protected method `_signature`
should be overridden instead.
"""
return self.pname if self.pname else self._signature()
def static_error(self, msg: str, code: ErrorCode) -> 'AnalysisError': def static_error(self, msg: str, code: ErrorCode) -> 'AnalysisError':
return self.symbol, self, Error(msg, 0, code) return AnalysisError(self.symbol, self, Error(msg, 0, code))
def static_analysis(self) -> List['AnalysisError']: def static_analysis(self) -> List['AnalysisError']:
"""Analyses the parser for logical errors after the grammar has been """Analyses the parser for logical errors after the grammar has been
...@@ -741,7 +791,23 @@ def copy_parser_base_attrs(src: Parser, duplicate: Parser): ...@@ -741,7 +791,23 @@ def copy_parser_base_attrs(src: Parser, duplicate: Parser):
duplicate.disposable = src.disposable duplicate.disposable = src.disposable
duplicate.drop_content = src.drop_content duplicate.drop_content = src.drop_content
duplicate.tag_name = src.tag_name duplicate.tag_name = src.tag_name
duplicate.eq_class = src.eq_class
def determine_eq_classes(root: Parser):
"""Sorts the parsers originating in root (imperfectly) into equivalence
classes and assigns respective the class identifier to the `eq_class`-field
of each parser."""
eq_classes: Dict[Hashable, int] = {}
def assign_eq_class(parser_stack: List[Parser]) -> bool:
nonlocal eq_classes
p = parser_stack[-1]
signature = p.signature()
p.eq_class = eq_classes.set_default(signature, id(p))
return False
root.apply(assign_eq_class)
def Drop(parser: Parser) -> Parser: def Drop(parser: Parser) -> Parser:
...@@ -764,6 +830,7 @@ def get_parser_placeholder() -> Parser: ...@@ -764,6 +830,7 @@ def get_parser_placeholder() -> Parser:
PARSER_PLACEHOLDER.disposable = False PARSER_PLACEHOLDER.disposable = False
PARSER_PLACEHOLDER.drop_content = False PARSER_PLACEHOLDER.drop_content = False
PARSER_PLACEHOLDER.tag_name = ':PLACEHOLDER__' PARSER_PLACEHOLDER.tag_name = ':PLACEHOLDER__'
PARSER_PLACEHOLDER.eq_class = id(PARSER_PLACEHOLDER)
return cast(Parser, PARSER_PLACEHOLDER) return cast(Parser, PARSER_PLACEHOLDER)
...@@ -846,8 +913,11 @@ def mixin_nonempty(whitespace: str) -> str: ...@@ -846,8 +913,11 @@ def mixin_nonempty(whitespace: str) -> str:
return whitespace return whitespace
AnalysisError = Tuple[str, Parser, Error] # pname, parser, error # AnalysisError = Tuple[str, Parser, Error] # pname, parser, error
# TODO: replace with a named tuple? class AnalysisError(NamedTuple):
symbol: str
parser: Parser
error: Error
class GrammarError(Exception): class GrammarError(Exception):
...@@ -1191,12 +1261,6 @@ class Grammar: ...@@ -1191,12 +1261,6 @@ class Grammar:
def __deepcopy__(self, memo): def __deepcopy__(self, memo):
"""Deepcopy method of the parser. Upon instantiation of a Grammar-
object, parsers will be deep-copied to the Grammar object. If a
derived parser-class changes the signature of the `__init__`-constructor,
`__deepcopy__`-method must be replaced (i.e. overridden without
calling the same method from the superclass) by the derived class.
"""
duplicate = self.__class__(self.root_parser__) duplicate = self.__class__(self.root_parser__)
duplicate.history_tracking__ = self.history_tracking__ duplicate.history_tracking__ = self.history_tracking__
duplicate.resume_notices__ = self.resume_notices__ duplicate.resume_notices__ = self.resume_notices__
...@@ -1732,19 +1796,28 @@ def is_grammar_placeholder(grammar: Optional[Grammar]) -> bool: ...@@ -1732,19 +1796,28 @@ def is_grammar_placeholder(grammar: Optional[Grammar]) -> bool:
# #
######################################################################## ########################################################################
class Always(Parser): class Unparameterized(Parser):
"""Unparameterized parsers do not receive any parameters on instantian.
As a consequence, different instances of the same unparameterized
parser are always functionally equivalent."""
def _signature(self) -> Hashable:
return self.__class__.__name__,
class Always(Unparameterized):
"""A parser that always matches, but does not capture anything.""" """A parser that always matches, but does not capture anything."""
def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]: def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
return EMPTY_NODE, text return EMPTY_NODE, text
class Never(Parser): class Never(Unparameterized):
"""A parser that never matches.""" """A parser that never matches."""
def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]: def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
return None, text return None, text
class AnyChar(Parser): class AnyChar(Unparameterized):
"""A parser that returns the next unicode character of the document """A parser that returns the next unicode character of the document
whatever that is. The parser fails only at the very end of the text.""" whatever that is. The parser fails only at the very end of the text."""
def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]: def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
...@@ -1853,6 +1926,9 @@ class Text(Parser): ...@@ -1853,6 +1926,9 @@ class Text(Parser):
return '`%s`' % abbreviate_middle(self.text, 80) return '`%s`' % abbreviate_middle(self.text, 80)
# return ("'%s'" if self.text.find("'") <= 0 else '"%s"') % abbreviate_middle(self.text, 80) # return ("'%s'" if self.text.find("'") <= 0 else '"%s"') % abbreviate_middle(self.text, 80)
def _signature(self) -> Hashable:
return self.__class__.__name__, self.text
class RegExp(Parser): class RegExp(Parser):
r""" r"""
...@@ -1914,6 +1990,9 @@ class RegExp(Parser): ...@@ -1914,6 +1990,9 @@ class RegExp(Parser):
return '/' + escape_ctrl_chars('%s' % abbreviate_middle(pattern, 118))\ return '/' + escape_ctrl_chars('%s' % abbreviate_middle(pattern, 118))\
.replace('/', '\\/') + '/' .replace('/', '\\/') + '/'
def _signature(self) -> Hashable:
return self.__class__.__name__, self.regexp.pattern
def DropText(text: str) -> Text: def DropText(text: str) -> Text:
return cast(Text, Drop(Text(text))) return cast(Text, Drop(Text(text)))
...@@ -2312,12 +2391,9 @@ class UnaryParser(CombinedParser): ...@@ -2312,12 +2391,9 @@ class UnaryParser(CombinedParser):
def sub_parsers(self) -> Tuple['Parser', ...]: def sub_parsers(self) -> Tuple['Parser', ...]:
return (self.parser,) return (self.parser,)
#
# def location_info(self) -> str: def _signature(self) -> Hashable:
# """Returns a description of the location of the parser within the grammar return self.__class__.__name__, self.parser.signature()
# for the purpose of transparent error reporting."""
# return "%s: %s%s(%s%s)" % (self.symbol, self.pname or '_', self.ptype,
# self.parser.pname or '_', self.parser.ptype)
class NaryParser(CombinedParser): class NaryParser(CombinedParser):
...@@ -2348,6 +2424,9 @@ class NaryParser(CombinedParser): ...@@ -2348,6 +2424,9 @@ class NaryParser(CombinedParser):
def sub_parsers(self) -> Tuple['Parser', ...]: def sub_parsers(self) -> Tuple['Parser', ...]:
return self.parsers return self.parsers
def _signature(self) -> Hashable:
return (self.__class__.__name__,) + tuple(p.signature() for p in self.parsers)
class Option(UnaryParser): class Option(UnaryParser):
r""" r"""
...@@ -2594,6 +2673,9 @@ class Counted(UnaryParser): ...@@ -2594,6 +2673,9 @@ class Counted(UnaryParser):
BAD_REPETITION_COUNT)) BAD_REPETITION_COUNT))
return errors return errors
def _signature(self) -> Hashable:
return self.__class__.__name__, self.parser.signature(), self.repetitions
NO_MANDATORY = 2**30 NO_MANDATORY = 2**30
...@@ -3140,6 +3222,10 @@ class Interleave(MandatoryNary): ...@@ -3140,6 +3222,10 @@ class Interleave(MandatoryNary):
BAD_REPETITION_COUNT)) BAD_REPETITION_COUNT))
return errors return errors
def _signature(self) -> Hashable:
return (self.__class__.__name__,) \
+ tuple(p.signature() for p in self.parsers) \
+ tuple(self.repetitions)
######################################################################## ########################################################################
# #
...@@ -3178,7 +3264,7 @@ class Lookahead(FlowParser): ...@@ -3178,7 +3264,7 @@ class Lookahead(FlowParser):
def static_analysis(self) -> List[AnalysisError]: def static_analysis(self) -> List[AnalysisError]:
errors = super().static_analysis() errors = super().static_analysis()
if self.parser.is_optional(): if self.parser.is_optional():
errors.append((self.pname, self, Error( errors.append(AnalysisError(self.pname, self, Error(
'Lookahead %s does not make sense with optional parser "%s"!' 'Lookahead %s does not make sense with optional parser "%s"!'
% (self.pname, str(self.parser)), % (self.pname, str(self.parser)),
0, LOOKAHEAD_WITH_OPTIONAL_PARSER))) 0, LOOKAHEAD_WITH_OPTIONAL_PARSER)))
...@@ -3324,6 +3410,9 @@ class ContextSensitive(UnaryParser): ...@@ -3324,6 +3410,9 @@ class ContextSensitive(UnaryParser):
rb_loc -= 1 rb_loc -= 1
return rb_loc return rb_loc
def _signature(self) -> Hashable:
return id(self)
class Capture(ContextSensitive): class Capture(ContextSensitive):
""" """
...@@ -3357,12 +3446,12 @@ class Capture(ContextSensitive): ...@@ -3357,12 +3446,12 @@ class Capture(ContextSensitive):
def static_analysis(self) -> List[AnalysisError]: def static_analysis(self) -> List[AnalysisError]:
errors = super().static_analysis() errors = super().static_analysis()
if not self.pname: if not self.pname:
errors.append((self.pname, self, Error( errors.append(AnalysisError(self.pname, self, Error(
'Capture only works as named parser! Error in parser: ' + str(self), 'Capture only works as named parser! Error in parser: ' + str(self),
0, CAPTURE_WITHOUT_PARSERNAME 0, CAPTURE_WITHOUT_PARSERNAME
))) )))
if self.parser.apply(lambda plist: plist[-1].drop_content): if self.parser.apply(lambda plist: plist[-1].drop_content):
errors.append((self.pname, self, Error( errors.append(AnalysisError(self.pname, self, Error(
'Captured symbol "%s" contains parsers that drop content, ' 'Captured symbol "%s" contains parsers that drop content, '
'which can lead to unintended results!' % (self.pname or str(self)), 'which can lead to unintended results!' % (self.pname or str(self)),
0, CAPTURE_DROPPED_CONTENT_WARNING 0, CAPTURE_DROPPED_CONTENT_WARNING
...@@ -3776,3 +3865,9 @@ class Forward(UnaryParser): ...@@ -3776,3 +3865,9 @@ class Forward(UnaryParser):
if is_parser_placeholder(self.parser): if is_parser_placeholder(self.parser):
return tuple() return tuple()
return self.parser, return self.parser,
def _signature(self) -> Hashable:
# This code should theoretically never be reached, except when debugging
signature = self.pname or self.parser.pname
assert signature
return signature
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment