Commit 5dc91c94 authored by Eckhart Arnold
Browse files

- parse.py: DropWhitespace renamed to DropRegExp + more refined synonym support

parent 8233c3e6
......@@ -34,7 +34,7 @@ from DHParser.compile import CompilerError, Compiler, compile_source, visitor_na
from DHParser.configuration import THREAD_LOCALS, get_config_value
from DHParser.error import Error
from DHParser.parse import Grammar, mixin_comment, mixin_noempty, Forward, RegExp, \
DropWhitespace, NegativeLookahead, Alternative, Series, Option, OneOrMore, ZeroOrMore, \
DropRegExp, NegativeLookahead, Alternative, Series, Option, OneOrMore, ZeroOrMore, \
Token, GrammarError
from DHParser.preprocess import nil_preprocessor, PreprocessorFunc
from DHParser.syntaxtree import Node, WHITESPACE_PTYPE, TOKEN_PTYPE
......@@ -83,7 +83,7 @@ try:
except ImportError:
import re
from DHParser import start_logging, suspend_logging, resume_logging, is_filename, load_if_file, \\
Grammar, Compiler, nil_preprocessor, PreprocessorToken, Whitespace, DropWhitespace, \\
Grammar, Compiler, nil_preprocessor, PreprocessorToken, Whitespace, DropRegExp, \\
Lookbehind, Lookahead, Alternative, Pop, Token, DropToken, Synonym, AllOf, SomeOf, \\
Unordered, Option, NegativeLookbehind, OneOrMore, RegExp, Retrieve, Series, Capture, \\
ZeroOrMore, Forward, NegativeLookahead, Required, mixin_comment, compile_source, \\
......@@ -189,7 +189,7 @@ class EBNFGrammar(Grammar):
COMMENT__ = r'#.*(?:\n|$)'
WHITESPACE__ = r'\s*'
WSP_RE__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
wsp__ = DropWhitespace(WSP_RE__)
wsp__ = DropRegExp(WSP_RE__)
EOF = NegativeLookahead(RegExp('.'))
whitespace = Series(RegExp('~'), wsp__)
regexp = Series(RegExp('/(?:(?<!\\\\)\\\\(?:/)|[^/])*?/'), wsp__)
......@@ -807,7 +807,7 @@ class EBNFCompiler(Compiler):
if DROP_WSPC in self.directives.drop:
definitions.append((EBNFCompiler.DROP_WHITESPACE_PARSER_KEYWORD,
'DropWhitespace(%s)' % EBNFCompiler.WHITESPACE_KEYWORD))
'DropRegExp(%s)' % EBNFCompiler.WHITESPACE_KEYWORD))
definitions.append((EBNFCompiler.WHITESPACE_PARSER_KEYWORD,
'Whitespace(%s)' % EBNFCompiler.WHITESPACE_KEYWORD))
definitions.append((EBNFCompiler.WHITESPACE_KEYWORD,
......
......@@ -59,7 +59,7 @@ __all__ = ('Parser',
'RE',
'TKN',
'Whitespace',
'DropWhitespace',
'DropRegExp',
'mixin_comment',
'mixin_noempty',
'MetaParser',
......@@ -1486,10 +1486,11 @@ class Whitespace(RegExp):
return '~'
class DropWhitespace(Whitespace):
class DropRegExp(Whitespace):
"""
Parses whitespace but never returns it. Instead EMPTY_NODE is returned
on a match. Violates the invariant: str(parse(text)) == text !
Parses a text with a regular expression but never returns the match.
Instead EMPTY_NODE is returned on a match.
Violates the invariant: str(parse(text)) == text !
"""
def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
......@@ -2516,16 +2517,20 @@ class Synonym(UnaryParser):
RegExp('\d\d\d\d') carries the name 'JAHRESZAHL' or 'jahr'.
"""
def __init__(self, parser: Parser) -> None:
assert not (isinstance(parser, DropWhitespace) or isinstance(parser, DropToken))
assert not (isinstance(parser, DropRegExp) or isinstance(parser, DropToken))
super(Synonym, self).__init__(parser)
def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
node, text = self.parser(text)
node, text = self.parser._parse(text) # circumvent Parser.__call__
if node:
if node == EMPTY_NODE:
return Node(self.tag_name, ''), text
return Node(self.tag_name, node), text
return None, text
if self.anonymous:
if node.tag_name[0] != ':': # implies != EMPTY_NODE
node.tag_name = self.tag_name
else:
if node == EMPTY_NODE:
return Node(self.tag_name, ''), text
node.tag_name = self.tag_name
return node, text
def __repr__(self):
return self.pname or self.parser.repr
......
......@@ -21,7 +21,7 @@ try:
except ImportError:
import re
from DHParser import start_logging, is_filename, load_if_file, \
Grammar, Compiler, nil_preprocessor, PreprocessorToken, Whitespace, DropWhitespace, \
Grammar, Compiler, nil_preprocessor, PreprocessorToken, Whitespace, DropRegExp, \
Lookbehind, Lookahead, Alternative, Pop, Token, DropToken, Synonym, AllOf, SomeOf, \
Unordered, Option, NegativeLookbehind, OneOrMore, RegExp, Retrieve, Series, Capture, \
ZeroOrMore, Forward, NegativeLookahead, Required, mixin_comment, compile_source, \
......@@ -69,7 +69,8 @@ class ArithmeticGrammar(Grammar):
comment_rx__ = re.compile(COMMENT__)
WHITESPACE__ = r'\s*'
WSP_RE__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
dwsp__ = DropWhitespace(WSP_RE__)
wsp__ = Whitespace(WSP_RE__)
dwsp__ = DropRegExp(WSP_RE__)
VARIABLE = Series(RegExp('[A-Za-z]'), dwsp__)
NUMBER = Series(RegExp('(?:0|(?:[1-9]\\d*))(?:\\.\\d+)?'), dwsp__)
NEGATIVE = RegExp('[-]')
......
......@@ -21,7 +21,7 @@ try:
except ImportError:
import re
from DHParser import start_logging, is_filename, load_if_file, \
Grammar, Compiler, nil_preprocessor, PreprocessorToken, Whitespace, DropWhitespace, \
Grammar, Compiler, nil_preprocessor, PreprocessorToken, Whitespace, DropRegExp, \
Lookbehind, Lookahead, Alternative, Pop, Token, DropToken, Synonym, AllOf, SomeOf, \
Unordered, Option, NegativeLookbehind, OneOrMore, RegExp, Retrieve, Series, Capture, \
ZeroOrMore, Forward, NegativeLookahead, Required, mixin_comment, compile_source, \
......@@ -73,7 +73,8 @@ class ArithmeticRightRecursiveGrammar(Grammar):
comment_rx__ = re.compile(COMMENT__)
WHITESPACE__ = r'\s*'
WSP_RE__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
dwsp__ = DropWhitespace(WSP_RE__)
wsp__ = Whitespace(WSP_RE__)
dwsp__ = DropRegExp(WSP_RE__)
VARIABLE = RegExp('[a-dj-z]')
NUMBER = RegExp('(?:0|(?:[1-9]\\d*))(?:\\.\\d+)?')
MINUS = RegExp('-')
......
......@@ -21,7 +21,7 @@ try:
except ImportError:
import re
from DHParser import start_logging, is_filename, load_if_file, \
Grammar, Compiler, nil_preprocessor, PreprocessorToken, Whitespace, DropWhitespace, \
Grammar, Compiler, nil_preprocessor, PreprocessorToken, Whitespace, DropRegExp, \
Lookbehind, Lookahead, Alternative, Pop, Token, DropToken, Synonym, AllOf, SomeOf, \
Unordered, Option, NegativeLookbehind, OneOrMore, RegExp, Retrieve, Series, Capture, \
ZeroOrMore, Forward, NegativeLookahead, Required, mixin_comment, compile_source, \
......@@ -70,7 +70,8 @@ class ArithmeticRightRecursiveGrammar(Grammar):
comment_rx__ = re.compile(COMMENT__)
WHITESPACE__ = r'\s*'
WSP_RE__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
dwsp__ = DropWhitespace(WSP_RE__)
wsp__ = Whitespace(WSP_RE__)
dwsp__ = DropRegExp(WSP_RE__)
VARIABLE = Series(RegExp('[A-Za-z]'), dwsp__)
NUMBER = Series(RegExp('(?:0|(?:[1-9]\\d*))(?:\\.\\d+)?'), dwsp__)
NEGATIVE = RegExp('[-]')
......
......@@ -21,7 +21,7 @@ try:
except ImportError:
import re
from DHParser import start_logging, is_filename, load_if_file, \
Grammar, Compiler, nil_preprocessor, PreprocessorToken, Whitespace, DropWhitespace, \
Grammar, Compiler, nil_preprocessor, PreprocessorToken, Whitespace, DropRegExp, \
Lookbehind, Lookahead, Alternative, Pop, Token, DropToken, Synonym, AllOf, SomeOf, \
Unordered, Option, NegativeLookbehind, OneOrMore, RegExp, Retrieve, Series, Capture, \
ZeroOrMore, Forward, NegativeLookahead, Required, mixin_comment, compile_source, \
......@@ -69,7 +69,8 @@ class ArithmeticSimpleGrammar(Grammar):
comment_rx__ = re.compile(COMMENT__)
WHITESPACE__ = r'\s*'
WSP_RE__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
dwsp__ = DropWhitespace(WSP_RE__)
wsp__ = Whitespace(WSP_RE__)
dwsp__ = DropRegExp(WSP_RE__)
VARIABLE = Series(RegExp('[A-Za-z]'), dwsp__)
NUMBER = Series(RegExp('(?:0|(?:[1-9]\\d*))(?:\\.\\d+)?'), dwsp__)
NEGATIVE = RegExp('[-]')
......
......@@ -73,121 +73,178 @@ Match-test "entry"
### AST
entry
:RegExp "@"
:RegExp
"@"
type
WORD "Online"
:Token "{"
"Online"
:Token
"{"
key
NO_BLANK_STRING "wikipedia-duhem-quine"
:Token ","
"wikipedia-duhem-quine"
:Token
","
:Whitespace
" % A"
" "
field
WORD
:RegExp "editor"
:Whitespace " "
:Token "="
:Whitespace " "
:RegExp
"editor"
:Whitespace
" "
:Token
"="
:Whitespace
" "
content
:Token "{"
:Token
"{"
text
CONTENT_STRING "Wikipedia"
:Token "}"
:Token ","
CONTENT_STRING
"Wikipedia"
:Token
"}"
:Token
","
:Whitespace
""
" "
field
WORD
:RegExp "title"
:Whitespace " "
:Token "="
:Whitespace " "
:RegExp
"title"
:Whitespace
" "
:Token
"="
:Whitespace
" "
content
:Token "{"
:Token
"{"
text
CONTENT_STRING
:RegExp "Duhem-Quine"
WS " "
:RegExp "thesis"
:Token "}"
:Token ","
:RegExp
"Duhem-Quine"
WS
" "
:RegExp
"thesis"
:Token
"}"
:Token
","
:Whitespace
" % B"
" "
field
WORD
:RegExp "year"
:Whitespace " "
:Token "="
:Whitespace " "
:RegExp
"year"
:Whitespace
" "
:Token
"="
:Whitespace
" "
content
:Token "{"
:Token
"{"
text
CONTENT_STRING "2017"
:Token "}"
:Token ","
CONTENT_STRING
"2017"
:Token
"}"
:Token
","
:Whitespace
""
" "
field
WORD
:RegExp "date"
:Whitespace " "
:Token "="
:Whitespace " "
:RegExp
"date"
:Whitespace
" "
:Token
"="
:Whitespace
" "
content
:Token "{"
:Token
"{"
text
CONTENT_STRING
:RegExp "2017-08-19"
:RegExp
"2017-08-19"
WS
" % C"
" "
:Token "}"
:Token ","
:Token
"}"
:Token
","
:Whitespace
""
" "
field
WORD
:RegExp "url"
:Whitespace " "
:Token "="
:Whitespace " "
:RegExp
"url"
:Whitespace
" "
:Token
"="
:Whitespace
" "
content
:Token "{"
:Token
"{"
text
CONTENT_STRING
:RegExp "https://en.wikipedia.org/w/index.php?title=Duhem\"
ESC "%"
:RegExp "E2\"
ESC "%"
:RegExp "80\"
ESC "%"
:RegExp "93Quine\"
ESC "_"
:RegExp "thesis\"
ESC "&"
:RegExp "oldid=772834991"
:Token "}"
:Token ","
:RegExp
"https://en.wikipedia.org/w/index.php?title=Duhem\"
ESC
"%"
:RegExp
"E2\"
ESC
"%"
:RegExp
"80\"
ESC
"%"
:RegExp
"93Quine\"
ESC
"_"
:RegExp
"thesis\"
ESC
"&"
:RegExp
"oldid=772834991"
:Token
"}"
:Token
","
:Whitespace
""
" "
field
WORD
:RegExp "organization"
:Whitespace " "
:Token "="
:Whitespace " "
:RegExp
"organization"
:Whitespace
" "
:Token
"="
:Whitespace
" "
content
:Token "{"
:Token
"{"
text
CONTENT_STRING "Wikipedia"
:Token "}"
CONTENT_STRING
"Wikipedia"
:Token
"}"
:Whitespace
""
""
:Token "}"
\ No newline at end of file
:Token
"}"
\ No newline at end of file
......@@ -21,7 +21,7 @@ try:
except ImportError:
import re
from DHParser import start_logging, suspend_logging, resume_logging, is_filename, load_if_file, \
Grammar, Compiler, nil_preprocessor, PreprocessorToken, Whitespace, DropWhitespace, \
Grammar, Compiler, nil_preprocessor, PreprocessorToken, Whitespace, DropRegExp, \
Lookbehind, Lookahead, Alternative, Pop, Token, DropToken, Synonym, AllOf, SomeOf, \
Unordered, Option, NegativeLookbehind, OneOrMore, RegExp, Retrieve, Series, Capture, \
ZeroOrMore, Forward, NegativeLookahead, Required, mixin_comment, compile_source, \
......@@ -71,7 +71,8 @@ class EBNFGrammar(Grammar):
comment_rx__ = re.compile(COMMENT__)
WHITESPACE__ = r'\s*'
WSP_RE__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
dwsp__ = DropWhitespace(WSP_RE__)
wsp__ = Whitespace(WSP_RE__)
dwsp__ = DropRegExp(WSP_RE__)
EOF = NegativeLookahead(RegExp('.'))
whitespace = Series(RegExp('~'), dwsp__)
regexp = Series(RegExp('/(?:(?<!\\\\)\\\\(?:/)|[^/])*?/'), dwsp__)
......
......@@ -21,7 +21,7 @@ try:
except ImportError:
import re
from DHParser import is_filename, Grammar, Compiler, Lookbehind, Alternative, Pop, \
Synonym, Whitespace, DropWhitespace, Token, DropToken, \
Synonym, Whitespace, DropRegExp, Token, DropToken, \
Option, NegativeLookbehind, OneOrMore, RegExp, Series, Capture, Lookahead, \
ZeroOrMore, Forward, NegativeLookahead, mixin_comment, compile_source, \
PreprocessorFunc, TransformationDict, \
......@@ -73,7 +73,7 @@ class LaTeXGrammar(Grammar):
whitespace__ = Whitespace(WHITESPACE__)
WSP_RE__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
wsp__ = Whitespace(WSP_RE__)
dwsp__ = DropWhitespace(WSP_RE__)
dwsp__ = DropRegExp(WSP_RE__)
EOF = RegExp('(?!.)')
BACKSLASH = RegExp('[\\\\]')
LB = RegExp('\\s*?\\n|$')
......
......@@ -26,7 +26,7 @@ from DHParser import start_logging, is_filename, load_if_file, \
Option, NegativeLookbehind, OneOrMore, RegExp, Retrieve, Series, Capture, \
ZeroOrMore, Forward, NegativeLookahead, Required, mixin_comment, compile_source, \
grammar_changed, last_value, counterpart, accumulate, PreprocessorFunc, \
Node, TransformationFunc, TransformationDict, Token, DropToken, DropWhitespace, \
Node, TransformationFunc, TransformationDict, Token, DropToken, DropRegExp, \
traverse, remove_children_if, is_anonymous, access_thread_locals, \
reduce_single_child, replace_by_single_child, replace_or_reduce, remove_whitespace, \
remove_empty, remove_tokens, flatten, is_insignificant_whitespace, \
......@@ -76,7 +76,8 @@ class XMLGrammar(Grammar):
comment_rx__ = re.compile(COMMENT__)
WHITESPACE__ = r'\s*'
WSP_RE__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
dwsp__ = DropWhitespace(WSP_RE__)
wsp__ = Whitespace(WSP_RE__)
dwsp__ = DropRegExp(WSP_RE__)
EOF = NegativeLookahead(RegExp('.'))
S = RegExp('\\s+')
Char = RegExp('\\x09|\\x0A|\\x0D|[\\u0020-\\uD7FF]|[\\uE000-\\uFFFD]|[\\U00010000-\\U0010FFFF]')
......
......@@ -21,7 +21,7 @@ try:
except ImportError:
import re
from DHParser import start_logging, is_filename, load_if_file, Grammar, Compiler, nil_preprocessor, \
PreprocessorToken, Whitespace, DropWhitespace, DropToken, \
PreprocessorToken, Whitespace, DropRegExp, DropToken, \
Lookbehind, Lookahead, Alternative, Pop, Token, Synonym, AllOf, SomeOf, Unordered, \
Option, NegativeLookbehind, OneOrMore, RegExp, Retrieve, Series, Capture, \
ZeroOrMore, Forward, NegativeLookahead, Required, mixin_comment, compile_source, \
......@@ -70,7 +70,8 @@ class XMLSnippetGrammar(Grammar):
comment_rx__ = re.compile(COMMENT__)
WHITESPACE__ = r'\s*'
WSP_RE__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
dwsp__ = DropWhitespace(WSP_RE__)
wsp__ = Whitespace(WSP_RE__)
dwsp__ = DropRegExp(WSP_RE__)
EOF = NegativeLookahead(RegExp('.'))
S = RegExp('\\s+')
Char = RegExp('\\x09|\\x0A|\\x0D|[\\u0020-\\uD7FF]|[\\uE000-\\uFFFD]|[\\U00010000-\\U0010FFFF]')
......
......@@ -20,7 +20,7 @@ try:
except ImportError:
import re
from DHParser import start_logging, is_filename, load_if_file, \
Grammar, Compiler, nil_preprocessor, PreprocessorToken, Whitespace, DropWhitespace, \
Grammar, Compiler, nil_preprocessor, PreprocessorToken, Whitespace, DropRegExp, \
Lookbehind, Lookahead, Alternative, Pop, Token, DropToken, Synonym, AllOf, SomeOf, \
Unordered, Option, NegativeLookbehind, OneOrMore, RegExp, Retrieve, Series, Capture, \
ZeroOrMore, Forward, NegativeLookahead, Required, mixin_comment, compile_source, \
......@@ -71,7 +71,8 @@ class jsonGrammar(Grammar):
comment_rx__ = re.compile(COMMENT__)
WHITESPACE__ = r'\s*'
WSP_RE__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
dwsp__ = DropWhitespace(WSP_RE__)
wsp__ = Whitespace(WSP_RE__)
dwsp__ = DropRegExp(WSP_RE__)
_EOF = NegativeLookahead(RegExp('.'))
EXP = Option(Series(Alternative(DropToken("E"), DropToken("e")), Option(Alternative(DropToken("+"), DropToken("-"))), RegExp('[0-9]+')))
DOT = Token(".")
......
......@@ -21,7 +21,7 @@ try:
except ImportError:
import re
from DHParser import start_logging, is_filename, load_if_file, \
Grammar, Compiler, nil_preprocessor, PreprocessorToken, Whitespace, DropWhitespace, \
Grammar, Compiler, nil_preprocessor, PreprocessorToken, Whitespace, DropRegExp, \
Lookbehind, Lookahead, Alternative, Pop, Token, DropToken, Synonym, AllOf, SomeOf, \
Unordered, Option, NegativeLookbehind, OneOrMore, RegExp, Retrieve, Series, Capture, \
ZeroOrMore, Forward, NegativeLookahead, Required, mixin_comment, compile_source, \
......@@ -72,7 +72,8 @@ class yamlGrammar(Grammar):
comment_rx__ = re.compile(COMMENT__)
WHITESPACE__ = r'\s*'
WSP_RE__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
dwsp__ = DropWhitespace(WSP_RE__)
wsp__ = Whitespace(WSP_RE__)
dwsp__ = DropRegExp(WSP_RE__)
EOF = NegativeLookahead(RegExp('.'))
EXP = Option(Series(Alternative(DropToken("E"), DropToken("e")), Option(Alternative(DropToken("+"), DropToken("-"))), RegExp('[0-9]+')))
FRAC = Option(Series(DropToken("."), RegExp('[0-9]+')))
......
......@@ -391,7 +391,7 @@ class TestSynonymDetection:
grammar = grammar_provider(ebnf)()
assert grammar['a'].pname == 'a', grammar['a'].pname
assert grammar['b'].pname == 'b', grammar['b'].pname
assert grammar('b').as_sxpr().count('b') == 2
assert grammar('b').as_sxpr() == '(a "b")'
class TestFlowControlOperators:
......
......@@ -321,7 +321,7 @@ class TestRootNode:
# wrong
number = RE(r'\d+') | RE(r'\d+') + RE(r'\.') + RE(r'\d+')
result = str(Grammar(number)("3.1416"))
assert result.startswith('3 <<< Error on ".1416" | Parser stopped before end! trying to recover'), \
assert result.startswith('3 <<< Error on ".1416" | Parser stopped before end!'), \
str(result)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment