Commit eaa04b6f authored by Eckhart Arnold's avatar Eckhart Arnold
Browse files

- better error reporting if parser did not match at all

- further amendments to LaTeX example
parent e4a8c2fd
......@@ -66,18 +66,18 @@ try:
except ImportError:
import re
try:
from typing import Any, Callable, cast, Dict, Iterator, List, Set, Tuple, Union
from typing import Any, Callable, cast, Dict, Iterator, List, Set, Tuple, Union, Optional
# try:
# from typing import Collection
# except ImportError:
# pass
except ImportError:
from .typing34 import Any, Callable, cast, Dict, Iterator, List, Set, Tuple, Union
from .typing34 import Any, Callable, cast, Dict, Iterator, List, Set, Tuple, Union, Optional
from DHParser.toolkit import is_logging, log_dir, logfile_basename, escape_re, sane_parser_name
from DHParser.syntaxtree import WHITESPACE_PTYPE, TOKEN_PTYPE, ZOMBIE_PARSER, ParserBase, \
Node, TransformationFunc
from DHParser.toolkit import load_if_file, error_messages
from DHParser.toolkit import load_if_file, error_messages, line_col
__all__ = ('PreprocessorFunc',
'HistoryRecord',
......@@ -149,7 +149,7 @@ class HistoryRecord:
parser call, which is either MATCH, FAIL (i.e. no match)
or ERROR.
"""
__slots__ = ('call_stack', 'node', 'remaining')
__slots__ = ('call_stack', 'node', 'remaining', 'line_col')
MATCH = "MATCH"
ERROR = "ERROR"
......@@ -159,6 +159,12 @@ class HistoryRecord:
self.call_stack = call_stack # type: List['Parser']
self.node = node # type: Node
self.remaining = remaining # type: int
document = call_stack[-1].grammar.document__ if call_stack else ''
self.line_col = line_col(document, len(document) - remaining) # type: Tuple[int, int]
def __str__(self):
    """
    Renders the record as a single line of text consisting of the
    line and column stored in `self.line_col`, followed by
    `self.stack` and the string form of `self.node` in double quotes.
    """
    line_no = self.line_col[0]
    column_no = self.line_col[1]
    node_text = str(self.node)
    return 'line %i, column %i: %s "%s"' % (line_no, column_no, self.stack, node_text)
def err_msg(self) -> str:
    """
    Returns the error messages attached to the record's node as one
    string: the messages are joined with "; ", every newline inside
    them is replaced by a backslash, and the result is prefixed with
    the ERROR status tag and a colon.
    """
    joined_errors = "; ".join(self.node._errors)
    single_line = joined_errors.replace('\n', '\\')
    return self.ERROR + ": " + single_line
......@@ -179,6 +185,43 @@ class HistoryRecord:
else slice(-self.remaining, None))
@staticmethod
def last_match(history: List['HistoryRecord']) -> Optional['HistoryRecord']:
"""
Returns the last match from the parsing-history.
Args:
history: the parsing-history as a list of HistoryRecord objects
Returns:
the history record of the last match or none if either history is
empty or no parser could match
"""
for record in reversed(history):
if record.status == HistoryRecord.MATCH:
return record
return None
@staticmethod
def most_advanced_match(history: List['HistoryRecord']) -> Optional['HistoryRecord']:
"""
Returns the closest-to-the-end-match from the parsing-history.
Args:
history: the parsing-history as a list of HistoryRecord objects
Returns:
the history record of the closest-to-the-end-match or none if either history is
empty or no parser could match
"""
remaining = -1
result = None
for record in history:
if (record.status == HistoryRecord.MATCH and
(record.remaining < remaining or remaining < 0)):
result = record
remaining = record.remaining
return result
def add_parser_guard(parser_func):
"""
Add a wrapper function to a parser function (i.e. Parser.__call__ method)
......@@ -323,6 +366,9 @@ class Parser(ParserBase, metaclass=ParserMetaClass):
sure that one and the same function will not be applied
(recursively) a second time, if it has already been
applied to this parser.
grammar: A reference to the Grammar object to which the parser
is attached.
"""
ApplyFunc = Callable[['Parser'], None]
......@@ -510,7 +556,7 @@ class Grammar:
Attributes:
all_parsers__: A set of all parsers connected to this grammar object
hostory_tracking: A flag indicating that the parsing history shall
history_tracking__: A flag indicating that the parsing history shall
be tracked
wsp_left_parser__: A parser for the default left-adjacent-whitespace
......@@ -725,7 +771,10 @@ class Grammar:
fwd = rest.find("\n") + 1 or len(rest)
skip, rest = rest[:fwd], rest[fwd:]
if result is None:
error_msg = "Parser did not match! Invalid source file?"
error_msg = 'Parser did not match! Invalid source file?' \
'\n Most advanced: %s\n Last match: %s;' % \
(str(HistoryRecord.most_advanced_match(self.history__)),
str(HistoryRecord.last_match(self.history__)))
else:
stitches.append(result)
error_msg = "Parser stopped before end" + \
......@@ -775,6 +824,7 @@ class Grammar:
which have been dismissed.
"""
self.rollback__.append((location, func))
# print("push: line %i, col %i" % line_col(self.document__, len(self.document__) - location))
self.last_rb__loc__ = location
......@@ -783,10 +833,15 @@ class Grammar:
Rolls back the variable stacks (`self.variables`) to its
state at an earlier location in the parsed document.
"""
# print("rollback: line %i, col %i" % line_col(self.document__, len(self.document__) - location))
while self.rollback__ and self.rollback__[-1][0] <= location:
loc, rollback_func = self.rollback__.pop()
assert not loc > self.last_rb__loc__
# assert not loc > self.last_rb__loc__, \
# "Rollback confusion: line %i, col %i < line %i, col %i" % \
# (*line_col(self.document__, len(self.document__) - loc),
# *line_col(self.document__, len(self.document__) - self.last_rb__loc__))
rollback_func()
# print("rb to: line %i, col %i" % line_col(self.document__, len(self.document__) - loc))
self.last_rb__loc__ == self.rollback__[-1][0] if self.rollback__ \
else (len(self.document__) + 1)
......
......@@ -68,7 +68,8 @@ figure = "\begin{figure}" sequence §"\end{figure}"
quotation = ("\begin{quotation}" sequence §"\end{quotation}")
| ("\begin{quote}" sequence §"\end{quote}")
verbatim = "\begin{verbatim}" sequence §"\end{verbatim}"
tabular = "\begin{tabular}" tabular_config sequence §"\end{tabular}"
tabular = "\begin{tabular}" tabular_config { tabular_cell } §"\end{tabular}"
tabular_cell = sequence
tabular_config = "{" /[lcr|]+/~ §"}"
......@@ -96,14 +97,15 @@ inline_math = /\$/ /[^$]*/ §/\$/
#### commands ####
command = known_command | text_command | generic_command
known_command = footnote | includegraphics | caption
text_command = TXTCOMMAND | ESCAPED | BRACKETS
known_command = footnote | includegraphics | caption | multicolumn
text_command = TXTCOMMAND | ESCAPED | BRACKETS | LINEFEED
generic_command = !no_command CMDNAME [[ //~ config ] //~ block ]
footnote = "\footnote" block_of_paragraphs
includegraphics = "\includegraphics" [ config ] block
caption = "\caption" block
multicolumn = "\multicolumn" "{" INTEGER "}"
tabular_config block_of_paragraphs
#######################################################################
#
......@@ -112,7 +114,8 @@ caption = "\caption" block
#######################################################################
config = "[" text §"]"
config = "[" cfg_text §"]"
cfg_text = { ([//~] text) | CMDNAME }
block = /{/ //~ { !blockcmd text_element //~ } §/}/
text = TEXTCHUNK { //~ TEXTCHUNK }
......@@ -137,14 +140,16 @@ CMDNAME = /\\(?:(?!_)\w)+/~
TXTCOMMAND = /\\text\w+/
ESCAPED = /\\[%$&_\/{}]/
BRACKETS = /[\[\]]/ # left or right square bracket: [ ]
LINEFEED = /[\\][\\]/
NAME = /\w+/~
INTEGER = /\d+/~
TEXTCHUNK = /[^\\%$&\{\}\[\]\s\n]+/ # some piece of text excluding whitespace,
# linefeed and special characters
LF = !GAP /[ \t]*\n[ \t]*/ # linefeed but not an empty line
LFF = //~ -&LB [ WSPC ] # at least one linefeed
WSPC = { COMMENT__ | /\s+/ }+
LFF = ~/\n?/ -&LB [ WSPC ] # at least one linefeed
WSPC = { COMMENT__ | /\s+/ }+ # arbitrary horizontal or vertical whitespace
# WSPC = { /\s+/~ | ~/\s+/ }+ # arbitrary horizontal or vertical whitespace
PARSEP = { GAP }+ # paragraph separator
GAP = /[ \t]*(?:\n[ \t]*)+\n/~ # at least one empty line, i.e.
......
......@@ -117,7 +117,8 @@ class LaTeXGrammar(Grammar):
quotation = ("\begin{quotation}" sequence §"\end{quotation}")
| ("\begin{quote}" sequence §"\end{quote}")
verbatim = "\begin{verbatim}" sequence §"\end{verbatim}"
tabular = "\begin{tabular}" tabular_config sequence §"\end{tabular}"
tabular = "\begin{tabular}" tabular_config { tabular_cell } §"\end{tabular}"
tabular_cell = sequence
tabular_config = "{" /[lcr|]+/~ §"}"
......@@ -145,14 +146,15 @@ class LaTeXGrammar(Grammar):
#### commands ####
command = known_command | text_command | generic_command
known_command = footnote | includegraphics | caption
text_command = TXTCOMMAND | ESCAPED | BRACKETS
known_command = footnote | includegraphics | caption | multicolumn
text_command = TXTCOMMAND | ESCAPED | BRACKETS | LINEFEED
generic_command = !no_command CMDNAME [[ //~ config ] //~ block ]
footnote = "\footnote" block_of_paragraphs
includegraphics = "\includegraphics" [ config ] block
caption = "\caption" block
multicolumn = "\multicolumn" "{" INTEGER "}"
tabular_config block_of_paragraphs
#######################################################################
#
......@@ -161,7 +163,8 @@ class LaTeXGrammar(Grammar):
#######################################################################
config = "[" text §"]"
config = "[" cfg_text §"]"
cfg_text = { ([//~] text) | CMDNAME }
block = /{/ //~ { !blockcmd text_element //~ } §/}/
text = TEXTCHUNK { //~ TEXTCHUNK }
......@@ -186,14 +189,16 @@ class LaTeXGrammar(Grammar):
TXTCOMMAND = /\\text\w+/
ESCAPED = /\\[%$&_\/{}]/
BRACKETS = /[\[\]]/ # left or right square bracket: [ ]
LINEFEED = /[\\][\\]/
NAME = /\w+/~
INTEGER = /\d+/~
TEXTCHUNK = /[^\\%$&\{\}\[\]\s\n]+/ # some piece of text excluding whitespace,
# linefeed and special characters
LF = !GAP /[ \t]*\n[ \t]*/ # linefeed but not an empty line
LFF = //~ -&LB [ WSPC ] # at least one linefeed
WSPC = { COMMENT__ | /\s+/ }+
LFF = ~/\n?/ -&LB [ WSPC ] # at least one linefeed
WSPC = { COMMENT__ | /\s+/ }+ # arbitrary horizontal or vertical whitespace
# WSPC = { /\s+/~ | ~/\s+/ }+ # arbitrary horizontal or vertical whitespace
PARSEP = { GAP }+ # paragraph separator
GAP = /[ \t]*(?:\n[ \t]*)+\n/~ # at least one empty line, i.e.
......@@ -209,8 +214,9 @@ class LaTeXGrammar(Grammar):
block_of_paragraphs = Forward()
end_generic_block = Forward()
paragraph = Forward()
tabular_config = Forward()
text_element = Forward()
source_hash__ = "61275add092114be64d558c654b94bcb"
source_hash__ = "fc3ee1800932b561e9cec1e22aab7157"
parser_initialization__ = "upon instantiation"
COMMENT__ = r'%.*(?:\n|$)'
WSP__ = mixin_comment(whitespace=r'[ \t]*(?:\n(?![ \t]*\n)[ \t]*)?', comment=r'%.*(?:\n|$)')
......@@ -222,10 +228,12 @@ class LaTeXGrammar(Grammar):
GAP = RE('[ \\t]*(?:\\n[ \\t]*)+\\n')
PARSEP = OneOrMore(GAP)
WSPC = OneOrMore(Alternative(RegExp(COMMENT__), RegExp('\\s+')))
LFF = Series(RE(''), Lookbehind(LB), Optional(WSPC))
LFF = Series(RE('\\n?', wR='', wL=WSP__), Lookbehind(LB), Optional(WSPC))
LF = Series(NegativeLookahead(GAP), RegExp('[ \\t]*\\n[ \\t]*'))
TEXTCHUNK = RegExp('[^\\\\%$&\\{\\}\\[\\]\\s\\n]+')
INTEGER = RE('\\d+')
NAME = Capture(RE('\\w+'))
LINEFEED = RegExp('[\\\\][\\\\]')
BRACKETS = RegExp('[\\[\\]]')
ESCAPED = RegExp('\\\\[%$&_/{}]')
TXTCOMMAND = RegExp('\\\\text\\w+')
......@@ -235,13 +243,15 @@ class LaTeXGrammar(Grammar):
no_command = Alternative(Token("\\begin{"), Token("\\end"), Series(BACKSLASH, structural))
text = Series(TEXTCHUNK, ZeroOrMore(Series(RE(''), TEXTCHUNK)))
block = Series(RegExp('{'), RE(''), ZeroOrMore(Series(NegativeLookahead(blockcmd), text_element, RE(''))), Required(RegExp('}')))
config = Series(Token("["), text, Required(Token("]")))
cfg_text = ZeroOrMore(Alternative(Series(Optional(RE('')), text), CMDNAME))
config = Series(Token("["), cfg_text, Required(Token("]")))
multicolumn = Series(Token("\\multicolumn"), Token("{"), INTEGER, Token("}"), tabular_config, block_of_paragraphs)
caption = Series(Token("\\caption"), block)
includegraphics = Series(Token("\\includegraphics"), Optional(config), block)
footnote = Series(Token("\\footnote"), block_of_paragraphs)
generic_command = Series(NegativeLookahead(no_command), CMDNAME, Optional(Series(Optional(Series(RE(''), config)), RE(''), block)))
text_command = Alternative(TXTCOMMAND, ESCAPED, BRACKETS)
known_command = Alternative(footnote, includegraphics, caption)
text_command = Alternative(TXTCOMMAND, ESCAPED, BRACKETS, LINEFEED)
known_command = Alternative(footnote, includegraphics, caption, multicolumn)
command = Alternative(known_command, text_command, generic_command)
inline_math = Series(RegExp('\\$'), RegExp('[^$]*'), Required(RegExp('\\$')))
end_environment = Series(RegExp('\\\\end{'), Required(Pop(NAME)), Required(RegExp('}')))
......@@ -255,8 +265,9 @@ class LaTeXGrammar(Grammar):
paragraph.set(OneOrMore(Series(NegativeLookahead(blockcmd), text_element, RE(''))))
sequence = OneOrMore(Series(Alternative(paragraph, block_environment), Optional(PARSEP)))
block_of_paragraphs.set(Series(RE('{'), sequence, Required(RegExp('}'))))
tabular_config = Series(Token("{"), RE('[lcr|]+'), Required(Token("}")))
tabular = Series(Token("\\begin{tabular}"), tabular_config, sequence, Required(Token("\\end{tabular}")))
tabular_config.set(Series(Token("{"), RE('[lcr|]+'), Required(Token("}"))))
tabular_cell = Synonym(sequence)
tabular = Series(Token("\\begin{tabular}"), tabular_config, ZeroOrMore(tabular_cell), Required(Token("\\end{tabular}")))
verbatim = Series(Token("\\begin{verbatim}"), sequence, Required(Token("\\end{verbatim}")))
quotation = Alternative(Series(Token("\\begin{quotation}"), sequence, Required(Token("\\end{quotation}"))), Series(Token("\\begin{quote}"), sequence, Required(Token("\\end{quote}"))))
figure = Series(Token("\\begin{figure}"), sequence, Required(Token("\\end{figure}")))
......
[match:command]
1 : \includegraphics[width=\textwidth]{Graph.eps}
......@@ -38,10 +38,10 @@ One paragraph
Another paragraph
This is {\em an environment that {is \bf spanning} two paragraphs
% This is {\em an environment that {is \bf spanning} two paragraphs
In LaTeX it is allowed that environments continue} over several
paragraphs.
% In LaTeX it is allowed that environments continue} over several
% paragraphs.
% multiline
% comment
Continuation of paragraph.
......
#!/usr/bin/python3
"""tst_LaTeX_doc.py - tests with full documents in subdir 'testdata'
Author: Eckhart Arnold <arnold@badw.de>
Copyright 2017 Bavarian Academy of Sciences and Humanities
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import os
import sys
import DHParser.dsl
from DHParser import toolkit
sys.path.extend(['../../', '../', './'])
# recompiles Grammar only if it has changed
grammar_ok = DHParser.dsl.recompile_grammar('LaTeX.ebnf', force=False)
if not grammar_ok:
    # show the recorded error report and abort, because the script
    # cannot continue without a successfully compiled grammar
    print('\nErrors while recompiling "LaTeX.ebnf":\n--------------------------------------\n\n')
    with open('LaTeX_ebnf_ERRORS.txt') as error_report:
        print(error_report.read())
    sys.exit(1)
from LaTeXCompiler import get_grammar, get_transformer
# Grammar object from LaTeXCompiler; called below with the source text of a document.
parser = get_grammar()
# Transformer from LaTeXCompiler; called below with the result of a parser run.
transformer = get_transformer()
def fail_on_error(src, result):
    """
    Aborts the script if `result` carries an error flag.

    If `result.error_flag` is set, the S-expression of `result` and
    all error messages collected from it are printed and the script
    exits with status 1. Otherwise the function returns without
    doing anything.

    Args:
        src:     the source text that is passed on to
                 `toolkit.error_messages` together with the errors
        result:  object providing `error_flag`, `as_sxpr()` and
                 `collect_errors()`
    """
    if not result.error_flag:
        return
    print(result.as_sxpr())
    for message in toolkit.error_messages(src, result.collect_errors()):
        print(message)
    sys.exit(1)
with toolkit.logging(True):
    # process the documents of the 'testdata' directory in sorted order
    files = sorted(os.listdir('testdata'))
    for file in files:
        if not file.lower().endswith('.tex'):
            continue  # only LaTeX sources are parsed
        with open(os.path.join('testdata', file), 'r') as f:
            doc = f.read()
        print('\n\nParsing document: "%s"\n' % file)
        # parse, log the parsing history and stop at the first faulty document
        result = parser(doc)
        parser.log_parsing_history__()
        fail_on_error(doc, result)
        # transform the parsing result and check for errors again
        ast = transformer(result)
        fail_on_error(doc, ast)
        print(ast.as_sxpr())
......@@ -26,13 +26,14 @@ import DHParser.dsl
sys.path.extend(['../../', '../', './'])
from DHParser import testing
from DHParser import toolkit
if not DHParser.dsl.recompile_grammar('LaTeX.ebnf', force=True): # recompiles Grammar only if it has changed
if not DHParser.dsl.recompile_grammar('LaTeX.ebnf', force=False): # recompiles Grammar only if it has changed
print('\nErrors while recompiling "LaTeX.ebnf":\n--------------------------------------\n\n')
with open('LaTeX_ebnf_ERRORS.txt') as f:
print(f.read())
sys.exit(1)
from DHParser import toolkit
from LaTeXCompiler import get_grammar, get_transformer
with toolkit.logging(True):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment