Commit a4bed6e6 authored by Eckhart Arnold's avatar Eckhart Arnold

- Refactoring LaTeX.ebnf

parent a765df39
......@@ -133,12 +133,12 @@ StrictResultType = Union[ChildrenType, str]
ResultType = Union[ChildrenType, 'Node', str, None]
def flatten_sxpr(sxpr: str) -> str:
    """Returns S-expression `sxpr` as a one-liner without unnecessary
    whitespace.

    Every run of whitespace (including linefeeds) is first collapsed into
    a single blank; then any blank that directly precedes a closing
    parenthesis is removed and the result is stripped at both ends.

    Example:
    >>> flatten_sxpr('(a\\n (b\\n c\\n )\\n)\\n')
    '(a (b c))'
    """
    # Raw strings: `\s` and `\)` are regex escapes, not string escapes.
    # In a plain string literal they are invalid escape sequences.
    return re.sub(r'\s(?=\))', '', re.sub(r'\s+', ' ', sxpr)).strip()


# Former name of `flatten_sxpr`; kept as an alias for callers that still
# use the old name.
oneliner_sxpr = flatten_sxpr
......@@ -199,11 +199,13 @@ class Node:
self._pos = -1 # type: int
self.parser = parser or ZOMBIE_PARSER
def __str__(self):
    """Returns the string content of the node: the concatenated strings
    of all children or, if there are no children, the string form of
    the node's result."""
    if not self.children:
        return str(self.result)
    return "".join(map(str, self.children))
def __repr__(self):
mpargs = {'name': self.parser.name, 'ptype': self.parser.ptype}
parg = "MockParser({name}, {ptype})".format(**mpargs)
......@@ -211,30 +213,35 @@ class Node:
"(" + ", ".join(repr(child) for child in self.children) + ")"
return "Node(%s, %s)" % (parg, rarg)
def __eq__(self, other):
    """Two nodes are considered equal if they carry the same tag name
    and the same result."""
    if self.tag_name != other.tag_name:
        return False
    return self.result == other.result
def __hash__(self):
    """Hash value of the node, derived from its tag name only."""
    key = self.tag_name
    return hash(key)
def __deepcopy__(self, memodict=None):
    """Returns a deep copy of this node.

    The node's result is copied recursively and wrapped in a new `Node`
    for the same parser. Only the parser, the (copied) result and the
    position `_pos` are transferred here.

    Args:
        memodict: the memo dictionary handed in by `copy.deepcopy`. It is
            passed on to the recursive copy, so that objects which have
            already been copied are not copied a second time. `None`
            (the default for direct calls) starts a fresh memo.
    """
    result = copy.deepcopy(self.result, memodict)
    other = Node(self.parser, result)
    other._pos = self._pos
    return other
@property  # this needs to be a (dynamic) property, in case self.parser gets updated
def tag_name(self) -> str:
    """The name of the node's parser or, if that name is empty, the
    parser's type."""
    parser = self.parser
    return parser.name or parser.ptype
@property
def result(self) -> StrictResultType:
    """The node's result as stored in `_result`."""
    stored = self._result
    return stored
@result.setter
def result(self, result: ResultType):
# # made obsolete by static type checking with mypy is done
# # made obsolete by static type checking with mypy
# assert ((isinstance(result, tuple) and all(isinstance(child, Node) for child in result))
# or isinstance(result, Node)
# or isinstance(result, str)), str(result)
......@@ -244,15 +251,18 @@ class Node:
self.error_flag = any(r.error_flag for r in self._children) \
if self._children else False # type: bool
@property
def children(self) -> ChildrenType:
    """The node's children as stored in `_children`."""
    kids = self._children
    return kids
@property
def len(self) -> int:
    """The length value stored in `_len`."""
    # DEBUGGING: print(self.tag_name, str(self.pos), str(self._len), str(self)[:10].replace('\n','.'))
    length = self._len
    return length
@property
def pos(self) -> int:
assert self._pos >= 0, "position value not initialized!"
......@@ -267,16 +277,19 @@ class Node:
child.pos = pos + offset
offset += child.len
@property
def errors(self) -> List[Error]:
    """The error messages recorded on this node, each wrapped in an
    `Error` together with the node's position."""
    collected = []
    for message in self._errors:
        # `self.pos` is read per message on purpose: it must not be
        # evaluated at all when no errors are recorded.
        collected.append(Error(self.pos, message))
    return collected
def add_error(self, error_str: str) -> 'Node':
    """Records the error message `error_str` on this node and sets the
    node's error flag.

    Returns the node itself, so that the call can be used in an
    expression.
    """
    assert isinstance(error_str, str)

    messages = self._errors
    messages.append(error_str)
    self.error_flag = True

    return self
def propagate_error_flags(self) -> None:
"""Recursively propagates error flags set on child nodes to its
parents. This can be used if errors are added to descendant
......@@ -286,6 +299,7 @@ class Node:
child.propagate_error_flags()
self.error_flag = self.error_flag or child.error_flag
def collect_errors(self, clear_errors=False) -> List[Error]:
"""
Returns all errors of this node or any child node in the form
......@@ -301,6 +315,7 @@ class Node:
errors.extend(child.collect_errors(clear_errors))
return errors
def _tree_repr(self, tab, openF, closeF, dataF=identity, density=0) -> str:
"""
Generates a tree representation of this node and its children
......@@ -346,6 +361,7 @@ class Node:
else:
return head + '\n'.join([tab + dataF(s) for s in res.split('\n')]) + tail.lstrip(D)
def as_sxpr(self, src: str=None) -> str:
"""
Returns content as S-expression, i.e. in lisp-like form.
......@@ -373,6 +389,7 @@ class Node:
return self._tree_repr(' ', opening, lambda node: '\n)', pretty, density=0)
def as_xml(self, src: str=None) -> str:
"""
Returns content as XML-tree.
......@@ -397,10 +414,12 @@ class Node:
return self._tree_repr(' ', opening, closing, density=1)
def structure(self) -> str:
    """Return structure (and content) as S-expression on a single line
    without any line breaks."""
    # Only the renamed helper is called; the stale duplicate `return`
    # that used the former name `oneliner_sxpr` has been dropped.
    return flatten_sxpr(self.as_sxpr())
def content(self) -> str:
"""
......@@ -412,6 +431,7 @@ class Node:
return (
' <<< Error on "%s" | %s >>> ' % (s, '; '.join(self._errors))) if self._errors else s
def find(self, match_function: Callable) -> Iterator['Node']:
"""Finds nodes in the tree that match a specific criterion.
......@@ -433,6 +453,7 @@ class Node:
for nd in child.find(match_function):
yield nd
# def range(self, match_first, match_last):
# """Iterates over the range of nodes, starting from the first
# node for which ``match_first`` becomes True until the first node
......@@ -473,13 +494,14 @@ class Node:
# return self.result,
# return nav(path.split('/'))
def log(self, log_file_name):
    """Writes the S-expression of this node to the file `log_file_name`
    inside the logging directory.

    Nothing is written unless `is_logging()` returns a true value. The
    file is opened for writing with UTF-8 encoding, so an existing file
    of the same name is overwritten.
    """
    if is_logging():
        # The file is opened exactly once, directly under the name passed in.
        with open(os.path.join(log_dir(), log_file_name), "w", encoding="utf-8") as f:
            f.write(self.as_sxpr())
def mock_syntax_tree(sxpr):
"""
Generates a tree of nodes from an S-expression.
......
......@@ -27,8 +27,7 @@ except ImportError:
import re
from DHParser import error_messages
from DHParser.toolkit import is_logging
from DHParser.syntaxtree import mock_syntax_tree, oneliner_sxpr
from DHParser.syntaxtree import mock_syntax_tree, flatten_sxpr
__all__ = ('unit_from_configfile',
'unit_from_json',
......@@ -150,15 +149,15 @@ def grammar_unit(test_unit, parser_factory, transformer_factory, report=True, ve
infostr = ' match-test "' + test_name + '" ... '
errflag = len(errata)
cst = parser(test_code, parser_name)
cst.log("match_%s_%s.cst" % (parser_name, test_name))
cst.log("%s_match_%s_%s.cst" % (unit_name, parser_name, test_name))
tests.setdefault('__cst__', {})[test_name] = cst
if "ast" in tests or report:
ast = copy.deepcopy(cst)
transform(ast)
tests.setdefault('__ast__', {})[test_name] = ast
ast.log("match_%s_%s.ast" % (parser_name, test_name))
ast.log("%s_match_%s_%s.ast" % (unit_name, parser_name, test_name))
if cst.error_flag:
errata.append('Match test "%s" for parser "%s" failed:\n\tExpr.: %s\n\n\t%s' %
errata.append('Match test "%s" for parser "%s" failed:\n\tExpr.: %s\n\n\t%s\n\n' %
(test_name, parser_name, '\n\t'.join(test_code.split('\n')),
'\n\t'.join(m.replace('\n', '\n\t\t') for m in
error_messages(test_code, cst.collect_errors()))))
......@@ -174,8 +173,8 @@ def grammar_unit(test_unit, parser_factory, transformer_factory, report=True, ve
errata.append('Abstract syntax tree test "%s" for parser "%s" failed:'
'\n\tExpr.: %s\n\tExpected: %s\n\tReceived: %s'
% (test_name, parser_name, '\n\t'.join(test_code.split('\n')),
oneliner_sxpr(compare.as_sxpr()),
oneliner_sxpr(ast.as_sxpr())))
flatten_sxpr(compare.as_sxpr()),
flatten_sxpr(ast.as_sxpr())))
tests.setdefault('__err__', {})[test_name] = errata[-1]
if verbose:
print(infostr + ("OK" if len(errata) == errflag else "FAIL"))
......@@ -187,8 +186,6 @@ def grammar_unit(test_unit, parser_factory, transformer_factory, report=True, ve
infostr = ' fail-test "' + test_name + '" ... '
errflag = len(errata)
cst = parser(test_code, parser_name)
# doesn't make sense to write cst for fail-tests
# cst.log("fail_%s_%s.cst" % (parser_name, test_name))
if not cst.error_flag:
errata.append('Fail test "%s" for parser "%s" yields match instead of '
'expected failure!' % (test_name, parser_name))
......
......@@ -54,11 +54,11 @@ Index = "\printindex" [PARSEP]
#### block environments ####
block_environment = known_environment | generic_block
known_environment = itemize | enumerate | figure | table | quotation
known_environment = itemize | enumerate | figure | tabular | quotation
| verbatim
generic_block = begin_generic_block sequence §end_generic_block
begin_generic_block = -&LB begin_environment -&LB
end_generic_block = -&LB end_environment -&LB
begin_generic_block = -&LB begin_environment LFF
end_generic_block = -&LB end_environment LFF
itemize = "\begin{itemize}" [PARSEP] { item } §"\end{itemize}"
enumerate = "\begin{enumerate}" [PARSEP] {item } §"\end{enumerate}"
......@@ -68,13 +68,13 @@ figure = "\begin{figure}" sequence §"\end{figure}"
quotation = ("\begin{quotation}" sequence §"\end{quotation}")
| ("\begin{quote}" sequence §"\end{quote}")
verbatim = "\begin{verbatim}" sequence §"\end{verbatim}"
table = "\begin{tabular}" table_config sequence §"\end{tabular}"
table_config = "{" /[lcr|]+/~ §"}"
tabular = "\begin{tabular}" tabular_config sequence §"\end{tabular}"
tabular_config = "{" /[lcr|]+/~ §"}"
#### paragraphs and sequences of paragraphs ####
block_of_paragraphs = /{/ sequence §/}/
block_of_paragraphs = /{/~ sequence §/}/
sequence = { (paragraph | block_environment ) [PARSEP] }+
paragraph = { !blockcmd text_element //~ }+
......@@ -85,20 +85,21 @@ text_element = command | text | block | inline_environment
inline_environment = known_inline_env | generic_inline_env
known_inline_env = inline_math
generic_inline_env = (begin_inline_env { text_element }+ §end_inline_env)
begin_inline_env = (-!LB begin_environment) | (begin_environment -!LB)
generic_inline_env = begin_inline_env //~ paragraph §end_inline_env
begin_inline_env = (-!LB begin_environment) | (begin_environment !LFF)
end_inline_env = end_environment
# (-!LB end_environment) | (end_environment -!LB) # ambiguity with genric_block when EOF
begin_environment = "\begin{" §NAME §"}"
end_environment = "\end{" §::NAME §"}"
## (-!LB end_environment) | (end_environment !LFF) # ambiguity with genric_block when EOF
begin_environment = /\\begin{/ §NAME §/}/
end_environment = /\\end{/ §::NAME §/}/
inline_math = "$" /[^$]*/ §"$"
inline_math = /\$/ /[^$]*/ §/\$/
#### commands ####
command = known_command | generic_command
command = known_command | text_command | generic_command
known_command = footnote | includegraphics | caption
text_command = TXTCOMMAND | ESCAPED | BRACKETS
generic_command = !no_command CMDNAME [[ //~ config ] //~ block ]
footnote = "\footnote" block_of_paragraphs
......@@ -113,12 +114,9 @@ caption = "\caption" block
#######################################################################
config = "[" cfgtext §"]"
block = /{/ { text_element } §/}/
text = { cfgtext | (BRACKETS //~) }+
cfgtext = { word_sequence | (ESCAPED //~) }+
word_sequence = { TEXTCHUNK //~ }+
config = "[" text §"]"
block = /{/ //~ { !blockcmd text_element //~ } §/}/
text = TEXTCHUNK { //~ TEXTCHUNK }
no_command = "\begin{" | "\end" | BACKSLASH structural
blockcmd = BACKSLASH ( ( "begin{" | "end{" )
......@@ -138,13 +136,17 @@ structural = "subsection" | "section" | "chapter" | "subsubsection"
CMDNAME = /\\(?:(?!_)\w)+/~
TXTCOMMAND = /\\text\w+/
ESCAPED = /\\[%$&_\/{}]/
BRACKETS = /[\[\]]/ # left or right square bracket: [ ]
NAME = /\w+/~
ESCAPED = /\\[%$&_\/]/
BRACKETS = /[\[\]]/ # left or right square bracket: [ ]
TEXTCHUNK = /[^\\%$&\{\}\[\]\s\n]+/ # some piece of text excluding whitespace,
# linefeed and special characters
LF = !GAP /[ \t]*\n[ \t]*/ # linefeed but not an empty line
LFF = //~ -&LB WSPC # at least one linefeed
WSPC = { ~/\s+/~ } # arbitrary horizontal or vertical whitespace
PARSEP = { GAP }+ # paragraph separator
GAP = /[ \t]*(?:\n[ \t]*)+\n/~ # at least one empty line, i.e.
# [whitespace] linefeed [whitespace] linefeed
......
......@@ -107,27 +107,27 @@ class LaTeXGrammar(Grammar):
#### block environments ####
block_environment = known_environment | generic_block
known_environment = itemize | enumerate | figure | table | quotation
known_environment = itemize | enumerate | figure | tabular | quotation
| verbatim
generic_block = begin_generic_block sequence §end_generic_block
begin_generic_block = -&LB begin_environment -&LB
end_generic_block = -&LB end_environment -&LB
begin_generic_block = -&LB begin_environment LFF
end_generic_block = -&LB end_environment LFF
itemize = "\begin{itemize}" [PARSEP] { item } §"\end{itemize}"
enumerate = "\begin{enumerate}" [PARSEP] {item } §"\end{enumerate}"
item = "\item" [PARSEP] sequence
figure = "\begin{figure}" sequence "\end{figure}"
quotation = ("\begin{quotation}" sequence "\end{quotation}")
| ("\begin{quote}" sequence "\end{quote}")
verbatim = "\begin{verbatim}" sequence "\end{verbatim}"
table = "\begin{tabular}" table_config sequence "\end{tabular}"
table_config = "{" /[lcr|]+/~ "}"
figure = "\begin{figure}" sequence §"\end{figure}"
quotation = ("\begin{quotation}" sequence §"\end{quotation}")
| ("\begin{quote}" sequence §"\end{quote}")
verbatim = "\begin{verbatim}" sequence §"\end{verbatim}"
tabular = "\begin{tabular}" tabular_config sequence §"\end{tabular}"
tabular_config = "{" /[lcr|]+/~ §"}"
#### paragraphs and sequences of paragraphs ####
block_of_paragraphs = /{/ sequence §/}/
block_of_paragraphs = /{/~ sequence §/}/
sequence = { (paragraph | block_environment ) [PARSEP] }+
paragraph = { !blockcmd text_element //~ }+
......@@ -138,20 +138,21 @@ class LaTeXGrammar(Grammar):
inline_environment = known_inline_env | generic_inline_env
known_inline_env = inline_math
generic_inline_env = (begin_inline_env { text_element }+ §end_inline_env)
begin_inline_env = (-!LB begin_environment) | (begin_environment -!LB)
generic_inline_env = begin_inline_env //~ paragraph §end_inline_env
begin_inline_env = (-!LB begin_environment) | (begin_environment !LFF)
end_inline_env = end_environment
# (-!LB end_environment) | (end_environment -!LB) # ambiguity with genric_block when EOF
begin_environment = "\begin{" §NAME §"}"
end_environment = "\end{" §::NAME §"}"
## (-!LB end_environment) | (end_environment !LFF) # ambiguity with genric_block when EOF
begin_environment = /\\begin{/ §NAME §/}/
end_environment = /\\end{/ §::NAME §/}/
inline_math = "$" /[^$]*/ "$"
inline_math = /\$/ /[^$]*/ §/\$/
#### commands ####
command = known_command | generic_command
command = known_command | text_command | generic_command
known_command = footnote | includegraphics | caption
text_command = TXTCOMMAND | ESCAPED | BRACKETS
generic_command = !no_command CMDNAME [[ //~ config ] //~ block ]
footnote = "\footnote" block_of_paragraphs
......@@ -166,12 +167,9 @@ class LaTeXGrammar(Grammar):
#######################################################################
config = "[" cfgtext §"]"
block = /{/ { text_element } §/}/
text = { cfgtext | (BRACKETS //~) }+
cfgtext = { word_sequence | (ESCAPED //~) }+
word_sequence = { TEXTCHUNK //~ }+
config = "[" text §"]"
block = /{/ //~ { !blockcmd text_element //~ } §/}/
text = TEXTCHUNK { //~ TEXTCHUNK }
no_command = "\begin{" | "\end" | BACKSLASH structural
blockcmd = BACKSLASH ( ( "begin{" | "end{" )
......@@ -191,13 +189,17 @@ class LaTeXGrammar(Grammar):
CMDNAME = /\\(?:(?!_)\w)+/~
TXTCOMMAND = /\\text\w+/
ESCAPED = /\\[%$&_\/{}]/
BRACKETS = /[\[\]]/ # left or right square bracket: [ ]
NAME = /\w+/~
ESCAPED = /\\[%$&_\/]/
BRACKETS = /[\[\]]/ # left or right square bracket: [ ]
TEXTCHUNK = /[^\\%$&\{\}\[\]\s\n]+/ # some piece of text excluding whitespace,
# linefeed and special characters
LF = !GAP /[ \t]*\n[ \t]*/ # linefeed but not an empty line
LFF = //~ -&LB WSPC # at least one linefeed
WSPC = { ~/\s+/~ } # arbitrary horizontal or vertical whitespace
PARSEP = { GAP }+ # paragraph separator
GAP = /[ \t]*(?:\n[ \t]*)+\n/~ # at least one empty line, i.e.
# [whitespace] linefeed [whitespace] linefeed
......@@ -211,8 +213,9 @@ class LaTeXGrammar(Grammar):
block_environment = Forward()
block_of_paragraphs = Forward()
end_generic_block = Forward()
paragraph = Forward()
text_element = Forward()
source_hash__ = "9cdeab7d908861b396d3667373fdcb9a"
source_hash__ = "b06aca9481c1e5bd756caadb8b707dff"
parser_initialization__ = "upon instantiation"
COMMENT__ = r'%.*(?:\n|$)'
WSP__ = mixin_comment(whitespace=r'[ \t]*(?:\n(?![ \t]*\n)[ \t]*)?', comment=r'%.*(?:\n|$)')
......@@ -223,50 +226,52 @@ class LaTeXGrammar(Grammar):
LB = RegExp('\\s*?\\n|$')
GAP = RE('[ \\t]*(?:\\n[ \\t]*)+\\n')
PARSEP = OneOrMore(GAP)
WSPC = ZeroOrMore(RE('\\s+', wL=WSP__))
LFF = Series(RE(''), Lookbehind(LB), WSPC)
LF = Series(NegativeLookahead(GAP), RegExp('[ \\t]*\\n[ \\t]*'))
TEXTCHUNK = RegExp('[^\\\\%$&\\{\\}\\[\\]\\s\\n]+')
BRACKETS = RegExp('[\\[\\]]')
ESCAPED = RegExp('\\\\[%$&_/]')
NAME = Capture(RE('\\w+'))
BRACKETS = RegExp('[\\[\\]]')
ESCAPED = RegExp('\\\\[%$&_/{}]')
TXTCOMMAND = RegExp('\\\\text\\w+')
CMDNAME = RE('\\\\(?:(?!_)\\w)+')
structural = Alternative(Token("subsection"), Token("section"), Token("chapter"), Token("subsubsection"), Token("paragraph"), Token("subparagraph"), Token("item"))
blockcmd = Series(BACKSLASH, Alternative(Series(Alternative(Token("begin{"), Token("end{")), Alternative(Token("enumerate"), Token("itemize"), Token("figure"), Token("quote"), Token("quotation"), Token("tabular")), Token("}")), structural, begin_generic_block, end_generic_block))
no_command = Alternative(Token("\\begin{"), Token("\\end"), Series(BACKSLASH, structural))
word_sequence = OneOrMore(Series(TEXTCHUNK, RE('')))
cfgtext = OneOrMore(Alternative(word_sequence, Series(ESCAPED, RE(''))))
text = OneOrMore(Alternative(cfgtext, Series(BRACKETS, RE(''))))
block = Series(RegExp('{'), ZeroOrMore(text_element), Required(RegExp('}')))
config = Series(Token("["), cfgtext, Required(Token("]")))
text = Series(TEXTCHUNK, ZeroOrMore(Series(RE(''), TEXTCHUNK)))
block = Series(RegExp('{'), RE(''), ZeroOrMore(Series(NegativeLookahead(blockcmd), text_element, RE(''))), Required(RegExp('}')))
config = Series(Token("["), text, Required(Token("]")))
caption = Series(Token("\\caption"), block)
includegraphics = Series(Token("\\includegraphics"), Optional(config), block)
footnote = Series(Token("\\footnote"), block_of_paragraphs)
generic_command = Series(NegativeLookahead(no_command), CMDNAME, Optional(Series(Optional(Series(RE(''), config)), RE(''), block)))
text_command = Alternative(TXTCOMMAND, ESCAPED, BRACKETS)
known_command = Alternative(footnote, includegraphics, caption)
command = Alternative(known_command, generic_command)
inline_math = Series(Token("$"), RegExp('[^$]*'), Token("$"))
end_environment = Series(Token("\\end{"), Required(Pop(NAME)), Required(Token("}")))
begin_environment = Series(Token("\\begin{"), Required(NAME), Required(Token("}")))
command = Alternative(known_command, text_command, generic_command)
inline_math = Series(RegExp('\\$'), RegExp('[^$]*'), Required(RegExp('\\$')))
end_environment = Series(RegExp('\\\\end{'), Required(Pop(NAME)), Required(RegExp('}')))
begin_environment = Series(RegExp('\\\\begin{'), Required(NAME), Required(RegExp('}')))
end_inline_env = Synonym(end_environment)
begin_inline_env = Alternative(Series(NegativeLookbehind(LB), begin_environment), Series(begin_environment, NegativeLookbehind(LB)))
generic_inline_env = Series(begin_inline_env, OneOrMore(text_element), Required(end_inline_env))
begin_inline_env = Alternative(Series(NegativeLookbehind(LB), begin_environment), Series(begin_environment, NegativeLookahead(LFF)))
generic_inline_env = Series(begin_inline_env, RE(''), paragraph, Required(end_inline_env))
known_inline_env = Synonym(inline_math)
inline_environment = Alternative(known_inline_env, generic_inline_env)
text_element.set(Alternative(command, text, block, inline_environment))
paragraph = OneOrMore(Series(NegativeLookahead(blockcmd), text_element, RE('')))
paragraph.set(OneOrMore(Series(NegativeLookahead(blockcmd), text_element, RE(''))))
sequence = OneOrMore(Series(Alternative(paragraph, block_environment), Optional(PARSEP)))
block_of_paragraphs.set(Series(RegExp('{'), sequence, Required(RegExp('}'))))
table_config = Series(Token("{"), RE('[lcr|]+'), Token("}"))
table = Series(Token("\\begin{tabular}"), table_config, sequence, Token("\\end{tabular}"))
verbatim = Series(Token("\\begin{verbatim}"), sequence, Token("\\end{verbatim}"))
quotation = Alternative(Series(Token("\\begin{quotation}"), sequence, Token("\\end{quotation}")), Series(Token("\\begin{quote}"), sequence, Token("\\end{quote}")))
figure = Series(Token("\\begin{figure}"), sequence, Token("\\end{figure}"))
block_of_paragraphs.set(Series(RE('{'), sequence, Required(RegExp('}'))))
tabular_config = Series(Token("{"), RE('[lcr|]+'), Required(Token("}")))
tabular = Series(Token("\\begin{tabular}"), tabular_config, sequence, Required(Token("\\end{tabular}")))
verbatim = Series(Token("\\begin{verbatim}"), sequence, Required(Token("\\end{verbatim}")))
quotation = Alternative(Series(Token("\\begin{quotation}"), sequence, Required(Token("\\end{quotation}"))), Series(Token("\\begin{quote}"), sequence, Required(Token("\\end{quote}"))))
figure = Series(Token("\\begin{figure}"), sequence, Required(Token("\\end{figure}")))
item = Series(Token("\\item"), Optional(PARSEP), sequence)
enumerate = Series(Token("\\begin{enumerate}"), Optional(PARSEP), ZeroOrMore(item), Required(Token("\\end{enumerate}")))
itemize = Series(Token("\\begin{itemize}"), Optional(PARSEP), ZeroOrMore(item), Required(Token("\\end{itemize}")))
end_generic_block.set(Series(Lookbehind(LB), end_environment, Lookbehind(LB)))
begin_generic_block.set(Series(Lookbehind(LB), begin_environment, Lookbehind(LB)))
end_generic_block.set(Series(Lookbehind(LB), end_environment, LFF))
begin_generic_block.set(Series(Lookbehind(LB), begin_environment, LFF))
generic_block = Series(begin_generic_block, sequence, Required(end_generic_block))
known_environment = Alternative(itemize, enumerate, figure, table, quotation, verbatim)
known_environment = Alternative(itemize, enumerate, figure, tabular, quotation, verbatim)
block_environment.set(Alternative(known_environment, generic_block))
Index = Series(Token("\\printindex"), Optional(PARSEP))
Bibliography = Series(Token("\\bibliography"), block, Optional(PARSEP))
......@@ -369,17 +374,18 @@ LaTeX_AST_transformation_table = {
"inline_math": [remove_brackets, reduce_single_child],
"command": [],
"known_command": [],
"text_command": [],
"generic_command": [flatten],
"footnote": [],
"includegraphics": [],
"caption": [],
"config": [remove_brackets],
"block": [remove_brackets, reduce_single_child(is_anonymous)],
"block": [remove_brackets, flatten],
"text": collapse,
"cfgtext, word_sequence": [],
"no_command, blockcmd": [],
"structural": [],
"CMDNAME": [remove_whitespace, reduce_single_child(is_anonymous)],
"TXTCOMMAND": [remove_whitespace, reduce_single_child(is_anonymous)],
"NAME": [reduce_single_child, remove_whitespace, reduce_single_child],
"ESCAPED": [replace_content(lambda node: str(node)[1:])],
"BRACKETS": [],
......
# latex Grammar
@ whitespace = /[ \t]*\n?(?!\s*\n)[ \t]*/ # whitespace, including at most one linefeed
@ comment = /%.*(?:\n|$)/
latexdoc = preamble document
preamble = { command }+
genericenv = beginenv sequence §endenv
beginenv = "\begin" §( "{" name "}" )
endenv = "\end" §( "{" ::name "}" )
name = /\w+/~
comand = cmdname [ config ] block
cmdname = /\\\w+/
config = "[" cfgtext §"]"
sequence = { partext | parblock }
parblock = "{" { partext | parblock } §"}"
block = "{" { text | block } §"}"
partext = text | PARSEP
text = cfgtext | brackets
cfgtext = chunk | escaped | WSPC
ESCAPED = /\\[%$&]/
BRACKET = /[\[\]]/ # left or right square bracket: [ ]
TEXTCHUNK = /[^\\%$&\{\}\[\]\s\n]+/ # some piece of text excluding whitespace,
# linefeed and special characters
WSPC = /[ \t]*\n?(?!\s*\n)[ \t]*/ # whitespace, including at most one linefeed
LF = /[ \t]*\n(?!\s*\n)/ # a linefeed, but not an empty line (i.e. par)
PARSEP = /\s*\n\s*\n/ # at least one empty line, i.e.
# [whitespace] linefeed [whitespace] linefeed
[match:text]
1 : Some plain text
[fail:text]
1 : Low-level text must not contain \& escaped characters.
2 : Low-level text must not contain ] [ brackets.
3 : Low-level text must not contain { environments }.
4 : Low-level text must not contain any \commands.
[match:text_element]
1 : \command
2 : \textbackslash
3 : \footnote{footnote}
4 : [
5 : \begin{generic} unknown inline environment \end{generic}
6 : \begin{small} known inline environment \end{small}
7: {\em block}
......@@ -26,6 +26,11 @@
8 : Unknwon \xy commands within paragraphs may be simple
or \xy{complex}.
9 : paragraphs may contain all of these: \{ escaped \} characters,
{\bf blocks}, [ brackets ], \begin{tiny} environments \end{tiny}
and \textbackslash text-commands or other commands like this
\footnote{footnote}
[fail:paragraph]
1 : \begin{enumerate}
......
......@@ -34,20 +34,27 @@
[match:inline_environment]
1 : """\begin{generic}inline environment\end{generic}
"""
1 : """\begin{generic}inline environment\end{generic}"""
2 : """\begin{generic}inline environment
\end{generic}
"""
\end{generic}"""
3 : "$ inline math $"
[fail:inline_environment]
3 : """\begin{generic}