Commit 37399df4 authored by Eckhart Arnold

preprocess.py: bug fixes

parent a7817711
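In short: `generate_find_include_func` becomes `gen_find_include_func`, and `preprocess_includes` now takes the source text first and the file name second. A minimal sketch of the corrected calling sequence, with the `include(...)` pattern borrowed from the test suite below:

    from DHParser.preprocess import gen_find_include_func, preprocess_includes

    # build a finder for directives of the form include(<filename>)
    find_include = gen_find_include_func(r'include\((?P<name>[^)\n]*)\)')
    # the text argument comes first now; passing None lets preprocess_includes
    # read 'main.txt' from disk itself
    text, mapping = preprocess_includes(None, 'main.txt', find_include)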
@@ -30,6 +30,7 @@ cannot completely be described entirely with context-free grammars.
import bisect
import functools
+import os
from typing import Union, Optional, Callable, Tuple, NamedTuple, List, Any
from DHParser.toolkit import re, dataclasses
@@ -42,6 +43,7 @@ __all__ = ('RX_TOKEN_NAME',
           'SourceMap',
           'SourceMapFunc',
           'PreprocessorFunc',
+           'Preprocessed',
           'PreprocessorResult',
           'make_token',
           'strip_tokens',
@@ -51,7 +53,9 @@ __all__ = ('RX_TOKEN_NAME',
           'neutral_mapping',
           'tokenized_to_original_mapping',
           'source_map',
-           'with_source_mapping')
+           'with_source_mapping',
+           'gen_find_include_func',
+           'preprocess_includes')
#######################################################################
@@ -96,8 +100,7 @@ class IncludeMap(SourceMap):
    file_names: List[str]  # list of file_names to which the source locations relate
    def has_includes(self) -> bool:
-        L = len(self.file_names)
-        return L > 1 or (L == 1 and self.file_names[0] != self.source_name)
+        return any(fname != self.source_name for fname in self.file_names)
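The rewritten check reports includes as soon as any mapped segment stems from a file other than the root source. It is also robust when the root source contributes several segments to file_names, which the old length-based test (L > 1) would have misread as an include.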
class IncludeInfo(NamedTuple):
@@ -111,7 +114,7 @@ PreprocessorResult = Union[str, Preprocessed]
FindIncludeFunc = Union[Callable[[str, int], IncludeInfo],  # (document: str, start: int)
                        functools.partial]
-PreprocessorFunc = Union[Callable[[str, str], PreprocessorResult],
+PreprocessorFunc = Union[Callable[[str, str], PreprocessorResult],  # text: str, filename: str
                         functools.partial]
@@ -302,8 +305,16 @@ def with_source_mapping(result: PreprocessorResult) -> Preprocessed:
    """
    if isinstance(result, str):
        srcmap = tokenized_to_original_mapping(result)
-        mapping_func = functools.partial(source_map, srcmap=srcmap)
-        return Preprocessed(result, mapping_func)
+        token_mapping = functools.partial(source_map, srcmap=srcmap)
+        return Preprocessed(result, token_mapping)
+    # else:  # DOES NOT WORK, because there is no way to reliably find out whether
+    #        # token back-mapping has already been done by the provided mapping
+    #     text, mapping = cast(Preprocessed, result)
+    #     if not (hasattr(mapping, 'func') and mapping.func == source_map):
+    #         srcmap = tokenized_to_original_mapping(text)
+    #         token_mapping = functools.partial(source_map, srcmap=srcmap)
+    #         return Preprocessed(
+    #             text, functools.partial(_apply_mappings, mappings=[token_mapping, mapping]))
    return result
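A sketch of how the string branch is typically used (identifier names follow this module):

    text, mapping = with_source_mapping(tokenized_text)  # a Preprocessed named tuple
    source_pos = mapping(pos)  # resolves a position in the tokenized text
                               # back to the original source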
@@ -314,7 +325,7 @@ def with_source_mapping(result: PreprocessorResult) -> Preprocessed:
#######################################################################
-def generate_find_include_func(rx: Union[str, Any],
+def gen_find_include_func(rx: Union[str, Any],
                          comment_rx: Optional[Union[str, Any]] = None) -> FindIncludeFunc:
    if isinstance(rx, str): rx = re.compile(rx)
    if isinstance(comment_rx, str): comment_rx = re.compile(comment_rx)
@@ -362,11 +373,13 @@ def generate_include_map(source_name: str,
        raise ValueError(f'Circular include of {source_name} detected!')
    file_names.add(source_name)
+    dirname = os.path.dirname(source_name)
    source_pointer = 0
    source_offset = 0
    result_pointer = 0
    last_begin = -1
    begin, length, include_name = find_next(source_text, 0)
+    include_name = os.path.join(dirname, include_name)
    while begin >= 0:
        assert begin > last_begin
        source_delta = begin - source_pointer
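The two inserted `os.path.join(dirname, include_name)` lines (here and after the second `find_next` call in the next hunk) resolve include names relative to the directory of the including file, so includes keep working when the source does not reside in the current working directory; this is also why `import os` is added at the top of the module.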
@@ -396,6 +409,7 @@ def generate_include_map(source_name: str,
            source_offset += length - inner_length
            map.offsets.append(source_offset)
        begin, length, include_name = find_next(source_text, source_pointer)
+        include_name = os.path.join(dirname, include_name)
    rest = source_text[source_pointer:]
    if rest:
        result.append(rest)
@@ -417,8 +431,8 @@ def srcmap_includes(position: int, inclmap: IncludeMap) -> SourceLocation:
    raise ValueError
-def preprocess_includes(source_name: str,
-                        source_text: Optional[str],
+def preprocess_includes(source_text: Optional[str],
+                        source_name: str,
                        find_next_include: FindIncludeFunc) -> Preprocessed:
    if not source_text:
        with open(source_name, 'r', encoding='utf-8') as f:
@@ -3,7 +3,7 @@
# preamble
@ literalws = right
@ whitespace = /[ \t]*(?:\n(?![ \t]*\n)[ \t]*)?/  # insignificant whitespace, including at most one linefeed
-@ comment = /%.*/
+@ comment = /%.*/  # note: trailing linefeed is not part of the comment proper
@ reduction = merge_treetops
@ disposable = _WSPC, _GAP, _LB, _PARSEP, _LETTERS, _NAME, INTEGER, FRAC,
               _QUALIFIED, TEXT_NOPAR, TEXT, _block_content,
@@ -18,11 +18,11 @@
#
########################################################################
-latexdoc = preamble document
+latexdoc = preamble §document
preamble = { [_WSPC] command }+
document = [_WSPC] "\begin{document}"
-           frontpages
+           §frontpages
           (Chapters | Sections)
           [Bibliography] [Index] [_WSPC]
           "\end{document}" [_WSPC] §EOF
@@ -115,11 +115,12 @@ inline_math = /\$/ /[^$]*/ §/\$/
#### commands ####
command = known_command | text_command | generic_command
known_command = citet | citep | footnote | includegraphics | caption
              | multicolumn | hline | cline | documentclass | pdfinfo
              | hypersetup
text_command = TXTCOMMAND | ESCAPED | BRACKETS
-generic_command = !no_command CMDNAME [[ ~ config ] ~ block ]
+generic_command = !no_command CMDNAME [[ ~ config ] { ~ block }+ ]
                 | `{` CMDNAME _block_content §`}`
citet = "\citet" [config] block
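The repetition `{ ~ block }+` lets a generic command consume several block arguments in a row, e.g. `\newcommand{\marginline}{\marginnote}` from the extended grammar tests below; previously only a single block was matched.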
@@ -195,10 +196,9 @@ WARN_Komma = ","
#
#######################################################################
CMDNAME = /\\(?:(?![\d_])\w)+/~
TXTCOMMAND = /\\text\w+/
-ESCAPED = /\\[%$&_\/{}]/
+ESCAPED = /\\[%$&_\/{} ]/
SPECIAL = /[$&_\/\\\\]/
BRACKETS = /[\[\]]/  # left or right square bracket: [ ]
LINEFEED = /[\\][\\]/
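The extended character class additionally matches the backslash-escaped space `\ `; the new grammar tests `17: """\ """` and `[match:paragraph] M1: """\ { }"""` further down exercise exactly this case.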
@@ -48,7 +48,8 @@ from DHParser import start_logging, suspend_logging, resume_logging, is_filename, \
    trace_history, has_descendant, neg, has_ancestor, optional_last_value, insert, \
    positions_of, replace_tag_names, add_attributes, delimit_children, merge_connected, \
    has_attr, has_parent, ThreadLocalSingletonFactory, Error, canonical_error_strings, \
-    has_errors, apply_unless, WARNING, ERROR, FATAL, EMPTY_NODE, TreeReduction, CombinedParser
+    has_errors, apply_unless, WARNING, ERROR, FATAL, EMPTY_NODE, TreeReduction, CombinedParser, \
+    Preprocessed, neutral_mapping, preprocess_includes, gen_find_include_func, flatten_sxpr
#######################################################################
@@ -57,12 +58,13 @@ from DHParser import start_logging, suspend_logging, resume_logging, is_filename, \
#
#######################################################################
-def nop(arg):
-    return arg
+RX_TEX_INPUT = r'\\input{(?P<name>.*)}'
-def LaTeXPreprocessor(text):
-    return text, nop
+def LaTeXPreprocessor(text: str, file_name: str) -> Preprocessed:
+    find_includes = gen_find_include_func(RX_TEX_INPUT, LaTeXGrammar.comment_rx__)
+    return preprocess_includes(text, file_name, find_includes)
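A usage sketch mirroring `tst_LaTeX_grammar.py` further down (the file name is illustrative only):

    preprocessed, source_mapper = LaTeXPreprocessor(doc, 'main.tex')
    result = get_grammar()(preprocessed)

Passing `LaTeXGrammar.comment_rx__` along means that `\input{...}` directives inside `%`-comments are skipped (cf. `test_generate_find_include_w_comments` below).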
def get_preprocessor() -> PreprocessorFunc:
@@ -83,7 +85,7 @@ class LaTeXGrammar(Grammar):
    paragraph = Forward()
    param_block = Forward()
    text_element = Forward()
-    source_hash__ = "74b31b1a6754004694c1d25e614d7f32"
+    source_hash__ = "49543176de36a2f3271970b00b62761d"
    disposable__ = re.compile('_WSPC$|_GAP$|_LB$|_PARSEP$|_LETTERS$|_NAME$|INTEGER$|FRAC$|_QUALIFIED$|TEXT_NOPAR$|TEXT$|_block_content$|block_environment$|known_environment$|text_element$|line_element$|inline_environment$|known_inline_env$|info_block$|begin_inline_env$|end_inline_env$|command$|known_command$')
    static_analysis_pending__ = []  # type: List[bool]
    parser_initialization__ = ["upon instantiation"]
@@ -120,7 +122,7 @@ class LaTeXGrammar(Grammar):
    LINEFEED = RegExp('[\\\\][\\\\]')
    BRACKETS = RegExp('[\\[\\]]')
    SPECIAL = RegExp('[$&_/\\\\\\\\]')
-    ESCAPED = RegExp('\\\\[%$&_/{}]')
+    ESCAPED = RegExp('\\\\[%$&_/{} ]')
    TXTCOMMAND = RegExp('\\\\text\\w+')
    CMDNAME = Series(RegExp('\\\\(?:(?![\\d_])\\w)+'), dwsp__)
    WARN_Komma = Series(Text(","), dwsp__)
@@ -169,7 +171,7 @@ class LaTeXGrammar(Grammar):
    generic_inline_env = Series(begin_inline_env, dwsp__, paragraph, end_inline_env, mandatory=3)
    known_inline_env = Synonym(inline_math)
    inline_environment = Alternative(known_inline_env, generic_inline_env)
-    generic_command = Alternative(Series(NegativeLookahead(no_command), CMDNAME, Option(Series(Option(Series(dwsp__, config)), dwsp__, block))), Series(Drop(Text("{")), CMDNAME, _block_content, Drop(Text("}")), mandatory=3))
+    generic_command = Alternative(Series(NegativeLookahead(no_command), CMDNAME, Option(Series(Option(Series(dwsp__, config)), OneOrMore(Series(dwsp__, block))))), Series(Drop(Text("{")), CMDNAME, _block_content, Drop(Text("}")), mandatory=3))
    SubParagraph = Series(Series(Drop(Text("\\subparagraph")), dwsp__), heading, Option(sequence))
    SubParagraphs = OneOrMore(Series(Option(_WSPC), SubParagraph))
    frontpages = Synonym(sequence)
@@ -201,13 +203,13 @@ class LaTeXGrammar(Grammar):
    Sections = OneOrMore(Series(Option(_WSPC), Section))
    Chapter = Series(Series(Drop(Text("\\chapter")), dwsp__), heading, ZeroOrMore(Alternative(sequence, Sections)))
    Chapters = OneOrMore(Series(Option(_WSPC), Chapter))
-    document = Series(Option(_WSPC), Series(Drop(Text("\\begin{document}")), dwsp__), frontpages, Alternative(Chapters, Sections), Option(Bibliography), Option(Index), Option(_WSPC), Series(Drop(Text("\\end{document}")), dwsp__), Option(_WSPC), EOF, mandatory=9)
+    document = Series(Option(_WSPC), Series(Drop(Text("\\begin{document}")), dwsp__), frontpages, Alternative(Chapters, Sections), Option(Bibliography), Option(Index), Option(_WSPC), Series(Drop(Text("\\end{document}")), dwsp__), Option(_WSPC), EOF, mandatory=2)
    param_block.set(Series(Series(Drop(Text("{")), dwsp__), Option(parameters), Series(Drop(Text("}")), dwsp__)))
    block.set(Series(Series(Drop(Text("{")), dwsp__), _block_content, Drop(Text("}")), mandatory=2))
    text_element.set(Alternative(line_element, LINEFEED))
    paragraph.set(OneOrMore(Series(NegativeLookahead(blockcmd), text_element, Option(S))))
    block_environment.set(Alternative(known_environment, generic_block))
-    latexdoc = Series(preamble, document)
+    latexdoc = Series(preamble, document, mandatory=1)
    root__ = TreeReduction(latexdoc, CombinedParser.MERGE_TREETOPS)
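Here `mandatory=2` singles out `frontpages` (element 2 of the `Series`, counting from 0) and replaces the old `mandatory=9` (`EOF`), matching `§frontpages` in the EBNF: missing front matter is now reported as an error right where it occurs. Likewise, `mandatory=1` in `latexdoc` realizes `preamble §document`.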
@@ -267,6 +269,9 @@ def transform_generic_command(context: List[Node]):
def transform_generic_block(context: List[Node]):
    node = context[-1]
+    if not node.children or not node.children[0].children:
+        context[0].new_error(node, 'unknown kind of block: ' + flatten_sxpr(node.as_sxpr()))
+    else:
        # assert node.children[0].tag_name == "begin_generic_block"
        # assert node.children[0].children[0].tag_name == "begin_environment"
        # assert node.children[-1].tag_name == "end_generic_block"
@@ -346,6 +351,7 @@ LaTeX_AST_transformation_table = {
    "structural": [],
    "CMDNAME": [remove_whitespace, reduce_single_child],
    "TXTCOMMAND": [remove_whitespace, reduce_single_child],
+    "NO_CMD": [add_error("unknown kind of command")],
    "NAME": [reduce_single_child, remove_whitespace, reduce_single_child],
    "ESCAPED": [transform_content(lambda node: str(node)[1:])],
    "BRACKETS": [],
@@ -940,7 +946,8 @@ if __name__ == "__main__":
        if errors:
            for err_str in canonical_error_strings(errors, file_names[0]):
                print(err_str)
-            sys.exit(1)
+            if has_errors(errors, ERROR):
+                sys.exit(1)
        else:
            print(result.serialize(how='default' if args.xml is None else 'xml')
                  if isinstance(result, Node) else result)
@@ -92,6 +92,15 @@
    """
+8: """
+%\title{Vorlesung: Grundlagen des Entscheidens I}
+%\author{Eckhart Arnold}
+%\date{Stand: 6. Juli 2009}
+%\maketitle
+"""
+
+[fail:_WSPC]
+10: "X"
@@ -41,4 +41,8 @@
    /Keywords (Computer Simulations, Validation of Simulations)
    }"""
-13*: """\usepackage[pdftex]{hyperref}"""
+13: """\usepackage[pdftex]{hyperref}"""
+14: """\numberwithin{equation}{section}"""
+15: """\newcommand{\marginline}{\marginnote}"""
+16: """\renewcommand{\marginfont}{\scriptsize}"""
+17: """\ """
[match:paragraph]
M1: """\ { }"""
[match:frontpages]
M1: """
%\title{Vorlesung: Grundlagen des Entscheidens I}
%\author{Eckhart Arnold}
%\date{Stand: 6. Juli 2009}
%\maketitle
\begin{titlepage}
\begin{center}
\ { }
\vspace{0.5cm}
{\Large Vorlesung: Grundlagen des Entscheidens I}
\vspace{0.75cm}
Sommersemester 2009
\vspace{0.5cm}
Stand: 6. Juli 2009 \\~\\
Hinweis: Das Skript wurde bisher noch wenig Korrektur gelesen und das letzte Kapitel
fehlt leider ganz. Es enthält jedem Menge Tippfehler und auch vereinzelte sachliche Fehler
können nicht ganz ausgeschlossen werden. Trotzdem: Viel Spaß beim Durcharbeiten!
\vspace{0.5cm}
Dozent: Dr. Eckhart Arnold
\vspace{1cm}
\includegraphics[width=6cm]{Grafiken/pe_logo.eps}
\vspace{0.25cm}
{\Large Universität Bayreuth}
\vspace{1.75cm}
\includegraphics[width=2.5cm]{Grafiken/CC-BY-SA.eps}
\vspace{0.5cm}
\begin{small}
Dieses Material ist frei zugänglich und darf unter den Bedingungen der
Creative-Commons-Lizenz BY-SA 4.0 weiter gegeben werden.
\vspace{0.5cm}
Die Klausel BY-SA besagt: Der Name des Autors muss bei abgeleiteten Werken
genannt werden, und abgeleitete Werke oder Kopien müssen ebenfalls unter
dieser Lizenz weitergegeben werden.
\end{small}
\end{center}
\end{titlepage}
\tableofcontents
\newpage
\setlength{\marginparwidth}{2cm}
"""
\ No newline at end of file
@@ -48,8 +48,9 @@ if not DHParser.dsl.recompile_grammar(grammar_path, force=False):
    sys.exit(1)
-from LaTeXParser import get_grammar, get_transformer, get_compiler
+from LaTeXParser import get_preprocessor, get_grammar, get_transformer, get_compiler
+preprocessor = get_preprocessor()
parser = get_grammar()
transformer = get_transformer()
compiler = get_compiler()
@@ -81,8 +82,10 @@ def tst_func():
        with open(filepath, 'r', encoding='utf-8') as f:
            doc = f.read()
-        print('\n\nParsing document: "%s"' % file)
-        result = parser(doc)
+        print(f'\n\nPreprocessing document: "{file}"')
+        preprocessed, source_mapper = preprocessor(doc, file)
+        print(f'\n\nParsing document: "{file}"')
+        result = parser(preprocessed)
        print("Number of CST-nodes: " + str(tree_size(result)))
        # print("Number of empty nodes: " + str(count_nodes(result,
        #       lambda n: not bool(n.result))))
@@ -36,7 +36,7 @@ from DHParser.dsl import grammar_provider
from DHParser import compile_source
from DHParser.preprocess import make_token, tokenized_to_original_mapping, source_map, \
    BEGIN_TOKEN, END_TOKEN, TOKEN_DELIMITER, SourceMapFunc, SourceMap, chain_preprocessors, \
-    strip_tokens, generate_find_include_func, preprocess_includes, IncludeInfo
+    strip_tokens, gen_find_include_func, preprocess_includes, IncludeInfo
from DHParser.toolkit import lstrip_docstring, typing, re
from DHParser.testing import TFFN
from typing import Tuple, Dict
@@ -219,14 +219,14 @@ class TestTokenParsing:
class TestHelpers:
    def test_generate_find_include_func(self):
        rx = re.compile(r'include\((?P<name>[^)\n]*)\)')
-        find = generate_find_include_func(rx)
+        find = gen_find_include_func(rx)
        info = find('''321include(sub.txt)xyz''', 0)
        assert info == IncludeInfo(3, 16, 'sub.txt')

    def test_generate_find_include_w_comments(self):
        rx = re.compile(r'include\((?P<name>[^)\n]*)\)')
        comment_rx = re.compile(r'#.*(?:\n|$)')
-        find = generate_find_include_func(rx, comment_rx)
+        find = gen_find_include_func(rx, comment_rx)
        test = '''a
b # include(alpha)
c include(beta)
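Reading the first assertion: in `IncludeInfo(3, 16, 'sub.txt')`, 3 is the offset at which `include(` begins (right after `321`) and 16 is the length of the whole directive `include(sub.txt)`. The second test presumably verifies that the `include(alpha)` hidden inside a `#`-comment is skipped while `include(beta)` is found.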
@@ -275,8 +275,8 @@ class TestIncludes:
    def test_simple_include(self):
        def perform(main, sub):
            self.create_files({'main.txt': main, 'sub.txt': sub})
-            find_func = generate_find_include_func(r'include\((?P<name>[^)\n]*)\)')
-            text, mapping = preprocess_includes('main.txt', None, find_func)
+            find_func = gen_find_include_func(r'include\((?P<name>[^)\n]*)\)')
+            text, mapping = preprocess_includes(None, 'main.txt', find_func)
            # print(mapping)
            assert text == main.replace('include(sub.txt)', 'abc'), text
            for i in range(len(text)):
@@ -298,8 +298,8 @@ class TestIncludes:
    def test_complex_include(self):
        def perform(**ensemble):
            self.create_files(ensemble)
-            find_func = generate_find_include_func(r'#include\((?P<name>[^)\n]*)\)')
-            text, mapping = preprocess_includes('main', None, find_func)
+            find_func = gen_find_include_func(r'#include\((?P<name>[^)\n]*)\)')
+            text, mapping = preprocess_includes(None, 'main', find_func)
            # print(mapping)
            substrings = {}
            for k, v in reversed(ensemble.items()):