LaTeXCompiler.py 30.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
#!/usr/bin/python

#######################################################################
#
# SYMBOLS SECTION - Can be edited. Changes will be preserved.
#
#######################################################################


10
from collections import defaultdict
11
12
import os
import sys
Eckhart Arnold's avatar
Eckhart Arnold committed
13
14
from functools import partial

15
16
17
18
try:
    import regex as re
except ImportError:
    import re
19
from DHParser import is_filename, Grammar, Compiler, Lookbehind, Alternative, Pop, \
di68kap's avatar
di68kap committed
20
21
    Synonym, Whitespace, Token, \
    Option, NegativeLookbehind, OneOrMore, RegExp, Series, Capture, \
22
    ZeroOrMore, Forward, NegativeLookahead, mixin_comment, compile_source, \
23
    PreprocessorFunc, TransformationDict, \
Eckhart Arnold's avatar
Eckhart Arnold committed
24
    Node, TransformationFunc, traverse, remove_children_if, is_anonymous, \
Eckhart Arnold's avatar
Eckhart Arnold committed
25
    reduce_single_child, replace_by_single_child, remove_whitespace, \
26
27
    flatten, is_empty, collapse, replace_content, replace_content_by, remove_brackets, \
    is_one_of, traverse_locally, remove_tokens, remove_nodes, TOKEN_PTYPE, Error
28
from DHParser.log import logging
29
30
31
32


#######################################################################
#
Eckhart Arnold's avatar
Eckhart Arnold committed
33
# PREPROCESSOR SECTION - Can be edited. Changes will be preserved.
34
35
36
#
#######################################################################

Eckhart Arnold's avatar
Eckhart Arnold committed
37
def LaTeXPreprocessor(text):
    """Preprocessing hook for LaTeX sources.

    No preprocessing is performed: the source text is handed back
    unchanged.
    """
    return text

Eckhart Arnold's avatar
Eckhart Arnold committed
40
41
def get_preprocessor() -> PreprocessorFunc:
    """Return the preprocessor function for LaTeX sources.

    The function object itself (``LaTeXPreprocessor``) is returned; it is
    not called here.
    """
    return LaTeXPreprocessor
42
43
44
45
46
47
48
49
50
51
52


#######################################################################
#
# PARSER SECTION - Don't edit! CHANGES WILL BE OVERWRITTEN!
#
#######################################################################

class LaTeXGrammar(Grammar):
    r"""Parser for a LaTeX source file, with this grammar:
    
53
    # LaTeX-Grammar for DHParser
54
    
55
    # preamble
56
    @ whitespace = /[ \t]*(?:\n(?![ \t]*\n)[ \t]*)?/    # optional whitespace, including at most one linefeed
Eckhart Arnold's avatar
Eckhart Arnold committed
57
    @ comment    = /%.*/
58
    
Eckhart Arnold's avatar
Eckhart Arnold committed
59
60
61
62
63
64
    ########################################################################
    #
    # outer document structure
    #
    ########################################################################
    
65
    latexdoc       = preamble document
66
    preamble       = { [WSPC] command }+
67
    
eckhart's avatar
eckhart committed
68
69
70
    document       = [WSPC] "\begin{document}"
                     frontpages
                     (Chapters | Sections)
Eckhart Arnold's avatar
Eckhart Arnold committed
71
72
                     [Bibliography] [Index] [WSPC]
                     "\end{document}" [WSPC] §EOF
73
74
75
76
77
78
79
80
81
    frontpages     = sequence
    
    
    #######################################################################
    #
    # document structure
    #
    #######################################################################
    
eckhart's avatar
eckhart committed
82
83
    Chapters       = { [WSPC] Chapter }+
    Chapter        = "\chapter" heading { sequence | Sections }
84
    
eckhart's avatar
eckhart committed
85
86
    Sections       = { [WSPC] Section }+
    Section        = "\section" heading { sequence | SubSections }
87
    
eckhart's avatar
eckhart committed
88
89
    SubSections    = { [WSPC] SubSection }+
    SubSection     = "\subsection" heading { sequence | SubSubSections }
90
    
eckhart's avatar
eckhart committed
91
92
    SubSubSections = { [WSPC] SubSubSection }+
    SubSubSection  = "\subsubsection" heading { sequence | Paragraphs }
93
    
eckhart's avatar
eckhart committed
94
95
    Paragraphs     = { [WSPC] Paragraph  }+
    Paragraph      = "\paragraph" heading { sequence | SubParagraphs }
96
    
eckhart's avatar
eckhart committed
97
98
    SubParagraphs  = { [WSPC] SubParagraph }+
    SubParagraph   = "\subparagraph" heading [ sequence ]
99
    
eckhart's avatar
eckhart committed
100
101
    Bibliography   = [WSPC] "\bibliography" heading
    Index          = [WSPC] "\printindex"
102
    
103
    heading        = block
104
105
106
107
108
109
110
111
112
    
    #######################################################################
    #
    # document content
    #
    #######################################################################
    
    
    #### block environments ####
113
    
Eckhart Arnold's avatar
Eckhart Arnold committed
114
    block_environment   = known_environment | generic_block
Eckhart Arnold's avatar
Eckhart Arnold committed
115
    known_environment   = itemize | enumerate | figure | tabular | quotation
116
                        | verbatim
Eckhart Arnold's avatar
Eckhart Arnold committed
117
    generic_block       = begin_generic_block sequence §end_generic_block
Eckhart Arnold's avatar
Eckhart Arnold committed
118
119
    begin_generic_block = -&LB begin_environment LFF
    end_generic_block   = -&LB  end_environment LFF
120
    
Eckhart Arnold's avatar
Eckhart Arnold committed
121
122
    itemize             = "\begin{itemize}" [WSPC] { item } §"\end{itemize}"
    enumerate           = "\begin{enumerate}" [WSPC] {item } §"\end{enumerate}"
eckhart's avatar
eckhart committed
123
    item                = "\item" sequence
124
    
Eckhart Arnold's avatar
Eckhart Arnold committed
125
126
127
128
    figure              = "\begin{figure}" sequence §"\end{figure}"
    quotation           = ("\begin{quotation}" sequence §"\end{quotation}")
                        | ("\begin{quote}" sequence §"\end{quote}")
    verbatim            = "\begin{verbatim}" sequence §"\end{verbatim}"
Eckhart Arnold's avatar
Eckhart Arnold committed
129
    tabular             = "\begin{tabular}" tabular_config { tabular_row } §"\end{tabular}"
130
131
    tabular_row         = (multicolumn | tabular_cell) { "&" (multicolumn | tabular_cell) }
                          "\\" ( hline | { cline } )
132
    tabular_cell        = { line_element //~ }
Eckhart Arnold's avatar
Eckhart Arnold committed
133
    tabular_config      = "{" /[lcr|]+/~ §"}"
134
    
Eckhart Arnold's avatar
Eckhart Arnold committed
135
    
136
137
    #### paragraphs and sequences of paragraphs ####
    
138
    block_of_paragraphs = "{" [sequence] §"}"
eckhart's avatar
eckhart committed
139
    sequence            = [WSPC] { (paragraph | block_environment ) [PARSEP] }+
140
141
142
    paragraph           = { !blockcmd text_element //~ }+
    text_element        = line_element | LINEFEED
    line_element        = text | block | inline_environment | command
143
    
Eckhart Arnold's avatar
Eckhart Arnold committed
144
    
145
146
    #### inline environments ####
    
147
    inline_environment  = known_inline_env | generic_inline_env
148
    known_inline_env    = inline_math
Eckhart Arnold's avatar
Eckhart Arnold committed
149
150
    generic_inline_env  = begin_inline_env //~ paragraph §end_inline_env
    begin_inline_env    = (-!LB begin_environment) | (begin_environment !LFF)
151
    end_inline_env      = end_environment
eckhart's avatar
eckhart committed
152
                          ## (-!LB end_environment)   | (end_environment !LFF)  # ambiguity with generic_block when EOF
153
154
    begin_environment   = /\\begin{/ §NAME /}/
    end_environment     = /\\end{/ §::NAME /}/
155
    
Eckhart Arnold's avatar
Eckhart Arnold committed
156
    inline_math         = /\$/ /[^$]*/ §/\$/
157
    
Eckhart Arnold's avatar
Eckhart Arnold committed
158
    
159
160
    #### commands ####
    
Eckhart Arnold's avatar
Eckhart Arnold committed
161
    command             = known_command | text_command | generic_command
162
163
    known_command       = citet | citep | footnote | includegraphics | caption
                        | multicolumn | hline | cline | documentclass | pdfinfo
164
    text_command        = TXTCOMMAND | ESCAPED | BRACKETS
Eckhart Arnold's avatar
Eckhart Arnold committed
165
    generic_command     = !no_command CMDNAME [[ //~ config ] //~ block ]
166
    
167
168
    citet               = "\citet" [config] block
    citep               = ("\citep" | "\cite") [config] block
169
    footnote            = "\footnote" block_of_paragraphs
Eckhart Arnold's avatar
Eckhart Arnold committed
170
    includegraphics     = "\includegraphics" [ config ] block
Eckhart Arnold's avatar
Eckhart Arnold committed
171
    caption             = "\caption" block
Eckhart Arnold's avatar
Eckhart Arnold committed
172
173
174
    multicolumn         = "\multicolumn" "{" INTEGER "}" tabular_config block_of_paragraphs
    hline               = "\hline"
    cline               = "\cline{" INTEGER "-" INTEGER "}"
175
176
    documentclass       = "\documentclass" [ config ] block
    pdfinfo             = "\pdfinfo" block
Eckhart Arnold's avatar
Eckhart Arnold committed
177
    
178
    
179
180
181
182
183
184
    #######################################################################
    #
    # low-level text and character sequences
    #
    #######################################################################
    
185
    
186
    config     = "[" cfg_text §"]"
Eckhart Arnold's avatar
Eckhart Arnold committed
187
    cfg_text   = { ([//~] text) | CMDNAME | SPECIAL }
Eckhart Arnold's avatar
Eckhart Arnold committed
188
189
    block      = /{/ //~ { !blockcmd text_element //~ } §/}/
    text       = TEXTCHUNK { //~ TEXTCHUNK }
190
    
Eckhart Arnold's avatar
Eckhart Arnold committed
191
192
193
194
195
    no_command = "\begin{" | "\end" | BACKSLASH structural
    blockcmd   = BACKSLASH ( ( "begin{" | "end{" )
                             ( "enumerate" | "itemize" | "figure" | "quote"
                             | "quotation" | "tabular") "}"
                           | structural | begin_generic_block | end_generic_block )
196
197
198
    
    structural = "subsection" | "section" | "chapter" | "subsubsection"
               | "paragraph" | "subparagraph" | "item"
199
200
201
202
    
    
    #######################################################################
    #
203
    # primitives
204
205
    #
    #######################################################################
206
    
207
    
208
    CMDNAME    = /\\(?:(?!_)\w)+/~
Eckhart Arnold's avatar
Eckhart Arnold committed
209
210
    TXTCOMMAND = /\\text\w+/
    ESCAPED    = /\\[%$&_\/{}]/
Eckhart Arnold's avatar
Eckhart Arnold committed
211
    SPECIAL    = /[$&_\\\\\/]/
Eckhart Arnold's avatar
Eckhart Arnold committed
212
    BRACKETS   = /[\[\]]/                       # left or right square bracket: [ ]
213
    LINEFEED   = /[\\][\\]/
Eckhart Arnold's avatar
Eckhart Arnold committed
214
    
215
    NAME       = /\w+/~
216
    INTEGER    = /\d+/~
217
218
219
    
    TEXTCHUNK  = /[^\\%$&\{\}\[\]\s\n]+/        # some piece of text excluding whitespace,
                                                # linefeed and special characters
Eckhart Arnold's avatar
Eckhart Arnold committed
220
221
222
    LF         = NEW_LINE { COMMENT__ WHITESPACE__ }   # linefeed but not an empty line
    LFF        = NEW_LINE [ WSPC ]              # at least one linefeed
    PARSEP     = { WHITESPACE__ COMMENT__ } GAP [WSPC] # paragraph separator
223
    WSPC       = { COMMENT__ | /\s+/ }+         # arbitrary horizontal or vertical whitespace
Eckhart Arnold's avatar
Eckhart Arnold committed
224
    GAP        = /[ \t]*(?:\n[ \t]*)+\n/~       # at least one empty line, i.e.
225
                                                # [whitespace] linefeed [whitespace] linefeed
Eckhart Arnold's avatar
Eckhart Arnold committed
226
    NEW_LINE   = /[ \t]*/ [COMMENT__] /\n/
227
228
    LB         = /\s*?\n|$/                     # backwards line break for Lookbehind-Operator
                                                # beginning of text marker '$' added for test code
Eckhart Arnold's avatar
Eckhart Arnold committed
229
230
231
    BACKSLASH  = /[\\]/
    
    EOF        = /(?!.)/                        # End-Of-File
232
    """
Eckhart Arnold's avatar
Eckhart Arnold committed
233
    begin_generic_block = Forward()
234
    block_environment = Forward()
235
    block_of_paragraphs = Forward()
Eckhart Arnold's avatar
Eckhart Arnold committed
236
    end_generic_block = Forward()
Eckhart Arnold's avatar
Eckhart Arnold committed
237
    paragraph = Forward()
238
    tabular_config = Forward()
239
    text_element = Forward()
240
    source_hash__ = "79e85f223d89452f2ba796f9c40daac9"
241
    parser_initialization__ = "upon instantiation"
Eckhart Arnold's avatar
Eckhart Arnold committed
242
243
    COMMENT__ = r'%.*'
    WHITESPACE__ = r'[ \t]*(?:\n(?![ \t]*\n)[ \t]*)?'
244
    WSP_RE__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
di68kap's avatar
di68kap committed
245
    wsp__ = Whitespace(WSP_RE__)
Eckhart Arnold's avatar
Eckhart Arnold committed
246
    EOF = RegExp('(?!.)')
Eckhart Arnold's avatar
Eckhart Arnold committed
247
248
    BACKSLASH = RegExp('[\\\\]')
    LB = RegExp('\\s*?\\n|$')
di68kap's avatar
di68kap committed
249
    NEW_LINE = Series(RegExp('[ \\t]*'), Option(RegExp(COMMENT__)), RegExp('\\n'))
di68kap's avatar
di68kap committed
250
    GAP = Series(RegExp('[ \\t]*(?:\\n[ \\t]*)+\\n'), wsp__)
Eckhart Arnold's avatar
Eckhart Arnold committed
251
    WSPC = OneOrMore(Alternative(RegExp(COMMENT__), RegExp('\\s+')))
di68kap's avatar
di68kap committed
252
253
254
    PARSEP = Series(ZeroOrMore(Series(RegExp(WHITESPACE__), RegExp(COMMENT__))), GAP, Option(WSPC))
    LFF = Series(NEW_LINE, Option(WSPC))
    LF = Series(NEW_LINE, ZeroOrMore(Series(RegExp(COMMENT__), RegExp(WHITESPACE__))))
Eckhart Arnold's avatar
Eckhart Arnold committed
255
    TEXTCHUNK = RegExp('[^\\\\%$&\\{\\}\\[\\]\\s\\n]+')
di68kap's avatar
di68kap committed
256
257
    INTEGER = Series(RegExp('\\d+'), wsp__)
    NAME = Capture(Series(RegExp('\\w+'), wsp__))
258
    LINEFEED = RegExp('[\\\\][\\\\]')
Eckhart Arnold's avatar
Eckhart Arnold committed
259
    BRACKETS = RegExp('[\\[\\]]')
Eckhart Arnold's avatar
Eckhart Arnold committed
260
    SPECIAL = RegExp('[$&_\\\\\\\\/]')
Eckhart Arnold's avatar
Eckhart Arnold committed
261
262
    ESCAPED = RegExp('\\\\[%$&_/{}]')
    TXTCOMMAND = RegExp('\\\\text\\w+')
di68kap's avatar
di68kap committed
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
    CMDNAME = Series(RegExp('\\\\(?:(?!_)\\w)+'), wsp__)
    structural = Alternative(Series(Token("subsection"), wsp__), Series(Token("section"), wsp__), Series(Token("chapter"), wsp__), Series(Token("subsubsection"), wsp__), Series(Token("paragraph"), wsp__), Series(Token("subparagraph"), wsp__), Series(Token("item"), wsp__))
    blockcmd = Series(BACKSLASH, Alternative(Series(Alternative(Series(Token("begin{"), wsp__), Series(Token("end{"), wsp__)), Alternative(Series(Token("enumerate"), wsp__), Series(Token("itemize"), wsp__), Series(Token("figure"), wsp__), Series(Token("quote"), wsp__), Series(Token("quotation"), wsp__), Series(Token("tabular"), wsp__)), Series(Token("}"), wsp__)), structural, begin_generic_block, end_generic_block))
    no_command = Alternative(Series(Token("\\begin{"), wsp__), Series(Token("\\end"), wsp__), Series(BACKSLASH, structural))
    text = Series(TEXTCHUNK, ZeroOrMore(Series(RegExp(''), wsp__, TEXTCHUNK)))
    block = Series(RegExp('{'), RegExp(''), wsp__, ZeroOrMore(Series(NegativeLookahead(blockcmd), text_element, RegExp(''), wsp__)), RegExp('}'), mandatory=4)
    cfg_text = ZeroOrMore(Alternative(Series(Option(Series(RegExp(''), wsp__)), text), CMDNAME, SPECIAL))
    config = Series(Series(Token("["), wsp__), cfg_text, Series(Token("]"), wsp__), mandatory=2)
    pdfinfo = Series(Series(Token("\\pdfinfo"), wsp__), block)
    documentclass = Series(Series(Token("\\documentclass"), wsp__), Option(config), block)
    cline = Series(Series(Token("\\cline{"), wsp__), INTEGER, Series(Token("-"), wsp__), INTEGER, Series(Token("}"), wsp__))
    hline = Series(Token("\\hline"), wsp__)
    multicolumn = Series(Series(Token("\\multicolumn"), wsp__), Series(Token("{"), wsp__), INTEGER, Series(Token("}"), wsp__), tabular_config, block_of_paragraphs)
    caption = Series(Series(Token("\\caption"), wsp__), block)
    includegraphics = Series(Series(Token("\\includegraphics"), wsp__), Option(config), block)
    footnote = Series(Series(Token("\\footnote"), wsp__), block_of_paragraphs)
    citep = Series(Alternative(Series(Token("\\citep"), wsp__), Series(Token("\\cite"), wsp__)), Option(config), block)
    citet = Series(Series(Token("\\citet"), wsp__), Option(config), block)
    generic_command = Series(NegativeLookahead(no_command), CMDNAME, Option(Series(Option(Series(RegExp(''), wsp__, config)), RegExp(''), wsp__, block)))
282
    text_command = Alternative(TXTCOMMAND, ESCAPED, BRACKETS)
283
    known_command = Alternative(citet, citep, footnote, includegraphics, caption, multicolumn, hline, cline, documentclass, pdfinfo)
Eckhart Arnold's avatar
Eckhart Arnold committed
284
    command = Alternative(known_command, text_command, generic_command)
285
286
287
    inline_math = Series(RegExp('\\$'), RegExp('[^$]*'), RegExp('\\$'), mandatory=2)
    end_environment = Series(RegExp('\\\\end{'), Pop(NAME), RegExp('}'), mandatory=1)
    begin_environment = Series(RegExp('\\\\begin{'), NAME, RegExp('}'), mandatory=1)
288
    end_inline_env = Synonym(end_environment)
di68kap's avatar
di68kap committed
289
    begin_inline_env = Alternative(Series(NegativeLookbehind(LB), begin_environment), Series(begin_environment, NegativeLookahead(LFF)))
di68kap's avatar
di68kap committed
290
    generic_inline_env = Series(begin_inline_env, RegExp(''), wsp__, paragraph, end_inline_env, mandatory=4)
291
    known_inline_env = Synonym(inline_math)
292
    inline_environment = Alternative(known_inline_env, generic_inline_env)
293
294
    line_element = Alternative(text, block, inline_environment, command)
    text_element.set(Alternative(line_element, LINEFEED))
di68kap's avatar
di68kap committed
295
    paragraph.set(OneOrMore(Series(NegativeLookahead(blockcmd), text_element, RegExp(''), wsp__)))
eckhart's avatar
eckhart committed
296
    sequence = Series(Option(WSPC), OneOrMore(Series(Alternative(paragraph, block_environment), Option(PARSEP))))
di68kap's avatar
di68kap committed
297
298
299
300
301
302
303
304
305
306
307
    block_of_paragraphs.set(Series(Series(Token("{"), wsp__), Option(sequence), Series(Token("}"), wsp__), mandatory=2))
    tabular_config.set(Series(Series(Token("{"), wsp__), RegExp('[lcr|]+'), wsp__, Series(Token("}"), wsp__), mandatory=3))
    tabular_cell = ZeroOrMore(Series(line_element, RegExp(''), wsp__))
    tabular_row = Series(Alternative(multicolumn, tabular_cell), ZeroOrMore(Series(Series(Token("&"), wsp__), Alternative(multicolumn, tabular_cell))), Series(Token("\\\\"), wsp__), Alternative(hline, ZeroOrMore(cline)))
    tabular = Series(Series(Token("\\begin{tabular}"), wsp__), tabular_config, ZeroOrMore(tabular_row), Series(Token("\\end{tabular}"), wsp__), mandatory=3)
    verbatim = Series(Series(Token("\\begin{verbatim}"), wsp__), sequence, Series(Token("\\end{verbatim}"), wsp__), mandatory=2)
    quotation = Alternative(Series(Series(Token("\\begin{quotation}"), wsp__), sequence, Series(Token("\\end{quotation}"), wsp__), mandatory=2), Series(Series(Token("\\begin{quote}"), wsp__), sequence, Series(Token("\\end{quote}"), wsp__), mandatory=2))
    figure = Series(Series(Token("\\begin{figure}"), wsp__), sequence, Series(Token("\\end{figure}"), wsp__), mandatory=2)
    item = Series(Series(Token("\\item"), wsp__), sequence)
    enumerate = Series(Series(Token("\\begin{enumerate}"), wsp__), Option(WSPC), ZeroOrMore(item), Series(Token("\\end{enumerate}"), wsp__), mandatory=3)
    itemize = Series(Series(Token("\\begin{itemize}"), wsp__), Option(WSPC), ZeroOrMore(item), Series(Token("\\end{itemize}"), wsp__), mandatory=3)
di68kap's avatar
di68kap committed
308
309
    end_generic_block.set(Series(Lookbehind(LB), end_environment, LFF))
    begin_generic_block.set(Series(Lookbehind(LB), begin_environment, LFF))
310
    generic_block = Series(begin_generic_block, sequence, end_generic_block, mandatory=2)
Eckhart Arnold's avatar
Eckhart Arnold committed
311
    known_environment = Alternative(itemize, enumerate, figure, tabular, quotation, verbatim)
Eckhart Arnold's avatar
Eckhart Arnold committed
312
    block_environment.set(Alternative(known_environment, generic_block))
313
    heading = Synonym(block)
di68kap's avatar
di68kap committed
314
315
316
    Index = Series(Option(WSPC), Series(Token("\\printindex"), wsp__))
    Bibliography = Series(Option(WSPC), Series(Token("\\bibliography"), wsp__), heading)
    SubParagraph = Series(Series(Token("\\subparagraph"), wsp__), heading, Option(sequence))
eckhart's avatar
eckhart committed
317
    SubParagraphs = OneOrMore(Series(Option(WSPC), SubParagraph))
di68kap's avatar
di68kap committed
318
    Paragraph = Series(Series(Token("\\paragraph"), wsp__), heading, ZeroOrMore(Alternative(sequence, SubParagraphs)))
eckhart's avatar
eckhart committed
319
    Paragraphs = OneOrMore(Series(Option(WSPC), Paragraph))
di68kap's avatar
di68kap committed
320
    SubSubSection = Series(Series(Token("\\subsubsection"), wsp__), heading, ZeroOrMore(Alternative(sequence, Paragraphs)))
eckhart's avatar
eckhart committed
321
    SubSubSections = OneOrMore(Series(Option(WSPC), SubSubSection))
di68kap's avatar
di68kap committed
322
    SubSection = Series(Series(Token("\\subsection"), wsp__), heading, ZeroOrMore(Alternative(sequence, SubSubSections)))
eckhart's avatar
eckhart committed
323
    SubSections = OneOrMore(Series(Option(WSPC), SubSection))
di68kap's avatar
di68kap committed
324
    Section = Series(Series(Token("\\section"), wsp__), heading, ZeroOrMore(Alternative(sequence, SubSections)))
eckhart's avatar
eckhart committed
325
    Sections = OneOrMore(Series(Option(WSPC), Section))
di68kap's avatar
di68kap committed
326
    Chapter = Series(Series(Token("\\chapter"), wsp__), heading, ZeroOrMore(Alternative(sequence, Sections)))
eckhart's avatar
eckhart committed
327
    Chapters = OneOrMore(Series(Option(WSPC), Chapter))
328
    frontpages = Synonym(sequence)
di68kap's avatar
di68kap committed
329
    document = Series(Option(WSPC), Series(Token("\\begin{document}"), wsp__), frontpages, Alternative(Chapters, Sections), Option(Bibliography), Option(Index), Option(WSPC), Series(Token("\\end{document}"), wsp__), Option(WSPC), EOF, mandatory=9)
di68kap's avatar
di68kap committed
330
331
    preamble = OneOrMore(Series(Option(WSPC), command))
    latexdoc = Series(preamble, document)
332
333
334
335
336
337
338
339
    root__ = latexdoc
    
def get_grammar() -> LaTeXGrammar:
    """Return the shared ``LaTeXGrammar`` instance, creating it on first use.

    The instance is cached in the module-level global
    ``thread_local_LaTeX_grammar_singleton``. A ``NameError`` on reading
    that global means it has not been created yet.

    NOTE(review): despite the variable's name, nothing in this function
    makes the cache thread-local; it is a plain module global here.
    """
    global thread_local_LaTeX_grammar_singleton
    try:
        return thread_local_LaTeX_grammar_singleton
    except NameError:
        # First call: the global does not exist yet, so create it now.
        thread_local_LaTeX_grammar_singleton = LaTeXGrammar()
    return thread_local_LaTeX_grammar_singleton
342
343
344
345
346
347
348
349
350


#######################################################################
#
# AST SECTION - Can be edited. Changes will be preserved.
#
#######################################################################


351
def streamline_whitespace(context):
    """Normalize the whitespace node at the end of ``context`` in place.

    ``context[-1]`` is the node to normalize and ``context[-2]`` its parent.
    Nothing happens when the parent's parser type is ``":_Token"``.
    Otherwise the node's result is replaced by:

    * ``'\\n'`` if its content contains a comment marker (``%``) or a
      linefeed (any comment text is discarded),
    * ``' '`` if it contains other, non-empty whitespace,
    * ``''`` if it is empty.
    """
    parent, node = context[-2], context[-1]
    if parent.parser.ptype == ":_Token":
        return

    assert node.tag_name in ['WSPC', ':Whitespace']

    content = node.content
    has_comment = content.find('%') >= 0
    if has_comment or content.find('\n') >= 0:
        node.result = '\n'
    elif content:
        node.result = ' '
    else:
        node.result = ''
366
367


368
369
370
def watch(node):
    """Debugging aid: print ``node.as_sxpr()`` to standard output."""
    sxpr = node.as_sxpr()
    print(sxpr)

eckhart's avatar
eckhart committed
371
372
# Tag names of the container nodes that flatten_structure dissolves,
# in addition to anonymous nodes.
_STRUCTURE_CONTAINERS = {
    "Chapters", "Sections", "SubSections", "SubSubSections", "Paragraphs",
    "SubParagraphs", "sequence",
}

# Recursive flatten that dissolves anonymous nodes and the container nodes
# named above.
flatten_structure = flatten(
    lambda context: is_anonymous(context) or is_one_of(context, _STRUCTURE_CONTAINERS),
    recursive=True)
eckhart's avatar
eckhart committed
374
375
376


def is_commandname(context):
    """Tell whether the last node of ``context`` merely spells out the
    LaTeX command or environment delimiter of its parent node.

    Returns True only if the last node is a token, its parent
    (``context[-2]``) has more than one child, and the token's text is
    ``\\name``, ``\\begin{name}`` or ``\\end{name}``, where ``name`` is the
    parent's lower-cased tag name. Returns False in every other case.
    """
    node = context[-1]
    if node.parser.ptype != TOKEN_PTYPE:
        return False

    parent = context[-2]
    if len(parent.children) <= 1:
        return False

    name = parent.tag_name.lower()
    spellings = ('\\' + name,
                 '\\begin{' + name + '}',
                 '\\end{' + name + '}')
    return str(node) in spellings


def _is_expendable(context):
    """True for nodes that are empty, are PARSEP/WSPC nodes, or merely
    repeat the command name of their parent (see ``is_commandname``)."""
    return (is_empty(context)
            or is_one_of(context, {'PARSEP', 'WSPC'})
            or is_commandname(context))


# Removes every child for which _is_expendable holds.
drop_expendables = remove_children_if(_is_expendable)

396

397
398
# Maps tag names to the transformation (or list of transformations) that
# is applied to nodes carrying that tag. A key may name several tags at
# once, separated by commas. An empty list means that no tag-specific
# transformation is registered for that tag.
# NOTE(review): presumably "+" is applied to every node before its own
# entry and "*" is the fallback for tags without an entry -- this follows
# DHParser's `traverse` conventions and is not visible in this file; confirm.
LaTeX_AST_transformation_table = {
    # AST Transformations for the LaTeX-grammar
    "+": [drop_expendables, flatten_structure],
    "latexdoc": [],
    "preamble": [traverse_locally({'+': remove_whitespace, 'block': replace_by_single_child})],
    "document": [flatten_structure],
    "pdfinfo": [],
    "frontpages": reduce_single_child,
    "Chapters, Sections, SubSections, SubSubSections, Paragraphs, SubParagraphs": [],
    "Chapter, Section, SubSection, SubSubSection, Paragraph, SubParagraph": [],
    "heading": reduce_single_child,
    "Bibliography": [],
    "Index": [],
    "block_environment": replace_by_single_child,
    "known_environment": replace_by_single_child,
    "generic_block": [],
    "begin_generic_block, end_generic_block": [remove_nodes('NEW_LINE'), replace_by_single_child],
    "itemize, enumerate": [remove_brackets, flatten],
    "item": [],
    "figure": [],
    "quotation": [reduce_single_child, remove_brackets],
    "verbatim": [],
    "tabular": [],
    "tabular_config, block_of_paragraphs": [remove_brackets, reduce_single_child],
    # NOTE(review): '\\' below is a single backslash, whereas the grammar
    # ends a tabular_row with Token("\\\\"), i.e. two backslashes. Verify
    # that the row terminator is really meant to be (and is) removed here.
    "tabular_row": [flatten, remove_tokens('&', '\\')],
    "tabular_cell": [flatten, remove_whitespace],
    "multicolumn": [remove_tokens('{', '}')],
    "hline": [remove_whitespace, reduce_single_child],
    "sequence": [flatten],
    "paragraph": [flatten],
    "text_element": replace_by_single_child,
    "line_element": replace_by_single_child,
    "inline_environment": replace_by_single_child,
    "known_inline_env": replace_by_single_child,
    "generic_inline_env": [],
    "begin_inline_env, end_inline_env": [replace_by_single_child],
    "begin_environment, end_environment": [remove_brackets, reduce_single_child],
    "inline_math": [remove_brackets, reduce_single_child],
    "command": replace_by_single_child,
    "known_command": replace_by_single_child,
    "text_command": [],
    "generic_command": [flatten],
    "footnote": [],
    "includegraphics": [],
    "caption": [],
    "config": [remove_brackets, reduce_single_child],
    "block": [remove_brackets, flatten, replace_by_single_child],
    "text": collapse,
    "no_command, blockcmd": [],
    "structural": [],
    "CMDNAME": [remove_whitespace, reduce_single_child],
    "TXTCOMMAND": [remove_whitespace, reduce_single_child],
    "NAME": [reduce_single_child, remove_whitespace, reduce_single_child],
    # Strip the first character of the node's text (the escaping backslash,
    # given the ESCAPED regex in the grammar).
    "ESCAPED": [replace_content(lambda node: str(node)[1:])],
    "BRACKETS": [],
    "TEXTCHUNK": [],
    "LF": [],
    # Every paragraph separator is replaced by exactly two linefeeds.
    "PARSEP": replace_content(lambda node: '\n\n'),
    "GAP": [],
    "LB": [],
    "BACKSLASH": [],
    "EOF": [],
    # "PARSEP": [replace_content_by('\n\n')],
    # "WSPC": [replace_content_by(' ')],
    ":Whitespace": streamline_whitespace,
    "*": replace_by_single_child
}

465

466
467
def LaTeXTransform() -> TransformationFunc:
    """Create the AST transformation function for LaTeX syntax trees.

    Returns:
        ``traverse`` with its ``processing_table`` argument pre-bound to a
        copy of ``LaTeX_AST_transformation_table``. The result is a
        callable, not a table, hence the ``TransformationFunc`` return
        annotation (the same one ``get_transformer`` uses).

    The copy is shallow (``dict.copy``): adding or replacing entries on the
    bound table does not affect the module-level table, but the
    transformation lists inside are shared.
    """
    return partial(traverse, processing_table=LaTeX_AST_transformation_table.copy())
468

469

470
def get_transformer() -> TransformationFunc:
    """Return the shared AST transformation function, creating it on first use.

    The function is obtained from ``LaTeXTransform()`` and cached in the
    module-level global ``thread_local_LaTeX_transformer_singleton``. A
    ``NameError`` on reading that global means it has not been created yet.

    NOTE(review): despite the variable's name, nothing in this function
    makes the cache thread-local; it is a plain module global here.
    """
    global thread_local_LaTeX_transformer_singleton
    try:
        return thread_local_LaTeX_transformer_singleton
    except NameError:
        # First call: the global does not exist yet, so create it now.
        thread_local_LaTeX_transformer_singleton = LaTeXTransform()
    return thread_local_LaTeX_transformer_singleton

479
480
481
482
483
484
485
486


#######################################################################
#
# COMPILER SECTION - Can be edited. Changes will be preserved.
#
#######################################################################

487
488
489
490
491
492

def empty_defaultdict():
    """Create an empty, arbitrarily nestable ``defaultdict``.

    The default factory is this function itself, so looking up a missing
    key yields (and stores) another empty dictionary of the same kind.
    """
    nested = defaultdict(empty_defaultdict)
    return nested


493
494
495
class LaTeXCompiler(Compiler):
    """Compiler for the abstract-syntax-tree of a LaTeX source file.
    """
496
497
    KNOWN_DOCUMENT_CLASSES = {'book', 'article'}
    KNOWN_LANGUAGES = {'english', 'german'}
498
499
500
501

    def __init__(self, grammar_name="LaTeX", grammar_source=""):
        """Initializes the compiler and its (initially empty) metadata.

        Args:
            grammar_name: Name of the grammar; must consist of word
                characters only.
            grammar_source: Passed on unchanged to the base class.
        """
        super().__init__(grammar_name, grammar_source)
        # Raw string: '\w' and '\Z' are regex escapes, not string escapes.
        assert re.match(r'\w+\Z', grammar_name)
        # Nested defaultdict, so metadata[key][subkey]... never raises KeyError.
        self.metadata = defaultdict(empty_defaultdict)
503

504
505
506
507
    # def on_latexdoc(self, node):
    #     self.compile(node['preamble'])
    #     self.compile(node['document'])
    #     return node
508

509
510
    # def on_preamble(self, node):
    #     return node
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534

    # def on_document(self, node):
    #     return node

    # def on_frontpages(self, node):
    #     return node

    # def on_Chapters(self, node):
    #     return node

    # def on_Chapter(self, node):
    #     return node

    # def on_Sections(self, node):
    #     return node

    # def on_Section(self, node):
    #     return node

    # def on_SubSections(self, node):
    #     return node

    # def on_SubSection(self, node):
    #     return node
535

536
537
    # def on_SubSubSections(self, node):
    #     return node
538

539
540
    # def on_SubSubSection(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
541

542
543
    # def on_Paragraphs(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
544

545
546
    # def on_Paragraph(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
547

548
549
    # def on_SubParagraphs(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
550

551
552
    # def on_SubParagraph(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
553

554
555
    # def on_Bibliography(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
556

557
558
    # def on_Index(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
559

560
561
    # def on_heading(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
562

563
564
    # def on_block_environment(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
565

566
567
    # def on_known_environment(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
568

569
570
    # def on_generic_block(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
571

572
573
    # def on_begin_generic_block(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
574

575
576
    # def on_end_generic_block(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
577

578
579
    # def on_itemize(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
580

581
582
    # def on_enumerate(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
583

584
585
    # def on_item(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
586

587
588
    # def on_figure(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
589

590
591
    # def on_quotation(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
592

593
594
    # def on_verbatim(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
595

596
597
    # def on_tabular(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
598

599
600
    # def on_tabular_row(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
601

602
603
    # def on_tabular_cell(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
604

605
606
    # def on_tabular_config(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
607

608
609
    # def on_block_of_paragraphs(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
610

611
612
    # def on_sequence(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
613

614
615
    # def on_paragraph(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
616

617
618
    # def on_text_element(self, node):
    #     return node
619

620
621
    # def on_line_element(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
622

623
624
    # def on_inline_environment(self, node):
    #     return node
625

626
627
    # def on_known_inline_env(self, node):
    #     return node
628

629
630
    # def on_generic_inline_env(self, node):
    #     return node
631

632
633
    # def on_begin_inline_env(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
634

635
636
    # def on_end_inline_env(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
637

638
639
    # def on_begin_environment(self, node):
    #     return node
640

641
642
    # def on_end_environment(self, node):
    #     return node
643

644
645
    # def on_inline_math(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
646

647
648
    # def on_command(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
649

650
651
    # def on_known_command(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
652

653
654
    # def on_text_command(self, node):
    #     return node
655

656
657
    # def on_generic_command(self, node):
    #     return node
658

659
660
    # def on_footnote(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
661

662
663
    # def on_includegraphics(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
664

665
666
    # def on_caption(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
667

668
669
    # def on_multicolumn(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
670

671
672
    # def on_hline(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
673

674
675
    # def on_cline(self, node):
    #     return node
676

677
    def on_documentclass(self, node):
        """
        Saves the documentclass (if known) and the language (if given)
        in the metadata dictionary.

        The comma-separated options of the optional 'config' child are
        scanned for entries of ``KNOWN_LANGUAGES``. The first known
        language (in source order) is stored in
        ``self.metadata['language']``; for every further known language
        a warning is added to the tree. If the content of the 'text'
        child is one of ``KNOWN_DOCUMENT_CLASSES``, it is stored in
        ``self.metadata['documentclass']``.

        Returns the node unchanged.
        """
        if 'config' in node:
            raw_options = node['config'].content.split(',')
            # dict.fromkeys() drops repeated options but, unlike a set,
            # keeps their source order, so the language that is kept is
            # always the first one mentioned.
            for it in dict.fromkeys(part.strip() for part in raw_options):
                if it not in self.KNOWN_LANGUAGES:
                    continue
                # Test for membership before reading: self.metadata is a
                # defaultdict, a plain lookup would create the key.
                # NOTE(review): self.metadata is only initialized in
                # __init__; if one compiler instance is reused for several
                # documents, a language stored by an earlier run triggers
                # the warning below - confirm whether metadata should be
                # reset per document.
                if 'language' not in self.metadata:
                    self.metadata['language'] = it
                else:
                    self.tree.new_error(node, 'Only one document language supported. '
                                        'Using %s, ignoring %s.'
                                        % (self.metadata['language'], it), Error.WARNING)
        # Compare the textual content (as done for 'config' above), not the
        # child node object itself, with the known class names.
        documentclass = node['text'].content
        if documentclass in self.KNOWN_DOCUMENT_CLASSES:
            self.metadata['documentclass'] = documentclass
        return node

    def on_pdfinfo(self, node):
        """Passes 'pdfinfo' nodes through unchanged."""
        return node

698
699
    # def on_config(self, node):
    #     return node
700

701
702
    # def on_cfg_text(self, node):
    #     return node
703

704
705
    # def on_block(self, node):
    #     return node
706

707
708
    # def on_text(self, node):
    #     return node
709

710
711
    # def on_no_command(self, node):
    #     return node
712

713
714
    # def on_blockcmd(self, node):
    #     return node
715

716
717
    # def on_structural(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
718

719
720
    # def on_CMDNAME(self, node):
    #     return node
721

722
723
    # def on_TXTCOMMAND(self, node):
    #     return node
724

725
726
    # def on_ESCAPED(self, node):
    #     return node
727

728
729
    # def on_SPECIAL(self, node):
    #     return node
730

731
732
    # def on_BRACKETS(self, node):
    #     return node
733

734
735
    # def on_LINEFEED(self, node):
    #     return node
736

737
738
    # def on_NAME(self, node):
    #     return node
739

740
741
    # def on_INTEGER(self, node):
    #     return node
742

743
744
    # def on_TEXTCHUNK(self, node):
    #     return node
745

746
747
    # def on_LF(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
748

749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
    # def on_LFF(self, node):
    #     return node

    # def on_PARSEP(self, node):
    #     return node

    # def on_WSPC(self, node):
    #     return node

    # def on_GAP(self, node):
    #     return node

    # def on_NEW_LINE(self, node):
    #     return node

    # def on_LB(self, node):
    #     return node

    # def on_BACKSLASH(self, node):
    #     return node

    # def on_EOF(self, node):
    #     return node
Eckhart Arnold's avatar
Eckhart Arnold committed
772

773
774
775
776
777
778
779
780
781

def get_compiler(grammar_name="LaTeX", grammar_source="") -> LaTeXCompiler:
    """Returns the shared LaTeXCompiler, creating it on first use.

    The compiler is cached in the module-level global
    ``thread_local_LaTeX_compiler_singleton``. If it already exists,
    it is re-configured with ``set_grammar_name()``; otherwise a new
    ``LaTeXCompiler`` is created and stored.

    Args:
        grammar_name: Passed to ``LaTeXCompiler`` or ``set_grammar_name``.
        grammar_source: Passed to ``LaTeXCompiler`` or ``set_grammar_name``.
    """
    global thread_local_LaTeX_compiler_singleton
    try:
        compiler = thread_local_LaTeX_compiler_singleton
    except NameError:
        # The global has not been assigned yet: this is the first call.
        thread_local_LaTeX_compiler_singleton = \
            LaTeXCompiler(grammar_name, grammar_source)
        compiler = thread_local_LaTeX_compiler_singleton
    else:
        # Kept outside the try-body so that a NameError raised inside
        # set_grammar_name() is not mistaken for a missing singleton.
        compiler.set_grammar_name(grammar_name, grammar_source)
    return compiler
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799


#######################################################################
#
# END OF DHPARSER-SECTIONS
#
#######################################################################


def compile_src(source):
    """Compiles ``source`` and returns (result, errors, ast).

    ``source`` is handed unchanged to ``compile_source()`` together with
    the preprocessor, grammar, transformer and compiler obtained from the
    respective ``get_...()`` functions. The compilation runs inside the
    ``logging("LOGS")`` context.
    """
    with logging("LOGS"):
        compiler = get_compiler()
        result = compile_source(source, get_preprocessor(),
                                get_grammar(),
                                get_transformer(), compiler)
    return result


if __name__ == "__main__":
    # Script entry point: compile the file named by the first command line
    # argument; print either the errors (exit status 1) or the result.
    if len(sys.argv) <= 1:
        print("Usage: LaTeXCompiler.py [FILENAME]")
    else:
        result, errors, ast = compile_src(sys.argv[1])
        if not errors:
            # Node results are serialized as XML, anything else is printed as is.
            print(result.as_xml() if isinstance(result, Node) else result)
        else:
            for error in errors:
                print(error)
            sys.exit(1)