#!/usr/bin/python

#######################################################################
#
# SYMBOLS SECTION - Can be edited. Changes will be preserved.
#
#######################################################################


import os
import sys
from functools import partial

try:
    import regex as re
except ImportError:
    import re

from DHParser.toolkit import logging, is_filename
from DHParser.parsers import Grammar, Compiler, Alternative, Pop, Required, Token, Synonym, \
    Optional, OneOrMore, Series, RE, Capture, \
    ZeroOrMore, Forward, NegativeLookahead, mixin_comment, compile_source, \
    PreprocessorFunc
from DHParser.syntaxtree import traverse, remove_brackets, reduce_single_child, replace_by_single_child, \
    remove_expendables, flatten, join, \
    collapse, replace_content, TransformationFunc, \
    remove_empty


#######################################################################
#
# SCANNER SECTION - Can be edited. Changes will be preserved.
#
#######################################################################

def LaTeXScanner(text):
    """Preprocessing hook for LaTeX sources; hands ``text`` back unchanged."""
    return text

def get_scanner() -> PreprocessorFunc:
    """Returns the preprocessing function for LaTeX sources.

    The returned callable is ``LaTeXScanner``, which passes the source
    text through unchanged.
    """
    return LaTeXScanner


#######################################################################
#
# PARSER SECTION - Don't edit! CHANGES WILL BE OVERWRITTEN!
#
#######################################################################

class LaTeXGrammar(Grammar):
    r"""Parser for a LaTeX source file, with this grammar:
    
    # LaTeX-Grammar for DHParser
    
    @ testing    = True
    @ whitespace = /[ \t]*(?:\n(?![ \t]*\n)[ \t]*)?/    # optional whitespace, including at most one linefeed
    @ comment    = /%.*(?:\n|$)/
    
    
    latexdoc       = preamble document
    preamble       = { command }+
    
    document       = [PARSEP] "\begin{document}" [PARSEP]
                     frontpages [PARSEP]
                     (Chapters | Sections) [PARSEP]
                     [Bibliography] [Index] [PARSEP]
                     "\end{document}" [PARSEP] §EOF
    frontpages     = sequence
    
    
    #######################################################################
    #
    # document structure
    #
    #######################################################################
    
    Chapters       = { Chapter [PARSEP] }+
    Chapter        = "\Chapter" block [PARSEP] { sequence | Sections }
    
    Sections       = { Section [PARSEP] }+
    Section        = "\Section" block [PARSEP] { sequence | SubSections }
    
    SubSections    = { SubSection [PARSEP] }+
    SubSection     = "\SubSection" block [PARSEP] { sequence | SubSubSections }
    
    SubSubSections = { SubSubSection [PARSEP] }+
    SubSubSection  = "\SubSubSection" block [PARSEP] { sequence | Paragraphs }
    
    Paragraphs     = { Paragraph [PARSEP] }+
    Paragraph      = "\paragraph" block [PARSEP] { sequence | SubParagraphs }
    
    SubParagraphs  = { SubParagraph [PARSEP] }+
    SubParagraph   = "\subparagpaph" block [PARSEP] { sequence }
    
    Bibliography   = "\bibliography" block [PARSEP]
    Index          = "\printindex" [PARSEP]
    
    
    #######################################################################
    #
    # document content
    #
    #######################################################################
    
    
    #### block environments ####
    
    # TODO: ambiguity between generic bock envieronments and generic inline environments
    
    block_environment   = known_environment | generic_environment
    known_environment   = itemize | enumerate | figure | table | quotation
                        | verbatim
    generic_environment = begin_environment sequence §end_environment
    
    itemize             = "\begin{itemize}" [PARSEP] { item } §"\end{itemize}"
    enumerate           = "\begin{enumerate}" [PARSEP] {item } §"\end{enumerate}"
    item                = "\item" [PARSEP] sequence
    
    figure              = "\begin{figure}" sequence "\end{figure}"
    quotation           = ("\begin{quotation}" sequence "\end{quotation}")
                        | ("\begin{quote}" sequence "\end{quote}")
    verbatim            = "\begin{verbatim}" sequence "\end{verbatim}"
    table               = "\begin{tabular}" table_config sequence "\end{tabular}"
    table_config        = "{" /[lcr|]+/~ "}"
    
    
    #### paragraphs and sequences of paragraphs ####
    
    block_of_paragraphs = /{/ sequence §/}/
    sequence            = { (paragraph | block_environment ) [PARSEP] }+
    
    paragraph           = { !blockcmd text_elements //~ }+
    text_elements       = command | text | block | inline_environment
    
    
    #### inline enivronments ####
    
    inline_environment  = known_inline_env | generic_inline_env
    known_inline_env    = inline_math
    generic_inline_env  = begin_environment { text_elements }+ §end_environment
    begin_environment   = "\begin{" §NAME §"}"
    end_environment     = "\end{" §::NAME §"}"
    
    inline_math         = "$" MATH "$"
    
    
    #### commands ####
    
    command             = known_command | generic_command
    known_command       = footnote | includegraphics | caption
    generic_command     = CMDNAME [[ //~ config ] //~ block ]
    
    footnote            = "\footnote" block_of_paragraphs
    includegraphics     = "\includegraphics" config block
    caption             = "\caption" block
    
    #######################################################################
    #
    # low-level text and character sequences
    #
    #######################################################################
    
    config     = "[" cfgtext §"]"
    block      = /{/ { text_elements } §/}/
    
    text       = { cfgtext | (BRACKETS //~) }+
    cfgtext    = { word_sequence | (ESCAPED //~) }+
    word_sequence = { TEXTCHUNK //~ }+
    
    blockcmd   = /[\\]/ ( ( "begin{" | "end{" )
                          ( "enumerate" | "itemize" | "figure" | "quote"
                          | "quotation" | "tabular") "}"
                        | structural)
    
    structural = "subsection" | "section" | "chapter" | "subsubsection"
               | "paragraph" | "subparagraph" | "item"
    
    
    #######################################################################
    #
    # Primitives
    #
    #######################################################################
    
    CMDNAME    = /\\(?:(?!_)\w)+/~
    NAME       = /\w+/~
    MATH       = /[\w_^{}[\]]*/~
    
    ESCAPED    = /\\[%$&_\/]/
    BRACKETS   = /[\[\]]/                       # left or right square bracket: [ ]
    TEXTCHUNK  = /[^\\%$&\{\}\[\]\s\n]+/        # some piece of text excluding whitespace,
                                                # linefeed and special characters
    WSPC       = /[ \t]+/                       # (horizontal) whitespace
    LF         = !PARSEP /[ \t]*\n[ \t]*/       # LF but not an empty line
    PARSEP     = /[ \t]*(?:\n[ \t]*)+\n[ \t]*/  # at least one empty line, i.e.
                                                # [whitespace] linefeed [whitespace] linefeed
    EOF        = !/./
    """
    # Placeholders for rules that are referenced by other rules before their
    # own definition further down; each is bound later with ``.set(...)``.
    block_environment = Forward()
    block_of_paragraphs = Forward()
    text_elements = Forward()

    # Bookkeeping attributes of the generated parser.
    # NOTE(review): source_hash__ is presumably a hash of the grammar source
    # this class was generated from -- confirm against DHParser.
    source_hash__ = "9a8cba2b425d276af78e141d7dda162c"
    parser_initialization__ = "upon instantiation"

    # Comment and whitespace patterns; these repeat the @comment and
    # @whitespace directives of the grammar in the docstring.
    COMMENT__ = r'%.*(?:\n|$)'
    WSP__ = mixin_comment(whitespace=r'[ \t]*(?:\n(?![ \t]*\n)[ \t]*)?', comment=r'%.*(?:\n|$)')
    wspL__ = ''
    wspR__ = WSP__

    # The parser objects below appear in the reverse order of the grammar in
    # the docstring: primitives first, the root rule last.

    # --- primitives ---
    EOF = NegativeLookahead(RE('.', wR=''))
    PARSEP = RE('[ \\t]*(?:\\n[ \\t]*)+\\n[ \\t]*', wR='')
    LF = Series(NegativeLookahead(PARSEP), RE('[ \\t]*\\n[ \\t]*', wR=''))
    WSPC = RE('[ \\t]+', wR='')
    TEXTCHUNK = RE('[^\\\\%$&\\{\\}\\[\\]\\s\\n]+', wR='')
    BRACKETS = RE('[\\[\\]]', wR='')
    ESCAPED = RE('\\\\[%$&_/]', wR='')
    MATH = RE('[\\w_^{}[\\]]*')
    # NAME is wrapped in Capture; end_environment below wraps it in Pop,
    # corresponding to ``§::NAME`` in the grammar.
    NAME = Capture(RE('\\w+'))
    CMDNAME = RE('\\\\(?:(?!_)\\w)+')

    # --- low-level text and character sequences ---
    structural = Alternative(Token("subsection"), Token("section"), Token("chapter"), Token("subsubsection"),
                             Token("paragraph"), Token("subparagraph"), Token("item"))
    blockcmd = Series(RE('[\\\\]', wR=''), Alternative(Series(Alternative(Token("begin{"), Token("end{")),
                                                              Alternative(Token("enumerate"), Token("itemize"),
                                                                          Token("figure"), Token("quote"),
                                                                          Token("quotation"), Token("tabular")),
                                                              Token("}")), structural))
    word_sequence = OneOrMore(Series(TEXTCHUNK, RE('')))
    cfgtext = OneOrMore(Alternative(word_sequence, Series(ESCAPED, RE(''))))
    text = OneOrMore(Alternative(cfgtext, Series(BRACKETS, RE(''))))
    block = Series(RE('{', wR=''), ZeroOrMore(text_elements), Required(RE('}', wR='')))
    config = Series(Token("["), cfgtext, Required(Token("]")))

    # --- commands ---
    caption = Series(Token("\\caption"), block)
    includegraphics = Series(Token("\\includegraphics"), config, block)
    footnote = Series(Token("\\footnote"), block_of_paragraphs)
    generic_command = Series(CMDNAME, Optional(Series(Optional(Series(RE(''), config)), RE(''), block)))
    known_command = Alternative(footnote, includegraphics, caption)
    command = Alternative(known_command, generic_command)

    # --- inline environments ---
    inline_math = Series(Token("$"), MATH, Token("$"))
    end_environment = Series(Token("\\end{"), Required(Pop(NAME)), Required(Token("}")))
    begin_environment = Series(Token("\\begin{"), Required(NAME), Required(Token("}")))
    generic_inline_env = Series(begin_environment, OneOrMore(text_elements), Required(end_environment))
    known_inline_env = Synonym(inline_math)
    inline_environment = Alternative(known_inline_env, generic_inline_env)
    text_elements.set(Alternative(command, text, block, inline_environment))

    # --- paragraphs and sequences of paragraphs ---
    paragraph = OneOrMore(Series(NegativeLookahead(blockcmd), text_elements, RE('')))
    sequence = OneOrMore(Series(Alternative(paragraph, block_environment), Optional(PARSEP)))
    block_of_paragraphs.set(Series(RE('{', wR=''), sequence, Required(RE('}', wR=''))))

    # --- block environments ---
    table_config = Series(Token("{"), RE('[lcr|]+'), Token("}"))
    table = Series(Token("\\begin{tabular}"), table_config, sequence, Token("\\end{tabular}"))
    verbatim = Series(Token("\\begin{verbatim}"), sequence, Token("\\end{verbatim}"))
    quotation = Alternative(Series(Token("\\begin{quotation}"), sequence, Token("\\end{quotation}")), Series(Token("\\begin{quote}"), sequence, Token("\\end{quote}")))
    figure = Series(Token("\\begin{figure}"), sequence, Token("\\end{figure}"))
    item = Series(Token("\\item"), Optional(PARSEP), sequence)
    # NOTE: within this class body the name ``enumerate`` shadows the builtin.
    enumerate = Series(Token("\\begin{enumerate}"), Optional(PARSEP), ZeroOrMore(item),
                       Required(Token("\\end{enumerate}")))
    itemize = Series(Token("\\begin{itemize}"), Optional(PARSEP), ZeroOrMore(item), Required(Token("\\end{itemize}")))
    generic_environment = Series(begin_environment, sequence, Required(end_environment))
    known_environment = Alternative(itemize, enumerate, figure, table, quotation, verbatim)
    block_environment.set(Alternative(known_environment, generic_environment))

    # --- document structure ---
    Index = Series(Token("\\printindex"), Optional(PARSEP))
    Bibliography = Series(Token("\\bibliography"), block, Optional(PARSEP))
    # NOTE(review): the token is spelled "\subparagpaph" here and in the
    # grammar above, whereas ``structural`` uses "subparagraph". This looks
    # like a typo; since this section is generated, any correction belongs in
    # the grammar source -- confirm before changing.
    SubParagraph = Series(Token("\\subparagpaph"), block, Optional(PARSEP), ZeroOrMore(sequence))
    SubParagraphs = OneOrMore(Series(SubParagraph, Optional(PARSEP)))
    Paragraph = Series(Token("\\paragraph"), block, Optional(PARSEP), ZeroOrMore(Alternative(sequence, SubParagraphs)))
    Paragraphs = OneOrMore(Series(Paragraph, Optional(PARSEP)))
    SubSubSection = Series(Token("\\SubSubSection"), block, Optional(PARSEP), ZeroOrMore(Alternative(sequence, Paragraphs)))
    SubSubSections = OneOrMore(Series(SubSubSection, Optional(PARSEP)))
    SubSection = Series(Token("\\SubSection"), block, Optional(PARSEP), ZeroOrMore(Alternative(sequence, SubSubSections)))
    SubSections = OneOrMore(Series(SubSection, Optional(PARSEP)))
    Section = Series(Token("\\Section"), block, Optional(PARSEP), ZeroOrMore(Alternative(sequence, SubSections)))
    Sections = OneOrMore(Series(Section, Optional(PARSEP)))
    Chapter = Series(Token("\\Chapter"), block, Optional(PARSEP), ZeroOrMore(Alternative(sequence, Sections)))
    Chapters = OneOrMore(Series(Chapter, Optional(PARSEP)))
    frontpages = Synonym(sequence)
    document = Series(Optional(PARSEP), Token("\\begin{document}"), Optional(PARSEP), frontpages, Optional(PARSEP), Alternative(Chapters, Sections), Optional(PARSEP), Optional(Bibliography), Optional(Index), Optional(PARSEP), Token("\\end{document}"), Optional(PARSEP), Required(EOF))
    preamble = OneOrMore(command)
    latexdoc = Series(preamble, document)

    # Root rule of the grammar.
    root__ = latexdoc
    
def get_grammar() -> LaTeXGrammar:
    """Returns the module-wide ``LaTeXGrammar`` instance.

    The instance lives in the module global
    ``thread_local_LaTeX_grammar_singleton``; it is created on the first
    call and handed back unchanged on every later call.
    """
    global thread_local_LaTeX_grammar_singleton
    try:
        return thread_local_LaTeX_grammar_singleton
    except NameError:
        # The global does not exist yet: this is the first call.
        thread_local_LaTeX_grammar_singleton = LaTeXGrammar()
        return thread_local_LaTeX_grammar_singleton


#######################################################################
#
# AST SECTION - Can be edited. Changes will be preserved.
#
#######################################################################


def streamline_whitespace(node):
    """Normalizes the content of a whitespace node in place.

    ``node`` must have the tag name 'WSPC' or ':Whitespace'. Its ``result``
    is replaced as follows:

    - no '%' and no linefeed in the content: a single blank
    - no '%' but a linefeed: a single linefeed
    - a '%' is present: everything from the first '%' onwards, with trailing
      blanks and tabs stripped, prefixed by a linefeed if a linefeed occurs
      before the '%', otherwise by two blanks
    """
    assert node.tag_name in ['WSPC', ':Whitespace']
    content = str(node)
    comment_at = content.find('%')
    newline_at = content.find('\n')

    if comment_at < 0:
        # Plain whitespace without a comment.
        node.result = '\n' if newline_at >= 0 else ' '
        return

    # A linefeed in front of the comment is kept as the separator;
    # otherwise the comment is set off by two blanks.
    linefeed_first = 0 <= newline_at < comment_at
    separator = '\n' if linefeed_first else '  '
    node.result = separator + content[comment_at:].rstrip(' \t')


# Table of AST transformations, keyed by node name. Each value is a single
# transformation or a list of transformations; an empty list leaves that kind
# of node without any transformation of its own. The table is handed to
# ``traverse`` via ``LaTeXTransform`` below.
#
# NOTE(review): the keys "blockenv", "parblock", "inlineenv", "beginenv" and
# "endenv" do not match any rule name of LaTeXGrammar in this file (which
# defines block_environment, block_of_paragraphs, inline_environment,
# begin_environment and end_environment) -- presumably leftovers from an
# earlier grammar version; confirm and rename.
LaTeX_AST_transformation_table = {
    # AST Transformations for the LaTeX-grammar
    "+":  # presumably a wildcard key applied to every node -- TODO confirm
        remove_empty,
    "latexdoc": [],
    "preamble": [],
    "document": [],
    "blockenv": [],
    "parblock": [],
    "sequence":
        flatten,
    "paragraph":
        [flatten(lambda node: not node.parser.name or node.parser.name == "text"),
         join('text', ':Whitespace')],
    "inlineenv": [],
    "beginenv": [],
    "endenv": [],
    "command": [],
    "config": [],
    "block": [remove_brackets, reduce_single_child],
    "text":
        [reduce_single_child, join('text', 'word_sequence', ':Whitespace')],
    "cfgtext": [flatten, reduce_single_child],
    "word_sequence":
        [collapse],
    "blockcmd": [],
    "CMDNAME":
        [remove_expendables, reduce_single_child],
    "NAME": [],
    "ESCAPED": [reduce_single_child],
    "BRACKETS": [],
    "TEXTCHUNK": [],
    # presumably one entry shared by both node names -- TODO confirm how
    # comma-separated keys are interpreted
    "WSPC, :Whitespace":
        streamline_whitespace,
    "LF":
        replace_content(lambda node: '\n'),
    "PARSEP":
        replace_content(lambda node: '\n\n'),
    "EOF": [],
    "*":  # presumably the fallback for node names not listed -- TODO confirm
        replace_by_single_child,
}

# ``traverse`` with the table above pre-bound as ``processing_table``.
LaTeXTransform = partial(traverse, processing_table=LaTeX_AST_transformation_table)
# LaTeXTransform = lambda tree : 1

def get_transformer() -> TransformationFunc:
    """Returns the AST transformation function, ``LaTeXTransform``."""
    return LaTeXTransform


#######################################################################
#
# COMPILER SECTION - Can be edited. Changes will be preserved.
#
#######################################################################

class LaTeXCompiler(Compiler):
    """Compiler for the abstract-syntax-tree of a LaTeX source file.

    Only ``on_latexdoc`` produces a result (the S-expression of the node);
    all other ``on_*`` methods are stubs that return ``None``.

    NOTE(review): the ``on_*`` methods are presumably dispatched by the
    ``Compiler`` base class according to node names -- confirm. Several of
    them (``on_blockenv``, ``on_parblock``, ``on_inlineenv``,
    ``on_beginenv``, ``on_endenv``) have no rule of that name in
    LaTeXGrammar as defined in this file.
    """

    def __init__(self, grammar_name="LaTeX", grammar_source=""):
        """Initializes the base compiler and checks the grammar name.

        Args:
            grammar_name: name of the grammar; must consist of word
                characters only.
            grammar_source: passed on to ``Compiler.__init__``.
        """
        super().__init__(grammar_name, grammar_source)
        # Raw string: '\w' and '\Z' are regex escapes, not string escapes.
        assert re.match(r'\w+\Z', grammar_name)

    def on_latexdoc(self, node):
        """Returns the S-expression representation of ``node``."""
        return node.as_sexpr()

    # --- stubs: no compilation step implemented yet ---

    def on_preamble(self, node):
        pass

    def on_document(self, node):
        pass

    def on_blockenv(self, node):
        pass

    def on_parblock(self, node):
        pass

    def on_sequence(self, node):
        pass

    def on_paragraph(self, node):
        pass

    def on_inlineenv(self, node):
        pass

    def on_beginenv(self, node):
        pass

    def on_endenv(self, node):
        pass

    def on_command(self, node):
        pass

    def on_config(self, node):
        pass

    def on_block(self, node):
        pass

    def on_text(self, node):
        pass

    def on_cfgtext(self, node):
        pass

    def on_word_sequence(self, node):
        pass

    def on_blockcmd(self, node):
        pass

    def on_CMDNAME(self, node):
        pass

    def on_NAME(self, node):
        pass

    def on_ESCAPED(self, node):
        pass

    def on_BRACKETS(self, node):
        pass

    def on_TEXTCHUNK(self, node):
        pass

    def on_WSPC(self, node):
        pass

    def on_LF(self, node):
        pass

    def on_PARSEP(self, node):
        pass

    def on_EOF(self, node):
        pass


def get_compiler(grammar_name="LaTeX", grammar_source="") -> LaTeXCompiler:
    """Returns the module-wide ``LaTeXCompiler`` instance.

    The instance lives in the module global
    ``thread_local_LaTeX_compiler_singleton``. On the first call it is
    created with the given arguments; on later calls the existing instance
    is updated through ``set_grammar_name`` and returned.
    """
    global thread_local_LaTeX_compiler_singleton
    try:
        thread_local_LaTeX_compiler_singleton.set_grammar_name(grammar_name, grammar_source)
    except NameError:
        # The global does not exist yet: this is the first call.
        thread_local_LaTeX_compiler_singleton = LaTeXCompiler(grammar_name, grammar_source)
    return thread_local_LaTeX_compiler_singleton


#######################################################################
#
# END OF DHPARSER-SECTIONS
#
#######################################################################


def compile_src(source):
    """Compiles ``source`` and returns (result, errors, ast).

    Runs ``compile_source`` with the scanner, grammar, transformer and
    compiler of this module, inside a ``logging("LOGS")`` context.

    Args:
        source: passed through to ``compile_source``; the command-line
            entry point of this module passes ``sys.argv[1]`` here.

    Returns:
        Whatever ``compile_source`` returns; callers in this module unpack
        it as ``(result, errors, ast)``.
    """
    with logging("LOGS"):
        # The compiler is fetched first, as before, so that the order in
        # which the singletons get created does not change.
        compiler = get_compiler()
        result = compile_source(source, get_scanner(),
                                get_grammar(),
                                get_transformer(), compiler)
    return result


# Command-line entry point: compile the file named by the first argument.
if __name__ == "__main__":
    if len(sys.argv) <= 1:
        print("Usage: LaTeXCompiler.py [FILENAME]")
    else:
        result, errors, ast = compile_src(sys.argv[1])
        if not errors:
            print(result)
        else:
            # Report every error, then signal failure via the exit status.
            for error in errors:
                print(error)
            sys.exit(1)