#!/usr/bin/python

#######################################################################
#
# SYMBOLS SECTION - Can be edited. Changes will be preserved.
#
#######################################################################


import os
import sys
from functools import partial

try:
    import regex as re
except ImportError:
    import re
from DHParser.toolkit import logging, is_filename
from DHParser.parsers import Grammar, Compiler, Alternative, Pop, Required, Token, Synonym, \
    Optional, OneOrMore, Series, RE, Capture, \
    ZeroOrMore, Forward, NegativeLookahead, mixin_comment, compile_source, \
    PreprocessorFunc
from DHParser.syntaxtree import traverse, remove_brackets, reduce_single_child, replace_by_single_child, \
    remove_expendables, flatten, join, \
    collapse, replace_content, TransformationFunc, \
    remove_empty


#######################################################################
#
# SCANNER SECTION - Can be edited. Changes will be preserved.
#
#######################################################################

def LaTeXScanner(text):
    """Preprocessing stage for LaTeX sources.

    No preprocessing is performed: the text that is passed in is handed
    back unchanged.
    """
    return text

def get_scanner() -> PreprocessorFunc:
    """Return the preprocessor function for LaTeX sources.

    Returns:
        The module-level ``LaTeXScanner`` function object.
    """
    return LaTeXScanner


#######################################################################
#
# PARSER SECTION - Don't edit! CHANGES WILL BE OVERWRITTEN!
#
#######################################################################

class LaTeXGrammar(Grammar):
    r"""Parser for a LaTeX source file, with this grammar:
    
    # LaTeX-Grammar for DHParser
    
    @ testing    = True
    @ whitespace = /[ \t]*(?:\n(?![ \t]*\n)[ \t]*)?/    # optional whitespace, including at most one linefeed
    @ comment    = /%.*(?:\n|$)/
    
    
    latexdoc       = preamble document
    preamble       = { command }+
    
    document       = [PARSEP] "\begin{document}" [PARSEP]
                     frontpages [PARSEP]
                     (Chapters | Sections) [PARSEP]
                     [Bibliography] [Index] [PARSEP]
                     "\end{document}" [PARSEP] §EOF
    frontpages     = sequence
    
    
    #######################################################################
    #
    # document structure
    #
    #######################################################################
    
    Chapters       = { Chapter [PARSEP] }+
    Chapter        = "\Chapter" block [PARSEP] { sequence | Sections }
    
    Sections       = { Section [PARSEP] }+
    Section        = "\Section" block [PARSEP] { sequence | SubSections }
    
    SubSections    = { SubSection [PARSEP] }+
    SubSection     = "\SubSection" block [PARSEP] { sequence | SubSubSections }
    
    SubSubSections = { SubSubSection [PARSEP] }+
    SubSubSection  = "\SubSubSection" block [PARSEP] { sequence | Paragraphs }
    
    Paragraphs     = { Paragraph [PARSEP] }+
    Paragraph      = "\paragraph" block [PARSEP] { sequence | SubParagraphs }
    
    SubParagraphs  = { SubParagraph [PARSEP] }+
    SubParagraph   = "\subparagpaph" block [PARSEP] { sequence }
    
    Bibliography   = "\bibliography" block [PARSEP]
    Index          = "\printindex" [PARSEP]
    
    
    #######################################################################
    #
    # document content
    #
    #######################################################################
    
    
    #### block environments ####
    
    # TODO: ambiguity between generic bock envieronments and generic inline environments
    
    block_environment   = known_environment | generic_environment
    known_environment   = itemize | enumerate | figure | table | quotation
                        | verbatim
    generic_environment = begin_environment sequence §end_environment
    
    itemize             = "\begin{itemize}" [PARSEP] { item } §"\end{itemize}"
    enumerate           = "\begin{enumerate}" [PARSEP] {item } §"\end{enumerate}"
    item                = "\item" [PARSEP] sequence
    
    figure              = "\begin{figure}" sequence "\end{figure}"
    quotation           = ("\begin{quotation}" sequence "\end{quotation}")
                        | ("\begin{quote}" sequence "\end{quote}")
    verbatim            = "\begin{verbatim}" sequence "\end{verbatim}"
    table               = "\begin{tabular}" table_config sequence "\end{tabular}"
    table_config        = "{" /[lcr|]+/~ "}"
    
    
    #### paragraphs and sequences of paragraphs ####
    
    block_of_paragraphs = /{/ sequence §/}/
    sequence            = { (paragraph | block_environment ) [PARSEP] }+
    
    paragraph           = { !blockcmd text_elements //~ }+
    text_elements       = command | text | block | inline_environment
    
    
    #### inline enivronments ####
    
    inline_environment  = known_inline_env | generic_inline_env
    known_inline_env    = inline_math
    generic_inline_env  = begin_environment { text_elements }+ §end_environment
    begin_environment   = "\begin{" §NAME §"}"
    end_environment     = "\end{" §::NAME §"}"
    
    inline_math         = "$" MATH "$"
    
    
    #### commands ####
    
    command             = known_command | generic_command
    known_command       = footnote | includegraphics | caption
    generic_command     = CMDNAME [[ //~ config ] //~ block ]
    
    footnote            = "\footnote" block_of_paragraphs
    includegraphics     = "\includegraphics" config block
    caption             = "\caption" block
    
    #######################################################################
    #
    # low-level text and character sequences
    #
    #######################################################################
    
    config     = "[" cfgtext §"]"
    block      = /{/ { text_elements } §/}/
    
    text       = { cfgtext | (BRACKETS //~) }+
    cfgtext    = { word_sequence | (ESCAPED //~) }+
    word_sequence = { TEXTCHUNK //~ }+
    
    blockcmd   = /[\\]/ ( ( "begin{" | "end{" )
                          ( "enumerate" | "itemize" | "figure" | "quote"
                          | "quotation" | "tabular") "}"
                        | structural)
    
    structural = "subsection" | "section" | "chapter" | "subsubsection"
               | "paragraph" | "subparagraph" | "item"
    
    
    #######################################################################
    #
    # Primitives
    #
    #######################################################################
    
    CMDNAME    = /\\(?:(?!_)\w)+/~
    NAME       = /\w+/~
    MATH       = /[\w_^{}[\]]*/~
    
    ESCAPED    = /\\[%$&_\/]/
    BRACKETS   = /[\[\]]/                       # left or right square bracket: [ ]
    TEXTCHUNK  = /[^\\%$&\{\}\[\]\s\n]+/        # some piece of text excluding whitespace,
                                                # linefeed and special characters
    WSPC       = /[ \t]+/                       # (horizontal) whitespace
    LF         = !PARSEP /[ \t]*\n[ \t]*/       # LF but not an empty line
    PARSEP     = /[ \t]*(?:\n[ \t]*)+\n[ \t]*/  # at least one empty line, i.e.
                                                # [whitespace] linefeed [whitespace] linefeed
    EOF        = !/./
    """
    # Forward declarations for rules that are referenced before their
    # definition; each one is bound further down via ``.set(...)``.
    block_environment = Forward()
    block_of_paragraphs = Forward()
    text_elements = Forward()
    source_hash__ = "9a8cba2b425d276af78e141d7dda162c"
    parser_initialization__ = "upon instantiation"
    COMMENT__ = r'%.*(?:\n|$)'
    WSP__ = mixin_comment(whitespace=r'[ \t]*(?:\n(?![ \t]*\n)[ \t]*)?', comment=r'%.*(?:\n|$)')
    wspL__ = ''
    wspR__ = WSP__
    EOF = NegativeLookahead(RE('.', wR=''))
    PARSEP = RE('[ \\t]*(?:\\n[ \\t]*)+\\n[ \\t]*', wR='')
    LF = Series(NegativeLookahead(PARSEP), RE('[ \\t]*\\n[ \\t]*', wR=''))
    WSPC = RE('[ \\t]+', wR='')
    TEXTCHUNK = RE('[^\\\\%$&\\{\\}\\[\\]\\s\\n]+', wR='')
    BRACKETS = RE('[\\[\\]]', wR='')
    ESCAPED = RE('\\\\[%$&_/]', wR='')
    MATH = RE('[\\w_^{}[\\]]*')
    NAME = Capture(RE('\\w+'))
    CMDNAME = RE('\\\\(?:(?!_)\\w)+')
    structural = Alternative(Token("subsection"), Token("section"), Token("chapter"), Token("subsubsection"),
                             Token("paragraph"), Token("subparagraph"), Token("item"))
    blockcmd = Series(RE('[\\\\]', wR=''), Alternative(Series(Alternative(Token("begin{"), Token("end{")),
                                                              Alternative(Token("enumerate"), Token("itemize"),
                                                                          Token("figure"), Token("quote"),
                                                                          Token("quotation"), Token("tabular")),
                                                              Token("}")), structural))
    word_sequence = OneOrMore(Series(TEXTCHUNK, RE('')))
    cfgtext = OneOrMore(Alternative(word_sequence, Series(ESCAPED, RE(''))))
    text = OneOrMore(Alternative(cfgtext, Series(BRACKETS, RE(''))))
    block = Series(RE('{', wR=''), ZeroOrMore(text_elements), Required(RE('}', wR='')))
    config = Series(Token("["), cfgtext, Required(Token("]")))
    caption = Series(Token("\\caption"), block)
    includegraphics = Series(Token("\\includegraphics"), config, block)
    footnote = Series(Token("\\footnote"), block_of_paragraphs)
    generic_command = Series(CMDNAME, Optional(Series(Optional(Series(RE(''), config)), RE(''), block)))
    known_command = Alternative(footnote, includegraphics, caption)
    command = Alternative(known_command, generic_command)
    inline_math = Series(Token("$"), MATH, Token("$"))
    end_environment = Series(Token("\\end{"), Required(Pop(NAME)), Required(Token("}")))
    begin_environment = Series(Token("\\begin{"), Required(NAME), Required(Token("}")))
    generic_inline_env = Series(begin_environment, OneOrMore(text_elements), Required(end_environment))
    known_inline_env = Synonym(inline_math)
    inline_environment = Alternative(known_inline_env, generic_inline_env)
    text_elements.set(Alternative(command, text, block, inline_environment))
    paragraph = OneOrMore(Series(NegativeLookahead(blockcmd), text_elements, RE('')))
    sequence = OneOrMore(Series(Alternative(paragraph, block_environment), Optional(PARSEP)))
    block_of_paragraphs.set(Series(RE('{', wR=''), sequence, Required(RE('}', wR=''))))
    table_config = Series(Token("{"), RE('[lcr|]+'), Token("}"))
    table = Series(Token("\\begin{tabular}"), table_config, sequence, Token("\\end{tabular}"))
    verbatim = Series(Token("\\begin{verbatim}"), sequence, Token("\\end{verbatim}"))
    quotation = Alternative(Series(Token("\\begin{quotation}"), sequence, Token("\\end{quotation}")), Series(Token("\\begin{quote}"), sequence, Token("\\end{quote}")))
    figure = Series(Token("\\begin{figure}"), sequence, Token("\\end{figure}"))
    item = Series(Token("\\item"), Optional(PARSEP), sequence)
    enumerate = Series(Token("\\begin{enumerate}"), Optional(PARSEP), ZeroOrMore(item),
                       Required(Token("\\end{enumerate}")))
    itemize = Series(Token("\\begin{itemize}"), Optional(PARSEP), ZeroOrMore(item), Required(Token("\\end{itemize}")))
    generic_environment = Series(begin_environment, sequence, Required(end_environment))
    known_environment = Alternative(itemize, enumerate, figure, table, quotation, verbatim)
    block_environment.set(Alternative(known_environment, generic_environment))
    Index = Series(Token("\\printindex"), Optional(PARSEP))
    Bibliography = Series(Token("\\bibliography"), block, Optional(PARSEP))
    # NOTE(review): "\\subparagpaph" mirrors the grammar text in the docstring
    # and looks like a misspelling of "\\subparagraph"; since this section is
    # generated, correct it in the grammar source and regenerate -- confirm.
    SubParagraph = Series(Token("\\subparagpaph"), block, Optional(PARSEP), ZeroOrMore(sequence))
    SubParagraphs = OneOrMore(Series(SubParagraph, Optional(PARSEP)))
    Paragraph = Series(Token("\\paragraph"), block, Optional(PARSEP), ZeroOrMore(Alternative(sequence, SubParagraphs)))
    Paragraphs = OneOrMore(Series(Paragraph, Optional(PARSEP)))
    SubSubSection = Series(Token("\\SubSubSection"), block, Optional(PARSEP), ZeroOrMore(Alternative(sequence, Paragraphs)))
    SubSubSections = OneOrMore(Series(SubSubSection, Optional(PARSEP)))
    SubSection = Series(Token("\\SubSection"), block, Optional(PARSEP), ZeroOrMore(Alternative(sequence, SubSubSections)))
    SubSections = OneOrMore(Series(SubSection, Optional(PARSEP)))
    Section = Series(Token("\\Section"), block, Optional(PARSEP), ZeroOrMore(Alternative(sequence, SubSections)))
    Sections = OneOrMore(Series(Section, Optional(PARSEP)))
    Chapter = Series(Token("\\Chapter"), block, Optional(PARSEP), ZeroOrMore(Alternative(sequence, Sections)))
    Chapters = OneOrMore(Series(Chapter, Optional(PARSEP)))
    frontpages = Synonym(sequence)
    document = Series(Optional(PARSEP), Token("\\begin{document}"), Optional(PARSEP), frontpages, Optional(PARSEP), Alternative(Chapters, Sections), Optional(PARSEP), Optional(Bibliography), Optional(Index), Optional(PARSEP), Token("\\end{document}"), Optional(PARSEP), Required(EOF))

    preamble = OneOrMore(command)
    latexdoc = Series(preamble, document)
    # Entry point of the grammar: a preamble followed by the document body.
    root__ = latexdoc
    
def get_grammar() -> LaTeXGrammar:
    """Return the shared ``LaTeXGrammar`` instance, creating it on first use.

    The instance is kept in the module-level global
    ``thread_local_LaTeX_grammar_singleton``; a ``NameError`` on lookup
    signals that it has not been created yet.
    """
    global thread_local_LaTeX_grammar_singleton
    try:
        return thread_local_LaTeX_grammar_singleton
    except NameError:
        # First call: the global is still unbound, so build the grammar now.
        thread_local_LaTeX_grammar_singleton = LaTeXGrammar()
    return thread_local_LaTeX_grammar_singleton


#######################################################################
#
# AST SECTION - Can be edited. Changes will be preserved.
#
#######################################################################


def streamline_whitespace(node):
    """Normalize the content of a whitespace node in place.

    The node's string content is replaced (via ``node.result``) as follows:

    * no ``%`` and no linefeed: a single blank;
    * no ``%`` but a linefeed: a single linefeed;
    * a ``%`` present: everything from the first ``%`` onward, with
      trailing blanks and tabs stripped, prefixed by a linefeed if a
      linefeed occurs before the ``%`` and by two blanks otherwise.

    Only nodes tagged ``WSPC`` or ``:Whitespace`` are accepted.
    """
    assert node.tag_name in ['WSPC', ':Whitespace']
    content = str(node)
    comment_pos = content.find('%')
    newline_pos = content.find('\n')

    if comment_pos < 0:
        # Pure whitespace: collapse to one linefeed or one blank.
        node.result = ' ' if newline_pos < 0 else '\n'
        return

    # A comment is present; keep it and record whether a linefeed preceded it.
    linefeed_before_comment = 0 <= newline_pos < comment_pos
    prefix = '\n' if linefeed_before_comment else '  '
    node.result = prefix + content[comment_pos:].rstrip(' \t')


# Maps node/rule names to the transformation (or list of transformations)
# that is handed to ``traverse`` as its ``processing_table``.
#
# NOTE(review): the keys "blockenv", "parblock", "inlineenv", "beginenv" and
# "endenv" do not occur as rule names in the LaTeXGrammar docstring, which
# uses "block_environment", "block_of_paragraphs", "inline_environment",
# "begin_environment" and "end_environment" -- presumably left over from an
# earlier grammar revision; confirm and resynchronize.
LaTeX_AST_transformation_table = {
    # AST Transformations for the LaTeX-grammar
    # "+" and "*" are not rule names; presumably "+" applies to every node and
    # "*" is the fallback for unlisted nodes -- TODO confirm against DHParser.
    "+":
        remove_empty,
    "latexdoc": [],
    "preamble": [],
    "document": [],
    "blockenv": [],
    "parblock": [],
    "sequence":
        flatten,
    "paragraph":
        [flatten(lambda node: not node.parser.name or node.parser.name == "text"),
         join('text', ':Whitespace')],
    "inlineenv": [],
    "beginenv": [],
    "endenv": [],
    "command": [],
    "config": [],
    "block": [remove_brackets, reduce_single_child],
    "text":
        [reduce_single_child, join('text', 'word_sequence', ':Whitespace')],
    "cfgtext": [flatten, reduce_single_child],
    "word_sequence":
        [collapse],
    "blockcmd": [],
    "CMDNAME":
        [remove_expendables, reduce_single_child],
    "NAME": [],
    "ESCAPED": [reduce_single_child],
    "BRACKETS": [],
    "TEXTCHUNK": [],
    # One entry for two node names, separated by a comma inside the key.
    "WSPC, :Whitespace":
        streamline_whitespace,
    "LF":
        replace_content(lambda node: '\n'),
    "PARSEP":
        replace_content(lambda node: '\n\n'),
    "EOF": [],
    "*":
        replace_by_single_child,
}

# The AST transformation: ``traverse`` with the table above pre-bound.
LaTeXTransform = partial(traverse, processing_table=LaTeX_AST_transformation_table)
# Disabled alternative that would skip the transformation step:
# LaTeXTransform = lambda tree : 1

def get_transformer() -> TransformationFunc:
    """Return the AST transformation function for LaTeX (``LaTeXTransform``)."""
    return LaTeXTransform


#######################################################################
#
# COMPILER SECTION - Can be edited. Changes will be preserved.
#
#######################################################################

class LaTeXCompiler(Compiler):
    """Compiler for the abstract-syntax-tree of a LaTeX source file.

    Only ``on_latexdoc`` produces output so far (the S-expression of the
    node); every other ``on_*`` handler is an empty stub.

    NOTE(review): several handlers (``on_blockenv``, ``on_parblock``,
    ``on_inlineenv``, ``on_beginenv``, ``on_endenv``) are named after rules
    that do not occur in the LaTeXGrammar docstring -- presumably left over
    from an earlier grammar revision; confirm.
    """

    def __init__(self, grammar_name="LaTeX", grammar_source=""):
        """Initialize the base compiler and sanity-check ``grammar_name``.

        Args:
            grammar_name: name of the grammar; must begin with at least one
                word character and contain nothing else up to its end.
            grammar_source: passed through unchanged to the base class.
        """
        super().__init__(grammar_name, grammar_source)
        # Raw string, so that ``\w`` and ``\Z`` reach the regex engine
        # without relying on Python keeping unknown string escapes intact.
        assert re.match(r'\w+\Z', grammar_name)

    def on_latexdoc(self, node):
        """Return the S-expression serialization of the document node."""
        return node.as_sexpr()

    # --- handlers below are placeholders that do nothing yet ---------------

    def on_preamble(self, node):
        pass

    def on_document(self, node):
        pass

    def on_blockenv(self, node):
        pass

    def on_parblock(self, node):
        pass

    def on_sequence(self, node):
        pass

    def on_paragraph(self, node):
        pass

    def on_inlineenv(self, node):
        pass

    def on_beginenv(self, node):
        pass

    def on_endenv(self, node):
        pass

    def on_command(self, node):
        pass

    def on_config(self, node):
        pass

    def on_block(self, node):
        pass

    def on_text(self, node):
        pass

    def on_cfgtext(self, node):
        pass

    def on_word_sequence(self, node):
        pass

    def on_blockcmd(self, node):
        pass

    def on_CMDNAME(self, node):
        pass

    def on_NAME(self, node):
        pass

    def on_ESCAPED(self, node):
        pass

    def on_BRACKETS(self, node):
        pass

    def on_TEXTCHUNK(self, node):
        pass

    def on_WSPC(self, node):
        pass

    def on_LF(self, node):
        pass

    def on_PARSEP(self, node):
        pass

    def on_EOF(self, node):
        pass


def get_compiler(grammar_name="LaTeX", grammar_source="") -> LaTeXCompiler:
    """Return the shared ``LaTeXCompiler``, creating it on first use.

    An already existing instance (kept in the module-level global
    ``thread_local_LaTeX_compiler_singleton``) is re-targeted with
    ``set_grammar_name(grammar_name, grammar_source)``; otherwise a new
    instance is constructed with the same two arguments.
    """
    global thread_local_LaTeX_compiler_singleton
    try:
        thread_local_LaTeX_compiler_singleton.set_grammar_name(grammar_name, grammar_source)
    except NameError:
        # Global still unbound: this is the first call, so build the compiler.
        thread_local_LaTeX_compiler_singleton = \
            LaTeXCompiler(grammar_name, grammar_source)
    return thread_local_LaTeX_compiler_singleton


#######################################################################
#
# END OF DHPARSER-SECTIONS
#
#######################################################################


def compile_src(source):
    """Compiles ``source`` and returns (result, errors, ast).

    The scanner, grammar, transformer and compiler obtained from this
    module's ``get_*`` functions are passed to ``compile_source``, which is
    run inside a ``logging("LOGS")`` context.

    Args:
        source: the argument handed on to ``compile_source``; the script
            entry point passes the first command line argument.

    Returns:
        Whatever ``compile_source`` returns; callers unpack it as
        ``(result, errors, ast)``.
    """
    with logging("LOGS"):
        # Fetch the compiler first to keep the original instantiation order.
        compiler = get_compiler()
        result = compile_source(source, get_scanner(),
                                get_grammar(),
                                get_transformer(), compiler)
    return result


# Script entry point: compile the file named on the command line.
if __name__ == "__main__":
    if len(sys.argv) <= 1:
        print("Usage: LaTeXCompiler.py [FILENAME]")
    else:
        result, errors, ast = compile_src(sys.argv[1])
        if not errors:
            print(result)
        else:
            # Report every error, then signal failure through the exit status.
            for error in errors:
                print(error)
            sys.exit(1)