2.12.2021, 9:00 - 11:00: Due to updates GitLab may be unavailable for some minutes between 09:00 and 11:00.

Commit cfbc805b authored by Eckhart Arnold's avatar Eckhart Arnold
Browse files

initial commit

parents
__pycache__/
*.pyc
.idea/
.cache/
*_parser.py
*_compiler.py
CommonMark/
*.ast
*.st
*.bcf
*.aux
*.toc
testdata/*.log
testdata/*.pdf
*~
*.old
DEBUG*
external_resources/
Copyright 2017 Bavarian Academy of Sciences and Humanities
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
This diff is collapsed.
PyDSL
=====
Author: Eckhart Arnold, Bavarian Academy of Sciences
Email: arnold@badw.de
License
-------
PyDSL is open source software under the [MIT License](https://opensource.org/licenses/MIT)
Description
-----------
A parser combinator based parsing and compiling infrastructure for domain
specific languages (DSL) in python.
Primary use case: A DSL for the "Mittellateinische Wörterbuch"!
..to be continued
\ No newline at end of file
* finally split ParserCombinators.py into different modules, like:
SyntaxTree, ErrorMessages, ParserCombinators, ASTTransform, EBNFCompiler, DSLCompiler
\ No newline at end of file
expression = term { ("+" | "-") term} ;
term = factor { ("*"|"/") factor} ;
factor = constant | variable | "(" expression ")" ;
variable = "x" | "y" | "z" ;
constant = digit {digit} ;
digit = "0" | "1" | "..." | "9" ;
test = digit constant variable ;
Arithmetic
==========
This is a grammar for simple arithmetic calculations, a standard textbook
example for EBNF-Grammars
# EBNF-Grammar in EBNF
@ comment = /#.*(?:\n|$)/ # comments start with '#' and eat all chars upto and including '\n'
@ whitespace = /\s*/ # whitespace includes linefeed
@ literalws = right # trailing whitespace of literals will be ignored tacitly
syntax = [~//] { definition | directive } §EOF
definition = symbol §"=" expression
directive = "@" §symbol §"=" ( regexp | literal | list_ )
expression = term { "|" term }
term = factor { factor }
factor = [flowmarker] [retrieveop] symbol !"=" # negative lookahead to be sure it's not a definition
| [flowmarker] literal
| [flowmarker] regexp
| [flowmarker] group
| [flowmarker] oneormore
| repetition
| option
flowmarker = "!" | "&" | "§" | # '!' negative lookahead, '&' positive lookahead, '§' required
"-!" | "-&" # '-' negative lookbehind, '-&' positive lookbehind
retrieveop = "::" | ":" # '::' pop, ':' retrieve
group = "(" expression §")"
option = "[" expression §"]"
repetition = "{" expression §"}"
oneormore = "<" expression §">"
symbol = /(?!\d)\w+/~ # e.g. expression, factor, parameter_list
literal = /"(?:[^"]|\\")*?"/~ # e.g. "(", '+', 'while'
| /'(?:[^']|\\')*?'/~ # whitespace surrounding literals will be ignored tacitly.
regexp = /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~ # e.g. /\w+/, ~/#.*(?:\n|$)/~
# '~' is a whitespace-marker, if present leading or trailing
# whitespace of a regular expression will be ignored tacitly.
list_ = /\w+\s*(?:,\s*\w+\s*)*/~ # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
# BEGIN_QUOTE, END_QUOTE ; see markdown.py for an example
EOF = !/./
EBNF
====
This directory contains one or more variants of grammars for the Extended
Backus-Naur-Form (EBNF). The file named 'EBNF.ebnf' should always be the grammar
for the stock EBNF-compiler of `ParserCombinators`.
# latex Grammar
@ whitespace := /[ \t]*\n?(?!\s*\n)[ \t]*/
@ comment := /%.*(?:\n|$)/
genericenv := beginenv sequence endenv
beginenv := "\begin" §( "{" name "}" )
endenv := "\end" §( "{" @name "}" )
name := ~/\w+/
genericcmd := command [ config ] block
command := /\\\w+/
config := "[" cfgtext §"]"
sequence := { partext | parblock }
parblock := "{" { partext | parblock } §"}"
block := "{" { text | block } §"}"
partext := text | par
text := cfgtext | brackets
cfgtext := chunk | wspc | escaped
escaped := /\\[%$&]/
brackets := /[\[\]]/ # left and right square brackets: [ ]
chunk := /[^\\%$&\{\}\[\]\s\n]+/ # some piece of text excluding whitespace,
# linefeed and special characters
wspc := /[ \t]*\n?(?!\s*\n)[ \t]*/ # whitespace, including at most one linefeed
lf := /[ \t]*\n(?!\s*\n)/ # a linefeed, but not an empty line (i.e. par)
par := /\s*\n\s*\n/ # at least one empty line, i.e.
# [whitespace] linefeed [whitespace] linefeed
LaTeX
=====
This is going to be a compiler for a subset of LaTeX. TeX/LaTeX in
general is not a context-free language (ADD REFERENCE HERE).
Thus, only a subset.
# EBNF-Syntax von MLW-Einträgen (Versuch!!!)
@literalws = none # no implicit whitespace around literals!
Lemma = [Wortart] [Markierung] Eintrag
Wortart = (ABKUERZUNG | BUCHSTABENFOLGE) LEERZEICHEN
Markierung = "*"
Eintrag = Wort ["," Flexion] "." [Hinweis] LEERRAUM Bedeutung
Wort = BUCHSTABENFOLGE
Flexion = ALLES_BIS_PUNKT
Hinweis = LEERRAUM
(BEDEUTUNGSABSCHNITT !BEDEUTUNGSTRENNER !BEDEUTUNGSENDE)
Bedeutung = TEXTBLOCK { LEERRAUM TEXTBLOCK }
TEXTBLOCK = < !LEERRAUM (BUCHSTABENFOLGE | ZEICHEN) >
BEDEUTUNGSABSCHNITT = < !LEERRAUM !BEDEUTUNGSTRENNER !BEDEUTUNGSENDE
(BUCHSTABENFOLGE | ZEICHEN) >
ALLES_BIS_PUNKT = /[^.]+/
NUMMER = /[0-9]+/
BUCHSTABENFOLGE = /\w+/
ABKUERZUNG = /\w+\./
LEERZEICHEN = /\s/
LEERRAUM = / /
BEDEUTUNGSTRENNER = / - /
BEDEUTUNGSENDE = /: /
ZEICHEN = /./
Mittellateinisches Wörterbuch
=============================
This directory contains the components for two kinds of DSLs for the
[Medieval Latin Dictionary](https://www.mlw.badw.de) of the
[Bavarian Academy of Sciences](https://badw.de). The two kinds of DSLs are:
1. MLW: A domain specific language for writing dictionary entries
for the "Mittellateinisches Wörterbuch". The DSL texts will be converted
to an XML data model.
2. MLW_RETRO: A grammar for the retrodigitalisation of existing dictionary
entries in ASCII-Format. This is very experimental...
# EBNF-Grammar in EBNF
@ comment = /#.*(?:\n|$)/ # comments start with '#' and eat all chars upto and including '\n'
@ whitespace = /\s*/ # whitespace includes linefeed
@ literalws = right # trailing whitespace of literals will be ignored tacitly
syntax = [~//] { definition | directive } §EOF
definition = symbol §"=" expression
directive = "@" §symbol §"=" ( regexp | literal | list_ )
expression = term { "|" term }
term = factor { factor }
factor = [flowmarker] [retrieveop] symbol !"=" # negative lookahead to be sure it's not a definition
| [flowmarker] literal
| [flowmarker] regexp
| [flowmarker] group
| [flowmarker] oneormore
| repetition
| option
flowmarker = "!" | "&" | "§" | # '!' negative lookahead, '&' positive lookahead, '§' required
"-!" | "-&" # '-' negative lookbehind, '-&' positive lookbehind
retrieveop = "::" | ":" # '::' pop, ':' retrieve
group = "(" expression §")"
option = "[" expression §"]"
repetition = "{" expression §"}"
oneormore = "<" expression §">"
symbol = /\w+/~ # e.g. expression, factor, parameter_list
literal = /"(?:[^"]|\\")*?"/~ # e.g. "(", '+', 'while'
| /'(?:[^']|\\')*?'/~ # whitespace surrounding literals will be ignored tacitly.
regexp = /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~ # e.g. /\w+/, ~/#.*(?:\n|$)/~
# '~' is a whitespace-marker, if present leading or trailing
# whitespace of a regular expression will be ignored tacitly.
list_ = /\w+\s*(?:,\s*\w+\s*)*/~ # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
# BEGIN_QUOTE, END_QUOTE ; see markdown.py for an example
EOF = !/./
#!/usr/bin/python3
"""LatexParser.py - Parses the Erweiterte Backus-Naur-Form
Copyright 2016 by Eckhart Arnold
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
This module defines a parser for the Erweiterte Backus-Naur-Form.
"""
# TODO: Document all the code!
import re
from ParserCombinators import Alternative, Forward, RE, Sequence, Token, Required, \
ZeroOrMore, replace_by_single_child, remove_enclosing_delimiters, remove_whitespace, strip_assignment, \
reduce_single_child, flatten_sameop_sequence, flatten_repetition, CompilerBase, full_compilation, \
error_messages
class EBNFGrammar:
    """Hand-written parser-combinator grammar for a classic EBNF dialect
    in which each production is terminated by a dot, e.g.::

        expression = term { ("+" | "-") term } .

    The class attribute ``syntax`` is the root parser (used elsewhere as
    ``EBNFGrammar.syntax``).
    """
    # `expression` is referenced inside `group`, `repetition` and `option`
    # before it is defined, so a Forward declaration closes the cycle.
    expression = Forward()
    # regular expression literal: either '~/.../~' (whitespace markers) or a
    # plain '/.../'; escaped slashes '\/' are allowed inside
    regexp = Alternative("regexp", RE('~/(?:[^/]|(?<=\\\\)/)*/~'), RE('/(?:[^/]|(?<=\\\\)/)*/'))
    # string literal in double or single quotes; an optional leading '§'
    # marks it as mandatory (compiled to Required(...) by EBNFCompiler)
    literal = Alternative("literal", RE('§?"(?:[^"]|\\\\")*"'), RE("§?'(?:[^']|\\\\')*'"))
    symbol = RE('\\w+', "symbol")  # a plain identifier
    group = Sequence("group", Token("("), expression, Required(Token(")")))
    repetition = Sequence("repetition", Token("{"), expression, Required(Token("}")))
    option = Sequence("option", Token("["), expression, Required(Token("]")))
    factor = Alternative("factor", symbol, literal, regexp, option, repetition, group)
    # a term is an implicit sequence of one or more factors
    term = Sequence("term", factor, ZeroOrMore(None, factor))
    # an expression is one or more terms separated by '|'
    expression.set(Sequence("expression", term, ZeroOrMore(None, Sequence(None, Token("|"), term))))
    # a production: symbol "=" expression "."
    production = Sequence("production", symbol, Required(Token("=")), expression, Required(Token(".")))
    # root parser: any number of productions
    syntax = ZeroOrMore("syntax", production)
# AST transformations for the EBNF grammar: maps node (parser) names to a
# transformation function -- or a list of functions applied in order -- that
# normalizes the concrete syntax tree into an abstract syntax tree before
# it is handed to EBNFCompiler.
EBNFTransTable = {
    # AST Transformations for EBNF-Grammar
    "production": strip_assignment,
    "expression": flatten_sameop_sequence,
    "term": flatten_repetition,
    "factor": replace_by_single_child,
    # delimiters like '(' ')' carry no semantic information
    "group": [remove_enclosing_delimiters, replace_by_single_child],
    "repetition": remove_enclosing_delimiters,
    "option": remove_enclosing_delimiters,
    "symbol": remove_whitespace,
    "Token": remove_whitespace,
    "RE": remove_whitespace,
    "literal": [reduce_single_child, remove_whitespace],
    "regexp": [reduce_single_child, remove_whitespace],
    # fallback entry for all node types not listed explicitly
    "": [remove_whitespace, replace_by_single_child]
}
class EBNFCompiler(CompilerBase):
    """Generates a Parser from an abstract syntax tree of a grammar specified
    in EBNF-Notation.

    Compiling the root node (method ``syntax``) yields Python source code
    for a class named ``grammar_name`` whose body defines one parser
    combinator per production of the grammar.
    """

    def __init__(self, grammar_name="Grammar"):
        """Initializes the compiler.

        Args:
            grammar_name: name of the generated parser class; must be a
                valid Python identifier.
        """
        super(EBNFCompiler, self).__init__()
        # raw string: a plain '\w' in a non-raw literal is an invalid
        # escape sequence (DeprecationWarning/SyntaxWarning on modern Python)
        assert re.match(r'\w+\Z', grammar_name)
        self.grammar_name = grammar_name
        self.rules = set()      # names of symbols for which a production exists
        self.symbols = set()    # all symbol nodes seen on right-hand sides
        # quoted name of the production currently being compiled; the string
        # "None" (str(None)) serves as the "no component" sentinel
        self.component = str(None)
        self.recursive = set()  # symbols that need a Forward() declaration

    def syntax(self, node):
        """Compiles the root node and returns the generated parser class as
        Python source code (a string)."""
        productions = []
        for nd in node.result:
            productions += [self.compile__(nd)]
        # the first production in source order becomes the root parser
        root = productions[0][0] if productions else ""
        # emit productions in reverse order so that definitions precede
        # their uses in the generated class body
        productions.reverse()
        declarations = ['class ' + self.grammar_name + ':']
        declarations += [symbol + ' = Forward()' for symbol in self.recursive]
        for symbol, statement in productions:
            if symbol in self.recursive:
                declarations += [symbol + '.set(' + statement + ')']
            else:
                declarations += [symbol + ' = ' + statement]
        for nd in self.symbols:
            if nd.result not in self.rules:
                nd.add_error("Missing production for symbol '%s'" % nd.result)
        # NOTE(review): self.symbols holds node objects, so this string
        # membership test is presumably always true unless nodes compare
        # equal to strings -- confirm against the node class's __eq__.
        if root and '_root' not in self.symbols:
            declarations.append('_root = ' + root)
        return '\n '.join(declarations)

    def production(self, node):
        """Compiles one production; returns (rule name, parser expression)."""
        rule = node.result[0].result
        # pass the quoted rule name down to the outermost parser of the
        # right-hand side via self.component
        self.component = '"' + rule + '"'
        self.rules.add(rule)
        prod = self.compile__(node.result[1])
        return (rule, prod)

    def _non_terminal(self, node, kind):
        """Compiles any non-terminal, where `kind` indicates the Parser class
        name for the particular non-terminal.
        """
        # consume the component name: only the outermost parser of a
        # production receives it; nested parsers get the "None" sentinel
        comp = self.component
        self.component = str(None)
        arguments = [comp] + [self.compile__(r) for r in node.result]
        return kind + '(' + ', '.join(arguments) + ')'

    def expression(self, node):
        return self._non_terminal(node, 'Alternative')

    def term(self, node):
        return self._non_terminal(node, 'Sequence')

    def option(self, node):
        return self._non_terminal(node, 'Option')

    def repetition(self, node):
        return self._non_terminal(node, 'ZeroOrMore')

    def symbol(self, node):
        """Records the symbol for the missing-production check; marks it as
        recursive if its production has already been compiled (declarations
        are emitted in reverse order, so such symbols need a Forward())."""
        self.symbols.add(node)
        if node.result in self.rules:
            self.recursive.add(node.result)
        return node.result

    def _re_variant(self, node, kind):
        """Compiles some variant of a regular expression, where `kind`
        indicates the variant, i.e. the classname of the Parser like
        `Token` or `Re`.
        """
        comp = [self.component] if self.component != str(None) else []
        self.component = str(None)
        # strip a leading whitespace marker '~', if present
        arg = node.result[1:] if node.result[0:1] == "~" else node.result
        return kind + '(' + ', '.join([arg] + comp) + ')'

    def literal(self, node):
        """Compiles a string literal; a leading '§' makes it mandatory, i.e.
        wraps the Token in Required()."""
        comp = [self.component] if self.component != str(None) else []
        self.component = str(None)
        if node.result[0] == '§':
            return 'Required(Token(' + ', '.join([node.result[1:]] + comp) + '))'
        else:
            return 'Token(' + ', '.join([node.result] + comp) + ')'

    def regexp(self, node):
        """Compiles a regular expression literal: '~/.../~' becomes an RE
        parser (whitespace-tolerant), plain '/.../' a RegExp parser."""
        comp = [self.component] if self.component != str(None) else []
        self.component = str(None)
        if (node.result[:2], node.result[-2:]) == ('~/', '/~'):
            arg = repr(node.result[2:-2].replace(r'\/', '/'))
            return 'RE(' + ', '.join([arg] + comp) + ')'
        else:
            comp = comp or [str(None)]
            arg = repr(node.result[1:-1].replace(r'\/', '/'))
            return 'RegExp(' + ', '.join(comp + [arg]) + ')'
def load_if_file(text_or_file):
    """Reads and returns content of a file if parameter `text_or_file` is a
    file name (i.e. a single line string), otherwise (i.e. if `text_or_file` is
    a multiline string) returns the content of `text_or_file`.
    """
    # BUGFIX: the original condition was inverted -- it tried to *open*
    # strings that contain a linefeed (which cannot be file names) and
    # returned single-line file names verbatim, contradicting the docstring.
    if '\n' in text_or_file:
        # a multiline string is the content itself
        return text_or_file
    with open(text_or_file) as f:
        return f.read()
class Error(Exception):
    """Common base class of all exceptions raised by this module."""
class GrammarError(Error):
    """Signals that the grammar definition of a domain specific language
    (DSL) is itself faulty.

    Attributes:
        grammar_src:     source text of the offending grammar
        error_messages:  the error messages produced while compiling it
    """

    def __init__(self, grammar_src, error_messages):
        self.error_messages = error_messages
        self.grammar_src = grammar_src
class CompileError(Error):
    """Signals that a string or file written in a domain specific language
    (DSL) could not be compiled because it contains errors.

    Attributes:
        dsl_text:        the faulty DSL source
        dsl_grammar:     the grammar the source was checked against
        error_messages:  the error messages produced during compilation
    """

    def __init__(self, dsl_text, dsl_grammar, error_messages):
        self.error_messages = error_messages
        self.dsl_grammar = dsl_grammar
        self.dsl_text = dsl_text
def compileDSL(text_or_file, dsl_grammar, trans_table, compiler):
    """Compiles a text in a domain specific language (DSL) with an
    EBNF-specified grammar. Returns the compiled text.

    Parameters:
        text_or_file:  the DSL text itself, or the name of a file containing it
        dsl_grammar:   the EBNF grammar of the DSL (text or file name)
        trans_table:   AST transformation table for the DSL
        compiler:      a CompilerBase instance that compiles the DSL's AST

    Raises:
        GrammarError:  if the EBNF grammar itself contains errors
        CompileError:  if the DSL text does not conform to the grammar
    """
    assert isinstance(text_or_file, str)
    assert isinstance(dsl_grammar, str)
    assert isinstance(compiler, CompilerBase)
    assert isinstance(trans_table, dict)
    # read grammar
    grammar_src = load_if_file(dsl_grammar)
    # stage 1: compile the EBNF grammar into Python parser source code
    parser_py, errors, AST = full_compilation(grammar_src, EBNFGrammar.syntax,
                                              EBNFTransTable, EBNFCompiler())
    if errors: raise GrammarError(grammar_src, error_messages(grammar_src, errors))
    # stage 2: execute the generated code to obtain the parser class
    # (the code is produced locally by EBNFCompiler, not untrusted input)
    code = compile(parser_py, '<string>', 'exec')
    name_space = {}
    exec(code, name_space)
    parser = name_space['Grammar']
    # stage 3: compile the DSL text with the freshly built parser
    # NOTE(review): EBNFCompiler.syntax emits the start parser as '_root',
    # but 'parser.root' is accessed here -- verify which name is intended.
    src = load_if_file(text_or_file)
    result, errors, AST = full_compilation(src, parser.root, trans_table, compiler)
    if errors: raise CompileError(src, grammar_src, error_messages(src, errors))
    return result
if __name__ == "__main__":
    # sanity check: rendering the grammar as a string is deterministic
    assert (str(EBNFGrammar.syntax) == str(EBNFGrammar.syntax))

    def test(file_name):
        """Compiles the given EBNF test file from testdata/ and prints the
        errors, the syntax tree and the generated parser code."""
        print(file_name)
        with open('testdata/' + file_name) as f:
            text = f.read()
        result, errors, syntax_tree = full_compilation(text, EBNFGrammar.syntax,
                                                       EBNFTransTable, EBNFCompiler())
        print(errors)
        print(syntax_tree.as_sexpr())
        print(result)
        return result

    print(EBNFGrammar.syntax)
    test('arithmetic.ebnf')
    test('ebnf_1.ebnf')
    # test('ebnf_modern.ebnf')
    # code = test('left_recursion.ebnf')
    # exec(code)
    # result = parse("1 + 2 - 3 * 5 .", formula)
    # print(result.as_sexpr())
    # print(result.collect_errors())
    # print(result)
#!/usr/bin/python3
"""LatexParser.py - Parser for LaTeX documents
Copyright 2016 by Eckhart Arnold
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Module latex_parser defines a parser for LaTeX files based on parser
combinators (see module parser_combinators).
"""
import os
import sys
sys.path.append(os.path.abspath('../'))
from ParserCombinators import Alternative, Forward, Optional, RegExp, \
Sequence, Token, Required, ZeroOrMore, parse
# Parser combinators for a subset of LaTeX.
# `doc` and `configDoc` are mutually recursive with the parsers below,
# hence the Forward declarations, resolved via .set() at the end.
doc = Forward()
configDoc = Forward()

# running text: anything except backslash, braces, '%' and empty lines
text = RegExp("text", r'(?:[^\\\{\}%\n]|\n(?!\n))+')
# text inside a [...] configuration block (square brackets excluded as well)
configText = RegExp("configText", r'[^\\\{\}\[\]%]+')
# brace-delimited group: '{' doc '}'
group = Sequence("group", Token('{'), doc, Required(Token('}')))
# bracket-delimited configuration: '[' doc ']'
config = Sequence("config", Token('['), doc, Required(Token(']')))
# one or more comment lines starting with '%'
comment = RegExp("comment", r'( *%.*\n)+')
# paragraph gap: one or more empty lines
gap = RegExp("gap", r'\n\n+')
# a command token like '\section'; '\begin' and '\end' are excluded because
# they are handled by the environment parsers below
commandToken = RegExp("commandToken", r'\\\\?(?!(end\W)|(begin\W))[^\\\{\}\[\]%\s]*') # r'\\\\?(?!end )[^\\\{\}\[\]%\s]*'
commandConfig = Sequence("commandConfig", Optional(None, comment), config)
commandGroup = Sequence("commandGroup", Optional(None, comment), group)
# a command: token, optional [config] head, any number of {group} tails
command = Sequence("command",
                   commandToken,
                   Optional("head", commandConfig),
                   ZeroOrMore("tail", commandGroup))
# a bare brace group acting as an anonymous environment
anonymousEnv = Sequence("anonymousEnv", Token('{'), doc, Required(Token('}')))
# '\begin{name}' possibly followed by argument groups
envBegin = Sequence("envBegin",
                    RegExp("head", r'\\begin\{\w+\}'),
                    ZeroOrMore("tail", commandGroup))
envEnd = RegExp("envEnd", r'\\end\{\w+\}')
# NOTE(review): the name in envEnd is not checked against envBegin's name
environment = Sequence("environment", envBegin, doc, Required(envEnd))
configDoc.set(ZeroOrMore("configDoc", Alternative(None, command, group, configText)))
# root parser for a LaTeX document body
doc.set(ZeroOrMore("doc", Alternative(None, environment, anonymousEnv, command, comment, text, gap)))
if __name__ == "__main__":

    def test(file_name):
        """Parses `file_name` and checks that serializing the parse result
        reproduces the input exactly (unless errors were collected)."""
        print(file_name)
        with open(file_name) as f:
            latex_file = f.read()
        result = parse(latex_file, doc)
        # round-trip property: str(result) must equal the original source
        assert str(result) == latex_file or result.collect_errors()
        return result

    # sanity check: rendering the grammar as a string is deterministic
    assert str(doc) == str(doc)
    print(doc)
    result = test('testdata/testdoc1.tex')
    # result = test('testdata/testdoc2.tex')
    # result = test('testdata/testerror.tex')
    print(result.as_sexpr())
    print(result.collect_errors())
# EBNF-Syntax von MLW-Einträgen (Versuch!!!)
lemma = "LEMMA" wort ["," flexion] erklaerung
flexion = endung { "," endung | " " genus "." }
endung = [MINUS] KLEIBUCHSTABEN
genus = KLEINBUCHSTABE
erklaerung = "ERKLÄRUNG" [interpretament] { bedeutung }
bedeutung = "BEDEUTUNG" (interpretament | kategorie) { beleg }
interpretament = lateinisch "---" deutsch
lateinisch = WORT { !"---" (WORT | SONDERZEICHEN | LEERRAUM) }
deutsch = WORT { WORT | SONDERZEICHEN | LEERRAUM }
kategorie = WORT { WORT | SONDERZEICHEN | LEERRAUM }
beleg = "*" { TEXT | erlaeuterung }
erlaeuterung = "{" TEXT "}"
TEXT = WORT { WORT | SONDERZEICHEN | LEERRAUM | ZAHL }
LEERRAUM = /\s+/
KLEINBUCHSTABE = /[a-z]/