Commit 1e552471 authored by Eckhart Arnold

- renamed AST_transform to "traverse": TODO: add semantic analysis phase!!!

parent b4daffdc
@@ -26,10 +26,9 @@ try:
except ImportError:
import re
from toolkit import load_if_file, escape_re, md5
from toolkit import load_if_file, escape_re, md5, sane_parser_name
from parsercombinators import GrammarBase, mixin_comment, Forward, RE, NegativeLookahead, \
Alternative, Sequence, Optional, Required, OneOrMore, ZeroOrMore, Token, CompilerBase, \
sane_parser_name
Alternative, Sequence, Optional, Required, OneOrMore, ZeroOrMore, Token, CompilerBase
from syntaxtree import *
from version import __version__
@@ -112,7 +111,6 @@ class EBNFGrammar(GrammarBase):
root__ = syntax
# TODO: Add some sanity checks to Transformations, e.g. "Required(Optional(..." should yield an error.
EBNFTransTable = {
# AST Transformations for EBNF-grammar
"syntax":
......
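For orientation: a transformation table of this kind maps parser names
(or several comma-separated names) to one or several transformation
functions. A minimal sketch, using helpers that appear elsewhere in
this commit; the rule names are made up for illustration:

```python
# Sketch only: "expression", "term" and "factor" are illustrative rule
# names, not necessarily those of EBNF.ebnf.
from syntaxtree import replace_by_single_child, reduce_single_child

SketchTransTable = {
    # a single function ...
    "expression": replace_by_single_child,
    # ... or a sequence of functions, applied in order; the
    # comma-separated key is split up by toolkit.expand_table
    "term, factor": [replace_by_single_child, reduce_single_child],
}
```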
@@ -1388,7 +1388,7 @@ def full_compilation(source, grammar_base, AST_transformations, compiler):
grammar_base (GrammarBase): The GrammarBase object
AST_transformations (dict): The transformation-table that
assigns AST transformation functions to parser names (see
function ASTTransform)
function traverse)
compiler (object): An instance of a class derived from
``CompilerBase`` with a suitable method for every parser
name or class.
......
@@ -18,33 +18,49 @@ DHParser is open source software under the [MIT License](https://opensource.org/
Purpose
-------
Domain-specific languages are widespread in computer science, but
seem to be underused in the Digital Humanities. While DSLs are
sometimes introduced to Digital Humanities projects as a
[practical ad-hoc solution][Müller_2016], these solutions are often
somewhat "quick and dirty". In other words, they are more of a hack
than a technology. The purpose of DHParser is to introduce
[DSLs as a technology][Arnold_2016] to the Digital Humanities. It is
based on the well-known technology of [EBNF][ISO_IEC_14977]-based
parser generators, but employs the more modern form called
"[parsing expression grammar][Ford_2004]" and
[parser combinators][Ford_20XX] as a variant of the classical
recursive descent parser.
Why another parser generator? There are plenty of good parser
generators out there,
e.g. [Añez's grako parser generator][Añez_2017]. However, DHParser is
intended as a tool that is specifically geared towards digital
humanities applications, while most existing parser generators come
from compiler construction toolkits for programming languages. Also,
DHParser shall (in the future) serve as a teaching tool, which
influences some of its design decisions, such as clearly separating
the parsing, syntax-tree-transformation and compilation stages.
DHParser is also intended as a tool to experiment with. One possible
research area is how grammars that are not
[context-free](https://en.wikipedia.org/wiki/Context-free_grammar),
such as the grammars of [TeX][tex_stackexchange_no_bnf] or
[CommonMark][MacFarlane_et_al_2017], can be described with declarative
languages in the spirit of, but beyond, EBNF, and what extensions of
the parsing technology are necessary to capture such languages.
Primary use case at the Bavarian Academy of Sciences and Humanities
(for the time being): A DSL for the
"[Mittellateinische Wörterbuch](http://www.mlw.badw.de/)"!
Further (intended) use cases are:
* LaTeX -> XML/HTML conversion. See this
[discussion on why an EBNF-parser for the complete TeX/LaTeX-grammar][tex_stackexchange_no_bnf]
is not possible.
* [CommonMark][MacFarlane_et_al_2017] and other DSLs for cross-media
  publishing of scientific literature, e.g. journal articles.
  (CommonMark and Markdown also go beyond what is feasible with pure
  EBNF-based parsers.)
* EBNF itself. DHParser is already self-hosting ;-)
* Digital and cross-media editions
* Digital dictionaries
@@ -55,28 +71,35 @@ Description
... coming soon ;-)
For a simple self-test, run `dhparser.py` from the command line. This
compiles the EBNF grammar in `examples/EBNF/EBNF.ebnf` and outputs the
Python-based parser class representing that grammar. The concrete and
abstract syntax trees as well as a full and an abbreviated log of the
parsing process will be stored in a sub-directory named "DEBUG".
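For scripted use, the equivalent of the command-line self-test (a
sketch; it only assumes that `dhparser.py` is run from the repository
root with no arguments, as described above):

```python
# Run the self-test as a subprocess.
import subprocess
subprocess.run(['python', 'dhparser.py'], check=True)
# The CST, AST and parsing logs land in the "DEBUG" sub-directory.
```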
References
----------
Juancarlo Añez: grako, a PEG parser generator in Python, 2017. URL:
[bitbucket.org/apalala/grako][Añez_2017]
[Añez_2017]: https://bitbucket.org/apalala/grako
Eckhart Arnold: Domänenspezifische Notationen. Eine (noch)
unterschätzte Technologie in den Digitalen Geisteswissenschaften,
Präsentation auf dem
[dhmuc-Workshop: Digitale Editionen und Auszeichnungssprachen](https://dhmuc.hypotheses.org/workshop-digitale-editionen-und-auszeichnungssprachen),
München 2016. Short-URL: [tiny.badw.de/2JVT][Arnold_2016]
[Arnold_2016]: https://f.hypotheses.org/wp-content/blogs.dir/1856/files/2016/12/EA_Pr%C3%A4sentation_Auszeichnungssprachen.pdf
Brian Ford: Parsing Expression Grammars: A Recognition-Based Syntactic
Foundation, Cambridge, Massachusetts, 2004. Short-URL:
[http://t1p.de/jihs][Ford_2004]
[Ford_2004]: https://pdos.csail.mit.edu/~baford/packrat/popl04/peg-popl04.pdf
@@ -91,14 +114,16 @@ Berlin Heidelberg 2008.
Dominikus Herzberg: Objekt-orientierte Parser-Kombinatoren in Python,
blog post, September 18th, 2008, on denkspuren. gedanken, ideen,
anregungen und links rund um informatik-themen, short-URL:
[http://t1p.de/bm3k][Herzberg_2008a]
[Herzberg_2008a]: http://denkspuren.blogspot.de/2008/09/objekt-orientierte-parser-kombinatoren.html
Dominikus Herzberg: Eine einfache Grammatik für LaTeX, blog post,
September 18th, 2008, on denkspuren. gedanken, ideen, anregungen und
links rund um informatik-themen, short-URL:
[http://t1p.de/7jzh][Herzberg_2008b]
[Herzberg_2008b]: http://denkspuren.blogspot.de/2008/09/eine-einfache-grammatik-fr-latex.html
@@ -113,15 +138,18 @@ informatik-themen, short-URL: [http://t1p.de/s0zk][Herzberg_2007]
[ISO_IEC_14977]: http://www.cl.cam.ac.uk/~mgk25/iso-14977.pdf
John MacFarlane, David Greenspan, Vicent Marti, Neil Williams,
Benjamin Dumke-von der Ehe, Jeff Atwood: CommonMark. A strongly
defined, highly compatible specification of
Markdown, 2017. [commonmark.org][MacFarlane_et_al_2017]
[MacFarlane_et_al_2017]: http://commonmark.org/
Stefan Müller: DSLs in den digitalen Geisteswissenschaften,
Präsentation auf dem
[dhmuc-Workshop: Digitale Editionen und Auszeichnungssprachen](https://dhmuc.hypotheses.org/workshop-digitale-editionen-und-auszeichnungssprachen),
München 2016. Short-URL: [tiny.badw.de/2JVy][Müller_2016]
[Müller_2016]: https://f.hypotheses.org/wp-content/blogs.dir/1856/files/2016/12/Mueller_Anzeichnung_10_Vortrag_M%C3%BCnchen.pdf
......
* split ParserCombinators.py into different modules, like:
SyntaxTree, ErrorMessages, ParserCombinators, ASTTransform, EBNFCompiler, DSLCompiler
* smarter handling of whitespace parameters in RE and Token (not important...)
* add warning when overlapping groups in REs occur! (should possibly change whitespace
handling or REs!?)
@@ -1419,7 +1419,7 @@ def full_compilation(source, grammar_base, AST_transformations, compiler):
source (str): the input source for compilation
grammar_base (GrammarBase): the GrammarBase object
AST_transformations (dict): a table that assigns AST transformation
functions to parser names (see function ASTTransform)
functions to parser names (see function traverse)
compiler (object): an instance of a class derived from `CompilerBase`
with a suitable method for every parser name or class.
Returns:
......
@@ -59,9 +59,9 @@ try:
except ImportError:
import re
from toolkit import IS_LOGGING, LOGS_DIR, escape_re
from toolkit import IS_LOGGING, LOGS_DIR, escape_re, sane_parser_name
from syntaxtree import WHITESPACE_KEYWORD, TOKEN_KEYWORD, ZOMBIE_PARSER, Node, \
error_messages, ASTTransform
error_messages, traverse
__all__ = ['HistoryRecord',
@@ -95,7 +95,6 @@ __all__ = ['HistoryRecord',
'Pop',
'Forward',
'PARSER_SYMBOLS',
'sane_parser_name',
'CompilerBase',
'full_compilation',
'COMPILER_SYMBOLS']
@@ -843,13 +842,6 @@ PARSER_SYMBOLS = {'RegExp', 'mixin_comment', 'RE', 'Token', 'Required',
#######################################################################
def sane_parser_name(name):
"""Checks whether given name is an acceptable parser name. Parser names
must not be preceeded or succeeded by a double underscore '__'!
"""
return name and name[:2] != '__' and name[-2:] != '__'
class CompilerBase:
def compile__(self, node):
comp, cls = node.parser.name, node.parser.__class__.__name__
@@ -877,7 +869,7 @@ def full_compilation(source, grammar_base, AST_transformations, compiler):
grammar_base (GrammarBase): The GrammarBase object
AST_transformations (dict): The transformation-table that
assigns AST transformation functions to parser names (see
function ASTTransform)
function ``syntaxtree.traverse``)
compiler (object): An instance of a class derived from
``CompilerBase`` with a suitable method for every parser
name or class.
@@ -905,7 +897,7 @@ def full_compilation(source, grammar_base, AST_transformations, compiler):
if syntax_tree.error_flag:
result = None
else:
ASTTransform(syntax_tree, AST_transformations)
traverse(syntax_tree, AST_transformations)
syntax_tree.log(log_file_name, ext='.ast')
result = compiler.compile__(syntax_tree)
errors = syntax_tree.collect_errors()
......
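Taken together, this hunk shows the whole pipeline of
`full_compilation`: parse the source, `traverse` the tree with the
transformation table, then compile. A hedged sketch of a call site:
`EBNFGrammar`, `EBNFTransTable` and `EBNFCompiler` occur in this
commit, but their import paths are omitted here (the module layout is
still in flux, see the TODO above about splitting
ParserCombinators.py), and the return shape is assumed from the
variables visible in the hunk, not confirmed by it:

```python
# A sketch, not verbatim from the repository. The (result, errors,
# ast) return shape is assumed from the variables result, errors and
# syntax_tree visible above.
from parsercombinators import full_compilation
# from EBNFcompiler import EBNFGrammar, EBNFTransTable, EBNFCompiler
#   ^ hypothetical import path, not taken from this commit

with open('examples/EBNF/EBNF.ebnf') as f:
    source = f.read()

result, errors, ast = full_compilation(
    source,           # the DSL text to compile
    EBNFGrammar(),    # GrammarBase instance that parses the source
    EBNFTransTable,   # table: parser name -> AST transformations
    EBNFCompiler())   # CompilerBase subclass that emits the target
if errors:
    print('\n'.join(str(e) for e in errors))
```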
@@ -29,18 +29,17 @@ except ImportError:
import re
from typing import NamedTuple
from toolkit import IS_LOGGING, LOGS_DIR, expand_table
from toolkit import IS_LOGGING, LOGS_DIR, expand_table, line_col, sequence
__all__ = ['WHITESPACE_KEYWORD',
'TOKEN_KEYWORD',
'line_col',
'ZOMBIE_PARSER',
'Error',
'Node',
'error_messages',
'compact_sexpr',
'ASTTransform',
'traverse',
'no_transformation',
'replace_by_single_child',
'reduce_single_child',
@@ -57,19 +56,6 @@ __all__ = ['WHITESPACE_KEYWORD',
'AST_SYMBOLS']
WHITESPACE_KEYWORD = 'WSP__'
TOKEN_KEYWORD = 'TOKEN__'
def line_col(text, pos):
"""Returns the position within a text as (line, column)-tuple.
"""
assert pos < len(text), str(pos) + " >= " + str(len(text))
line = text.count("\n", 0, pos) + 1
column = pos - text.rfind("\n", 0, pos)
return line, column
class ZombieParser:
"""
Serves as a substitute for a Parser instance.
@@ -209,7 +195,7 @@ class Node:
function parameters. This could be an XML-representation or a
lisp-like S-expression.
Parameters:
Args:
tab (str): The indentation string, e.g. '\t' or ' '
openF: (Node->str) A function that returns an opening
string (e.g. an XML-tag_name) for a given node
@@ -245,7 +231,7 @@ class Node:
"""
Returns content as S-expression, i.e. in lisp-like form.
Parameters:
Args:
src: The source text or `None`. In case the source text is
given the position of the element in the text will be
reported as line and column.
@@ -275,7 +261,7 @@ class Node:
"""
Returns content as XML-tree.
Parameters:
Args:
src: The source text or `None`. In case the source text is
given the position will also be reported as line and
column.
@@ -354,7 +340,7 @@ class Node:
'e/r' yields 'x1', then 'x2'
'e' yields (r x1)(r x2)
Parameters:
Args:
path (str): The path of the object, e.g. 'a/b/c'. The
components of ``path`` can be regular expressions
@@ -401,27 +387,45 @@ def compact_sexpr(s):
########################################################################
def ASTTransform(node, transtable):
"""Transforms the parse tree starting with the given ``node`` into
an abstract syntax tree by calling transformation functions
registered in the transformation dictionary ``transtable``.
WHITESPACE_KEYWORD = 'WSP__'
TOKEN_KEYWORD = 'TOKEN__'
def traverse(node, calltable):
"""Traverses the snytax tree starting with the given ``node`` depth
first and applies the sequences of callback functions registered
in the ``calltable``-dictionary.
Possible use cases are the transformation of a concrete syntax tree
into an abstract tree (AST) or the semantic analysis of the AST.
Args:
node (Node): The root-node of the syntax tree to be traversed
calltable (dict): parser.name -> sequence of functions that
will be applied to the current node in order. This
dictionary is interpreted as a ``compact_table``. See
``toolkit.expand_table`` or ``EBNFCompiler.EBNFTransTable``
Example:
table = { "term": [replace_by_single_child, flatten],
"factor, flowmarker, retrieveop": replace_by_single_child }
traverse(node, table)
"""
# normalize transformation entries by turning single transformations
# into lists with a single item
table = {name: transformation if isinstance(transformation, collections.abc.Sequence)
else [transformation] for name, transformation in list(transtable.items())}
# normalize calltable entries by turning single values into lists
# with a single value
table = {name: sequence(call) for name, call in list(calltable.items())}
table = expand_table(table)
def recursive_ASTTransform(nd):
def traverse_recursive(nd):
if nd.children:
for child in nd.result:
recursive_ASTTransform(child)
transformation = table.get(nd.parser.name,
traverse_recursive(child)
sequence = table.get(nd.parser.name,
table.get('~', [])) + table.get('*', [])
for transform in transformation:
transform(nd)
for call in sequence:
call(nd)
recursive_ASTTransform(node)
traverse_recursive(node)
def no_transformation(node):
......
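The lookup in `traverse_recursive` gives the call table two wildcard
keys: `'~'` supplies the default sequence for parser names without an
entry of their own, and `'*'` is appended for every node. A small
usage sketch (the two callbacks are placeholders, not functions from
this commit):

```python
from syntaxtree import traverse, replace_by_single_child

def strip_anonymous(node):  # placeholder, not part of this commit
    pass

def log_node(node):         # placeholder, not part of this commit
    print(node.parser.name)

table = {
    "term, factor": replace_by_single_child,  # split by expand_table
    "~": strip_anonymous,  # fallback for names without an entry
    "*": log_node,         # applied to every node, after the above
}
traverse(syntax_tree, table)  # syntax_tree: a previously parsed Node
```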
@@ -41,10 +41,14 @@ __all__ = ['logging_on',
'logging_off',
'IS_LOGGING',
'LOGS_DIR',
'line_col',
'escape_re',
'load_if_file',
'is_python_code',
'md5']
'md5',
'expand_table',
'sequence',
'sane_parser_name']
LOGGING: str = "LOGS" # LOGGING = "" turns logging off!
@@ -101,6 +105,15 @@ def LOGS_DIR() -> str:
return dirname
def line_col(text, pos):
"""Returns the position within a text as (line, column)-tuple.
"""
assert pos < len(text), str(pos) + " >= " + str(len(text))
line = text.count("\n", 0, pos) + 1
column = pos - text.rfind("\n", 0, pos)
return line, column
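A quick check of the one-based (line, column) convention (a
doctest-style note, not part of the commit):

```python
# pos 5 in "abc\ndef" is the character 'e': line 2, column 2
assert line_col("abc\ndef", 5) == (2, 2)
```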
def escape_re(s):
"""Returns `s` with all regular expression special characters escaped.
"""
@@ -168,3 +181,16 @@ def expand_table(compact_table):
for p in parts:
expanded_table[p] = value
return expanded_table
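Only the tail of `expand_table` is visible here, but the loop implies
its contract: comma-separated keys of a compact table are expanded
into one entry per name. A sketch of the assumed behaviour (the key
splitting and whitespace stripping are inferred, not shown in the
hunk):

```python
from toolkit import expand_table
from syntaxtree import replace_by_single_child

# Assumed: "term, factor" is split on commas and stripped.
compact = {"term, factor": [replace_by_single_child]}
assert expand_table(compact) == {
    "term": [replace_by_single_child],
    "factor": [replace_by_single_child],
}
```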
def sequence(arg):
"""Returns the argument if it is a sequence, otherwise returns a
list containing the argument as sole item."""
return arg if isinstance(arg, collections.abc.Sequence) else [arg]
def sane_parser_name(name):
"""Checks whether given name is an acceptable parser name. Parser names
must not be preceeded or succeeded by a double underscore '__'!
"""
return name and name[:2] != '__' and name[-2:] != '__'
\ No newline at end of file