diff --git a/DHParser/stringview.py b/DHParser/stringview.py
index 4004651a4067f5b587827685e05ef5518e101b56..5f3a399eb2d346b8896b8e3b56cec65f2a43af7a 100644
--- a/DHParser/stringview.py
+++ b/DHParser/stringview.py
@@ -208,9 +208,9 @@ class StringView(collections.abc.Sized):

     def match(self, regex, flags=0):
         """Executes `regex.match` on the StringView object and returns the
-        result, which is either a match-object or None.
-        WARNING: match.end(), match.span() etc. are mapped to the underlying text,
-        not the StringView-object!!!
+        result, which is either a match-object or None. Keep in mind that
+        match.end(), match.span() etc. are mapped to the underlying text,
+        not the StringView-object!
         """
         return regex.match(self.text, pos=self.begin, endpos=self.end)

@@ -236,17 +236,16 @@ class StringView(collections.abc.Sized):

     def search(self, regex):
         """Executes regex.search on the StringView object and returns the
-        result, which is either a match-object or None.
-        WARNING: match.end(), match.span() etc. are mapped to the underlying text,
-        not the StringView-object!!!
+        result, which is either a match-object or None. Keep in mind that
+        match.end(), match.span() etc. are mapped to the underlying text,
+        not the StringView-object!
         """
         return regex.search(self.text, pos=self.begin, endpos=self.end)

     def finditer(self, regex):
         """Executes regex.finditer on the StringView object and returns the
-        iterator of match objects.
-        WARNING: match.end(), match.span() etc. are mapped to the underlying text,
-        not the StringView-object!!!
+        iterator of match objects. Keep in mind that match.end(), match.span()
+        etc. are mapped to the underlying text, not the StringView-object!
         """
         return regex.finditer(self.text, pos=self.begin, endpos=self.end)
diff --git a/DHParser/transform.py b/DHParser/transform.py
index b7b6278a5400c17494ffc848c953344d8bf5135a..2c98e726503bb072cef506974d7421850aebd22d 100644
--- a/DHParser/transform.py
+++ b/DHParser/transform.py
@@ -920,4 +920,3 @@ def forbid(context: List[Node], child_tags: AbstractSet[str]):
         if child.tag_name in child_tags:
             context[0].new_error(node, 'Element "%s" cannot be nested inside "%s".'
                                  % (child.parser.name, node.parser.name))
-
diff --git a/documentation/.buildinfo b/documentation/.buildinfo
deleted file mode 100644
index 3be13af30d26b6657fa69a3039463984e711d665..0000000000000000000000000000000000000000
--- a/documentation/.buildinfo
+++ /dev/null
@@ -1,4 +0,0 @@
-# Sphinx build info version 1
-# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: a864fbe9973eb04d25fc99fca3d8ce3e
-tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/documentation/.nojekyll b/documentation/.nojekyll
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/documentation/DHParser.pdf b/documentation/DHParser.pdf
deleted file mode 100644
index 851ccd9573afe2257977a00a3ea79589c26695d7..0000000000000000000000000000000000000000
Binary files a/documentation/DHParser.pdf and /dev/null differ
diff --git a/documentation_sources/Makefile b/documentation/Makefile
similarity index 100%
rename from documentation_sources/Makefile
rename to documentation/Makefile
diff --git a/documentation/ModuleReference.html b/documentation/ModuleReference.html
deleted file mode 100644
index a0dfd76d32d68df01f666814a25e9bfa52170e9a..0000000000000000000000000000000000000000
--- a/documentation/ModuleReference.html
+++ /dev/null
@@ -1,3351 +0,0 @@

Module Reference

-

DHParser is split into a number of modules plus one command line utility (dhparser.py), which will not be described here.

-

Usually, the user or "importer" of DHParser does not need to worry about its internal module structure, because DHParser provides a flat namespace from which all of its symbols can be imported, e.g.:

-
from DHParser import *
-
-
-

or:

-
from DHParser import recompile_grammar, grammar_suite, compile_source
-
-
-

However, in order to add or change the source code of DHParser, its module -structure must be understood. DHParser’s modules can roughly be sorted into -three different categories:

-
    -
  1. Modules that contain the basic functionality for packrat-parsing, AST-transformation and the skeleton for DSL-compilers.
  2. Modules for EBNF-Grammars and DSL compilation.
  3. Support or "toolkit"-modules that contain various helpful functions.
-

The import-order of DHParser's modules runs across these categories. In the following list, modules further down may import one or more of the modules above them, but not the other way round:

-
    -
  • versionnumber.py – contains the version number of DHParser
  • toolkit.py – utility functions for DHParser
  • stringview.py – a string class where slices are views, not copies as with standard Python strings
  • preprocess.py – preprocessing of source files for DHParser
  • error.py – error handling for DHParser
  • syntaxtree.py – syntax tree classes for DHParser
  • transform.py – transformation functions for converting the concrete into the abstract syntax tree
  • logging.py – logging and debugging for DHParser
  • parse.py – parser combinators for DHParser
  • compile.py – abstract base class for compilers that transform an AST into something useful
  • ebnf.py – EBNF -> Python-Parser compilation for DHParser
  • dsl.py – support for domain specific notations for DHParser
  • testing.py – test support for DHParser-based grammars and compilers
-
-

Main Modules Reference

-

The core of DHParser consists of the modules containing the functionality for the parsing and compiling process. The modules preprocess, parse, transform and compile represent particular stages of the parsing/compiling process, while syntaxtree and error define classes for syntax trees and parser/compiler errors, respectively.

-
-

Module preprocess

-

Module preprocess contains functions for preprocessing source code before the parsing stage as well as source mapping facilities to map the locations of parser and compiler errors to the non-preprocessed source text.

-

Preprocessing (and source mapping of errors) will only be needed for some domain specific languages, most notably those that cannot be completely described with context-free grammars.

-
-
-make_token(token: str, argument: str = '') → str[source]
-

Turns the token and argument into a special token that -will be caught by the PreprocessorToken-parser.

-

This function is a support function that should be used by -preprocessors to inject preprocessor tokens into the source text.

-
- -
-
-strip_tokens(tokenized: str) → str[source]
-

Replaces all tokens with the token’s arguments.

-
- -
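A minimal sketch of how a preprocessor might use these two functions; the mark_keywords helper and the round trip shown are illustrative assumptions, not code from the library:

from DHParser import make_token, strip_tokens

def mark_keywords(text: str) -> str:
    # wrap the keyword in a preprocessor token, passing the original
    # text as the token's argument so that strip_tokens() can restore it
    return text.replace('BEGIN', make_token('KEYWORD', 'BEGIN'))

enriched = mark_keywords('BEGIN data')
assert strip_tokens(enriched) == 'BEGIN data'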
-
-nil_preprocessor(text: str) → Tuple[str, Union[typing.Callable[[int], int], functools.partial]][source]
-

A preprocessor that does nothing, i.e. just returns the input.

-
- -
-
-chain_preprocessors(*preprocessors) → Union[typing.Callable[[str], typing.Union[str, typing.Tuple[str, typing.Union[typing.Callable[[int], int], functools.partial]]]], functools.partial][source]
-

Merges a sequence of preprocessor functions into a single function.

-
- -
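For instance, two preprocessing steps could be merged and applied like this (a sketch; resolve_includes and mark_keywords stand for hypothetical preprocessor functions):

preprocessor = chain_preprocessors(resolve_includes, mark_keywords)
text, mapping = with_source_mapping(preprocessor(source_text))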
-
-prettyprint_tokenized(tokenized: str) → str[source]
-

Returns a pretty-printable version of a document that contains tokens.

-
- -
-
-class SourceMap(positions, offsets)
-
-
-offsets
-

Alias for field number 1

-
- -
-
-positions
-

Alias for field number 0

-
- -
- -
-
-tokenized_to_original_mapping(tokenized_source: str) → preprocess.SourceMap[source]
-

Generates a source map for mapping positions in a text that has -been enriched with token markers to their original positions.

- --- - - - - - -
Parameters: tokenized_source – the source text enriched with token markers
Returns: a source map, i.e. a list of positions and a list of corresponding offsets. The list of positions is ordered from smallest to highest. An offset is valid for its associated position and all following positions until (and excluding) the next position in the list of positions.
-
- -
-
-source_map(position: int, srcmap: preprocess.SourceMap) → int[source]
-

Maps a position in a (pre-)processed text to its corresponding -position in the original document according to the given source map.

- --- - - - - - -
Parameters:
    -
  • position – the position in the processed text
  • -
  • srcmap – the source map, i.e. a mapping of locations to offset values
  • -
-
Returns:

the mapped position

-
-
- -
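Following the description of SourceMap above, this lookup amounts to finding the last recorded position that is not greater than the queried position and applying its offset. A minimal sketch of the idea (assuming additive offsets; not the module's actual code):

from bisect import bisect_right

def source_map_sketch(position: int, srcmap) -> int:
    # index of the last recorded position <= position
    i = bisect_right(srcmap.positions, position) - 1
    return position + srcmap.offsets[i]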
-
-with_source_mapping(result: Union[str, typing.Tuple[str, typing.Union[typing.Callable[[int], int], functools.partial]]]) → Tuple[str, Union[typing.Callable[[int], int], functools.partial]][source]
-

Normalizes preprocessor results by adding a mapping if a preprocessor only returns the transformed source code and no mapping by itself. It is assumed that in this case the preprocessor has just enriched the source code with tokens, so that a source mapping can be derived automatically with tokenized_to_original_mapping() (see above).

-
- -
-
-

Module syntaxtree

-

Module syntaxtree defines the Node-class for syntax trees as well -as an abstract base class for parser-objects. The latter is defined -here, because node-objects refer to parser-objects. All concrete -parser classes are defined in the parse module.

-
-
-class ParserBase(name='')[source]
-

ParserBase is the base class for all real and mock parser classes. -It is defined here, because Node objects require a parser object -for instantiation.

-
-
-apply(func: Callable) → bool[source]
-

Applies the function func to the parser. Returns False if, for whatever reason, the function has not been applied, True otherwise.

-
- -
-
-grammar() → object[source]
-

Returns the Grammar object to which the parser belongs. If not -yet connected to any Grammar object, None is returned.

-
- -
-
-name
-

Returns the name of the parser or the empty string ‘’ for unnamed -parsers.

-
- -
-
-ptype
-

Returns the type of the parser. By default this is the parser’s -class name preceded by a colon, e.g. ‘:ZeroOrMore’.

-
- -
-
-repr
-

Returns the parser's name if it has a name, and repr() otherwise.

-
- -
-
-reset()[source]
-

Resets any parser variables. (Should be overridden.)

-
- -
- -
-
-class MockParser(name='', ptype='')[source]
-

MockParser objects can be used to reconstruct syntax trees from a -serialized form like S-expressions or XML. Mock objects can mimic -different parser types by assigning them a ptype on initialization.

-

Mock objects should not be used for anything other than syntax tree (re-)construction. In all other cases where a parser object substitute is needed, choose the singleton ZOMBIE_PARSER.

-
- -
-
-class ZombieParser[source]
-

Serves as a substitute for a Parser instance.

-

ZombieParser is the class of the singleton object ZOMBIE_PARSER. The ZOMBIE_PARSER has a name and can be called, but it never matches. It serves as a substitute where only these properties (or one of them) are needed, but no real Parser-object is instantiated.

-
- -
-
-class Node(parser, result: Union[typing.Tuple[_ForwardRef('Node'), ...], _ForwardRef('Node'), DHParser.stringview.StringView, str, NoneType], leafhint: bool = False) → None[source]
-

Represents a node in the concrete or abstract syntax tree.

-
-
-tag_name
-

str – The name of the node, which is either its -parser’s name or, if that is empty, the parser’s class name

-
- -
-
-result
-

str or tuple – The result of the parser which -generated this node, which can be either a string or a -tuple of child nodes.

-
- -
-
-children
-

tuple – The tuple of child nodes or an empty tuple -if there are no child nodes. READ ONLY!

-
- -
-
-content
-

str – Yields the contents of the tree as string. The -difference to str(node) is that node.content does -not add the error messages to the returned string.

-
- -
-
-parser
-

Parser – The parser which generated this node. WARNING: In case you use mock syntax trees for testing or parser replacement during the AST-transformation: DO NOT rely on this being a real parser object in any phase after parsing (i.e. AST-transformation and compiling), for example by calling isinstance(node.parser, ...).

-
- -
-
-len
-

int – The full length of the node's string result if the node is a leaf node or, otherwise, the concatenated string results of its descendants. The figure always represents the length before AST-transformation and will never change through AST-transformation. READ ONLY!

-
- -
-
-pos
-

int – the position of the node within the parsed text.

-

The value of pos is -1 meaning invalid by default. -Setting this value will set the positions of all child -nodes relative to this value.

-

To set the pos values of all nodes in a syntax tree, the -pos value of the root node should be set to 0 right -after parsing.

-

Other than that, this value should be considered READ ONLY. -At any rate, it should only be reassigned during the parsing -stage and never during or after the AST-transformation.

-
- -
-
-errors
-

list – A list of all errors that occurred on this node.

-
- -
-
-attributes
-

dict – An optional dictionary of XML-attributes. This -dictionary is created lazily upon first usage. The attributes -will only be shown in the XML-Representation, not in the -S-Expression-output.

-
- -
-
-as_sxpr(src: str = None, compact: bool = False, showerrors: bool = True, indentation: int = 2) → str[source]
-

Returns content as S-expression, i.e. in lisp-like form.

- --- - - - -
Parameters:
    -
  • src – The source text or None. In case the source text is -given the position of the element in the text will be -reported as line and column.
  • -
  • compact – If True a compact representation is returned where -brackets are omitted and only the indentation indicates the -tree structure.
  • -
-
-
- -
-
-as_xml(src: str = None, showerrors: bool = True, indentation: int = 2) → str[source]
-

Returns content as XML-tree.

- --- - - - -
Parameters:src – The source text or None. In case the source text is -given the position will also be reported as line and -column.
-
- -
-
-attributes
-

Returns a dictionary of XML-Attributes attached to the Node.

-
- -
-
-content
-

Returns content as string, omitting error messages.

-
- -
-
-init_pos(pos: int) → syntaxtree.Node[source]
-

(Re-)initialize position value. Usually, the parser guard -(parsers.add_parser_guard()) takes care of assigning the -position in the document to newly created nodes. However, -where Nodes are created outside the reach of the parser -guard, their document-position must be assigned manually. -This function recursively reassigns the position values -of the child nodes, too.

-
- -
-
-pick(tag_names: Union[str, typing.Set[str]]) → Union[_ForwardRef('Node'), NoneType][source]
-

Picks the first descendant with one of the given tag_names.

-

This function is just syntactic sugar for next(node.select_by_tag(tag_names, False)). However, rather than raising a StopIteration-exception if no descendant with the given tag-name exists, it returns None.

-
- -
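A usage sketch, based on the select_by_tag examples further below:

>>> tree = parse_sxpr('(a (b "X") (X (c "d")) (e (X "F")))')
>>> flatten_sxpr(tree.pick("X").as_sxpr())
'(X (c "d"))'
>>> tree.pick("nonexistent") is None
True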
-
-pos
-

Returns the position of the Node’s content in the source text.

-
- -
-
-result
-

Returns the result from the parser that created the node. -Error messages are not included in the result. Use self.content() -if the result plus any error messages is needed.

-
- -
-
-select(match_function: Callable, include_root: bool = True) → Iterator[_ForwardRef('Node')][source]
-

Finds nodes in the tree that fulfill a given criterion.

-

select is a generator that yields all nodes for which the -given match_function evaluates to True. The tree is -traversed pre-order.

-

See function Node.select_by_tag for some examples.

- --- - - - - - -
Parameters:
    -
  • match_function (function) – A function that takes as Node -object as argument and returns True or False
  • -
  • include_root (bool) – If False, only descendant nodes will be -checked for a match.
  • -
-
Yields:

Node – All nodes of the tree for which -match_function(node) returns True

-
-
- -
-
-select_by_tag(tag_names: Union[str, typing.AbstractSet[str]], include_root: bool = True) → Iterator[_ForwardRef('Node')][source]
-

Returns an iterator that runs through all descendants that have one -of the given tag names.

-

Examples:

-
>>> tree = parse_sxpr('(a (b "X") (X (c "d")) (e (X "F")))')
>>> list(flatten_sxpr(item.as_sxpr()) for item in tree.select_by_tag("X", False))
['(X (c "d"))', '(X "F")']
>>> list(flatten_sxpr(item.as_sxpr()) for item in tree.select_by_tag({"X", "b"}, False))
['(b "X")', '(X (c "d"))', '(X "F")']
>>> any(tree.select_by_tag('a', False))
False
>>> list(flatten_sxpr(item.as_sxpr()) for item in tree.select_by_tag('a', True))
['(a (b "X") (X (c "d")) (e (X "F")))']
>>> flatten_sxpr(next(tree.select_by_tag("X", False)).as_sxpr())
'(X (c "d"))'
-
-
- --- - - - - - -
Parameters:
    -
  • tag_name (set) – A tag name or set of tag names that is being -searched for
  • -
  • include_root (bool) – If False, only descendant nodes will be -checked for a match.
  • -
-
Yields:

Node – All nodes which have a given tag name.

-
-
- -
-
-structure
-

Return structure (and content) as S-expression on a single line -without any line breaks.

-
- -
-
-tag_name
-

Returns the tag name of Node, i.e. the name for XML or S-expression representation. By default the tag name is the name of the node's parser or, if the node's parser is unnamed, the node's parser's ptype.

-
- -
-
-tree_size() → int[source]
-

Recursively counts the number of nodes in the tree including the root node.

-
- -
- -
-
-class RootNode(node: Union[syntaxtree.Node, NoneType] = None) → syntaxtree.RootNode[source]
-

TODO: Add Documentation!!!

-
-
errors (list): A list of all errors that have occurred so far during processing (i.e. parsing, AST-transformation, compiling) of this tree.

error_flag (int): the highest warning or error level of all errors that occurred.
-
-
-
-add_error(node: syntaxtree.Node, error: DHParser.error.Error) → syntaxtree.RootNode[source]
-

Adds an Error object to the tree, locating it at a specific node.

-
- -
-
-collect_errors() → List[DHParser.error.Error][source]
-

Returns the list of errors, ordered by their position.

-
- -
-
-new_error(node: syntaxtree.Node, message: str, code: int = 1000) → syntaxtree.RootNode[source]
-

Adds an error to this tree, locating it at a specific node.
:param node: The node where the error occurred
:type node: Node
:param message: A string with the error message
:type message: str
:param code: An error code to identify the kind of error
:type code: int

-
- -
- -
-
-parse_sxpr(sxpr: str) → syntaxtree.Node[source]
-

Generates a tree of nodes from an S-expression.

-

This can - among other things - be used for deserialization of trees that -have been serialized with Node.as_sxpr() or as a convenient way to -generate test data.

-

Example:
>>> parse_sxpr("(a (b c))").as_sxpr()
'(a\n (b\n "c"\n )\n)'

-
- -
-
-parse_xml(xml: str) → syntaxtree.Node[source]
-

Generates a tree of nodes from a (Pseudo-)XML-source.

-
- -
-
-flatten_sxpr(sxpr: str) → str[source]
-

Returns S-expression sxpr as a one-liner without unnecessary -whitespace.

-

Example:
>>> flatten_sxpr('(a\n (b\n c\n )\n)\n')
'(a (b c))'

-
- -
-
-flatten_xml(xml: str) → str[source]
-

Returns an XML-tree as a one-liner without unnecessary whitespace, i.e. only whitespace within leaf-nodes is preserved.

-
- -
-
-

Module parse

-

Module parse contains the Python classes and functions for DHParser's packrat-parser. Its central class is the Grammar-class, which is the base class for any concrete Grammar. Grammar-objects are callable and parsing is done by calling a Grammar-object with a source text as argument.

-

The different parsing functions are callable descendants of class -Parser. Usually, they are organized in a tree and defined -within the namespace of a grammar-class. See ebnf.EBNFGrammar -for an example.

-
-
-class Parser(name: str = '') → None[source]
-

(Abstract) Base class for Parser combinator parsers. Any parser object that is actually used for parsing (i.e. no mock parsers) should be derived from this class.

-

Since parsers can contain other parsers (see classes UnaryOperator -and NaryOperator) they form a cyclical directed graph. A root -parser is a parser from which all other parsers can be reached. -Usually, there is one root parser which serves as the starting -point of the parsing process. When speaking of “the root parser” -it is this root parser object that is meant.

-

There are two different types of parsers:

-
    -
  1. Named parsers for which a name is set in field parser.name. The results produced by these parsers can later be retrieved in the AST by the parser name.
  2. Anonymous parsers, where the name-field just contains the empty string. AST-transformation of anonymous parsers can be hooked only to their class name, not to the individual parser.
-

Parser objects are callable and parsing is done by calling a parser -object with the text to parse.

-

If the parser matches it returns a tuple consisting of a node -representing the root of the concrete syntax tree resulting from the -match as well as the substring text[i:] where i is the length of -matched text (which can be zero in the case of parsers like -ZeroOrMore or Option). If i > 0 then the parser has “moved -forward”.

-

If the parser does not match it returns (None, text). Note that this is not the same as an empty match ("", text). An empty match can, for example, be returned by the ZeroOrMore-parser in case the contained parser is repeated zero times.

-
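For instance, the difference between "no match" and an "empty match" shows with a ZeroOrMore-parser, which matches even an empty document (a sketch along the lines of the examples further below):

>>> Grammar(ZeroOrMore(Token('hi')))('').content
''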
-
Attributes and Properties:
-
-
visited: Mapping of places this parser has already been to during the current parsing process onto the results the parser returned at the respective place. This dictionary is used to implement memoizing.

recursion_counter: Mapping of places to how often the parser has already been called recursively at this place. This is needed to implement left recursion. The number of calls becomes irrelevant once a result has been memoized.

cycle_detection: The apply()-method uses this variable to make sure that one and the same function will not be applied (recursively) a second time, if it has already been applied to this parser.

grammar: A reference to the Grammar object to which the parser is attached.
-
-
-
-
-
-ApplyFunc
-

alias of typing.Callable

-
- -
-
-apply(func: Callable[_ForwardRef('Parser'), NoneType]) → bool[source]
-

Applies function func(parser) recursively to this parser and all -descendant parsers if any exist. The same function can never -be applied twice between calls of the reset()-method! -Returns True, if function has been applied, False if function -had been applied earlier already and thus has not been applied again.

-
- -
-
-grammar
-

Returns the Grammar object to which the parser belongs. If not -yet connected to any Grammar object, None is returned.

-
- -
-
-reset()[source]
-

Initializes or resets any parser variables. If overwritten, -the reset()-method of the parent class must be called from the -reset()-method of the derived class.

-
- -
- -
-
-exception UnknownParserError[source]
-

UnknownParserError is raised if a Grammar object is called with a parser that does not exist or if, in the course of parsing, a parser is referred to that does not exist.

-
- -
-
-class Grammar(root: parse.Parser = None) → None[source]
-

Class Grammar directs the parsing process and stores global state information of the parsers, i.e. state information that is shared across parsers.

-

Grammars are basically collections of parser objects, which are -connected to an instance object of class Grammar. There exist two -ways of connecting parsers to grammar objects: Either by passing -the root parser object to the constructor of a Grammar object -(“direct instantiation”), or by assigning the root parser to the -class variable “root__” of a descendant class of class Grammar.

-

Example for direct instantiation of a grammar:

-
>>> number = RE('\d+') + RE('\.') + RE('\d+') | RE('\d+')
>>> number_parser = Grammar(number)
>>> number_parser("3.1416").content
'3.1416'
-
-
-

Collecting the parsers that define a grammar in a descendant class of -class Grammar and assigning the named parsers to class variables -rather than global variables has several advantages:

-
    -
  1. It keeps the namespace clean.
  2. The parser names of named parsers do not need to be passed to the constructor of the Parser object explicitly; it suffices to assign them to class variables, which results in better readability of the Python code.
  3. The parsers in the class do not necessarily need to be connected to one single root parser, which is helpful for testing and for building up a parser successively from several components.
-

As a consequence, though, it is highly recommended that a Grammar -class should not define any other variables or methods with names -that are legal parser names. A name ending with a double -underscore ‘__’ is not a legal parser name and can safely be -used.

-

Example:

-
class Arithmetic(Grammar):
    # special fields for implicit whitespace and comment configuration
    COMMENT__ = r'#.*(?:\n|$)'  # Python style comments
    wspR__ = mixin_comment(whitespace=r'[\t ]*', comment=COMMENT__)

    # parsers
    expression = Forward()
    INTEGER = RE('\\d+')
    factor = INTEGER | Token("(") + expression + Token(")")
    term = factor + ZeroOrMore((Token("*") | Token("/")) + factor)
    expression.set(term + ZeroOrMore((Token("+") | Token("-")) + term))
    root__ = expression
-
-

Upon instantiation the parser objects are deep-copied to the -Grammar object and assigned to object variables of the same name. -Any parser that is directly assigned to a class variable is a -‘named’ parser and its field parser.name contains the variable -name after instantiation of the Grammar class. All other parsers, -i.e. parsers that are defined within a named parser, remain -“anonymous parsers” where parser.name is the empty string, unless -a name has been passed explicitly upon instantiation. -If one and the same parser is assigned to several class variables -such as, for example the parser expression in the example above, -the first name sticks.

-

Grammar objects are callable. Calling a grammar object with a UTF-8 -encoded document, initiates the parsing of the document with the -root parser. The return value is the concrete syntax tree. Grammar -objects can be reused (i.e. called again) after parsing. Thus, it -is not necessary to instantiate more than one Grammar object per -thread.

-

Grammar classes contain a few special class fields for implicit -whitespace and comments that should be overwritten, if the defaults -(no comments, horizontal right aligned whitespace) don’t fit:

-
-
-COMMENT__
-

regular expression string for matching comments

-
- -
-
-WSP__
-

regular expression for whitespace and comments

-
- -
-
-wspL__
-

regular expression string for left aligned whitespace, -which either equals WSP__ or is empty.

-
- -
-
-wspR__
-

regular expression string for right aligned whitespace, -which either equals WSP__ or is empty.

-
- -
-
-root__
-

The root parser of the grammar. Theoretically, all parsers of the -grammar should be reachable by the root parser. However, for testing -of yet incomplete grammars class Grammar does not assume that this -is the case.

-
- -
-
-parser_initializiation__
-

Before the parser class (!) has been initialized, which happens upon the first time it is instantiated (see _assign_parser_names() for an explanation), this class field contains a value other than "done". A value of "done" indicates that the class has already been initialized.

-
- -
-
-python__src__
-

For the purpose of debugging and inspection, this field can -take the python src of the concrete grammar class -(see dsl.grammar_provider).

-
- -
-
-all_parsers__
-

A set of all parsers connected to this grammar object

-
- -
-
-history_tracking__
-

A flag indicating that the parsing history shall -be tracked

-
- -
-
-whitespace__
-

A parser for the implicit optional whitespace (or the zombie-parser if the default is empty). The default whitespace will be used by parsers Token and, if no other parsers are passed to its constructor, by parser RE. It can also be placed explicitly in the EBNF-Grammar via the "~"-sign.

-
- -
-
-wsp_left_parser__
-

The same as whitespace for -left-adjacent-whitespace.

-
- -
-
-wsp_right_parser__
-

The same as whitespace for -right-adjacent-whitespace.

-
- -
-
-_dirty_flag__
-

A flag indicating that the Grammar has been called at -least once so that the parsing-variables need to be reset -when it is called again.

-
- -
-
-document__
-

the text that has most recently been parsed or that is -currently being parsed.

-
- -
-
-document_length__
-

the length of the document.

-
- -
-
-document_lbreaks__
-

list of linebreaks within the document, starting -with -1 and ending with EOF. This helps generating line -and column number for history recording and will only be -initialized if history_tracking__ is true.

-
- -
-
-tree__
-

The root-node of the parsing tree. This variable is available -for error-reporting already during parsing via -self.grammar.tree__.add_error, but it references the full -parsing tree only after parsing has been finished.

-
- -
-
-_reversed__
-

the same text in reverse order - needed by the Lookbehind- -parsers.

-
- -
-
-variables__
-

A mapping for variable names to a stack of their respective -string values - needed by the Capture-, Retrieve- -and Pop-parsers.

-
- -
-
-rollback__
-

A list of tuples (location, rollback-function) that are -deposited by the Capture- and Pop-parsers. -If the parsing process reaches a dead end then all -rollback-functions up to the point to which it retreats will be -called and the state of the variable stack restored accordingly.

-
- -
-
-last_rb__loc__
-

The last, i.e. most advanced location in the text -where a variable changing operation occurred. If the parser -backtracks to a location at or before last_rb__loc__ (i.e. -location <= last_rb__loc__) then a rollback of all variable -changing operations is necessary that occurred after the -location to which the parser backtracks. This is done by -calling method rollback_to__(location)().

-
- -
-
-call_stack__
-

A stack of all parsers that have been called. This -is required for recording the parser history (for debugging) -and, eventually, i.e. one day in the future, for tracing through -the parsing process.

-
- -
-
-history__
-

A list of parser-call-stacks. A parser-call-stack is -appended to the list each time a parser either matches, fails -or if a parser-error occurs.

-
- -
-
-moving_forward__
-

This flag indicates that the parsing process is currently -moving forward . It is needed to reduce noise in history recording -and should not be considered as having a valid value if history -recording is turned off! (See add_parser_guard() and its local -function guarded_call())

-
- -
-
-recursion_locations__
-

Stores the locations where left recursion was -detected. Needed to provide minimal memoization for the left -recursion detection algorithm, but, strictly speaking, superfluous -if full memoization is enabled. (See add_parser_guard() and its -local function guarded_call())

-
- -
-
-memoization__
-

Turns full memoization on or off. Turning memoization off -results in less memory usage and sometimes reduced parsing time. -In some situations it may drastically increase parsing time, so -it is safer to leave it on. (Default: on)

-
- -
-
-left_recursion_handling__
-

Turns left recursion handling on or off. -If turned off, a recursion error will result in case of left -recursion.

-
- -
-
-push_rollback__(location, func)[source]
-

Adds a rollback function that either removes or re-adds -values on the variable stack (self.variables) that have been -added (or removed) by Capture or Pop Parsers, the results of -which have been dismissed.

-
- -
-
-reversed__
-

Returns a reversed version of the currently parsed document. As -about the only case where this is needed is the Lookbehind-parser, -this is done lazily.

-
- -
-
-rollback_to__(location)[source]
-

Rolls back the variable stacks (self.variables) to its -state at an earlier location in the parsed document.

-
- -
- -
-
-class PreprocessorToken(token: str) → None[source]
-

Parses tokens that have been inserted by a preprocessor.

-

Preprocessors can generate Tokens with the make_token-function. -These tokens start and end with magic characters that can only be -matched by the PreprocessorToken Parser. Such tokens can be used to -insert BEGIN - END delimiters at the beginning or ending of a -quoted block, for example.

-
- -
-
-class RegExp(regexp, name: str = '') → None[source]
-

Regular expression parser.

-

The RegExp-parser parses text that matches a regular expression. -RegExp can also be considered as the “atomic parser”, because all -other parsers delegate part of the parsing job to other parsers, -but do not match text directly.

-

Example:

-
>>> word = RegExp(r'\w+')
>>> Grammar(word)("Haus").content
'Haus'
-
-
-

EBNF-Notation: / ... /

-

EBNF-Example: word = /\w+/

-
- -
-
-class Whitespace(regexp, name: str = '') → None[source]
-

A variant of RegExp that signifies through its class name that it is a RegExp-parser for whitespace.

-
- -
-
-class RE(regexp, wL=None, wR=None, name: str = '') → None[source]
-

Regular Expressions with optional leading or trailing whitespace.

-

The RE-parser parses pieces of text that match a given regular -expression. Other than the RegExp-Parser it can also skip -“implicit whitespace” before or after the matched text.

-

The whitespace is in turn defined by a regular expression. It should be made sure that this expression also matches the empty string, e.g. use r'\s*' or r'[\t ]+', but not r'\s+'. If the respective parameters in the constructor are set to None the default whitespace expression from the Grammar object will be used.

-

Example (allowing whitespace on the right hand side, but not on -the left hand side of a regular expression):

-
>>> word = RE(r'\w+', wR=r'\s*')
>>> parser = Grammar(word)
>>> result = parser('Haus ')
>>> result.content
'Haus '
>>> result.structure
'(:RE (:RegExp "Haus") (:Whitespace " "))'
>>> str(parser(' Haus'))
' <<< Error on " Haus" | Parser did not match! Invalid source file?\n    Most advanced: None\n    Last match:    None; >>> '
-
-
-

EBNF-Notation: / ... /~  or  ~/ ... /  or  ~/ ... /~

-

EBNF-Example: word = /\w+/~

-
-
-apply(func: Callable[_ForwardRef('Parser'), NoneType]) → bool[source]
-

Applies function func(parser) recursively to this parser and all -descendant parsers if any exist. The same function can never -be applied twice between calls of the reset()-method! -Returns True, if function has been applied, False if function -had been applied earlier already and thus has not been applied again.

-
- -
-
-create_main_parser(arg) → parse.Parser[source]
-

Creates the main parser of this compound parser. Can be overridden.

-
- -
- -
-
-class Token(token: str, wL=None, wR=None, name: str = '') → None[source]
-

Class Token parses simple strings. Any regular expression commands will be interpreted as a simple sequence of characters.

-

Other than that, class Token is essentially a renamed version of class RE. Because tokens often have a particular semantic different from other REs, parsing them with a separate parser class makes it possible to distinguish them by their parser type.

-
-
-create_main_parser(arg) → parse.Parser[source]
-

Creates the main parser of this compound parser. Can be overridden.

-
- -
- -
-
-mixin_comment(whitespace: str, comment: str) → str[source]
-

Returns a regular expression that merges comment and whitespace regexps. Thus, comments can occur wherever whitespace is allowed and will be skipped just as implicit whitespace.

-

Note that because this works on the level of regular expressions, nesting comments is not possible. It also makes it much harder to use directives inside comments (which isn't recommended, anyway).

-
- -
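For example, the Arithmetic grammar shown earlier combines horizontal whitespace with Python-style comments this way:

COMMENT__ = r'#.*(?:\n|$)'  # Python style comments
wspR__ = mixin_comment(whitespace=r'[\t ]*', comment=COMMENT__)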
-
-class Synonym(parser: parse.Parser, name: str = '') → None[source]
-

Simply calls another parser and encapsulates the result in -another node if that parser matches.

-

This parser is needed to support synonyms in EBNF, e.g.:

-
jahr       = JAHRESZAHL
JAHRESZAHL = /\d\d\d\d/
-
-
-

Otherwise the first line could not be represented by any parser class, in which case it would be unclear whether the parser RE(r'\d\d\d\d') carries the name 'JAHRESZAHL' or 'jahr'.

-
- -
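In Python, the two EBNF lines above would be expressed along these lines (a sketch):

JAHRESZAHL = RE(r'\d\d\d\d')
jahr = Synonym(JAHRESZAHL)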
-
-class Option(parser: parse.Parser, name: str = '') → None[source]
-

Parser Option always matches, even if its child-parser -did not match.

-

If the child-parser did not match, Option returns a node with no content and does not move forward in the text.

-

If the child-parser did match, Option returns a node with the node returned by the child-parser as its single child and the text at the position where the child-parser left it.

-

Examples:

-
>>> number = Option(Token('-')) + RegExp(r'\d+') + Option(RegExp(r'\.\d+'))
>>> Grammar(number)('3.14159').content
'3.14159'
>>> Grammar(number)('3.14159').structure
'(:Series (:Option) (:RegExp "3") (:Option (:RegExp ".14159")))'
>>> Grammar(number)('-1').content
'-1'
-
-
-

EBNF-Notation: [ ... ]

-

EBNF-Example: number = ["-"]  /\d+/  [ /\.\d+/ ]

-
- -
-
-class ZeroOrMore(parser: parse.Parser, name: str = '') → None[source]
-

ZeroOrMore applies a parser repeatedly as long as this parser -matches. Like Option the ZeroOrMore parser always matches. In -case of zero repetitions, the empty match ((), text) is returned.

-

Examples:

-
>>> sentence = ZeroOrMore(RE(r'\w+,?')) + Token('.')
>>> Grammar(sentence)('Wo viel der Weisheit, da auch viel des Grämens.').content
'Wo viel der Weisheit, da auch viel des Grämens.'
>>> Grammar(sentence)('.').content  # an empty sentence also matches
'.'
-
-
-

EBNF-Notation: { ... }

-

EBNF-Example: sentence = { /\w+,?/ } "."

-
- -
-
-class OneOrMore(parser: parse.Parser, name: str = '') → None[source]
-

OneOrMore applies a parser repeatedly as long as this parser -matches. Other than ZeroOrMore which always matches, at least -one match is required by OneOrMore.

-

Examples:

-
>>> sentence = OneOrMore(RE(r'\w+,?')) + Token('.')
>>> Grammar(sentence)('Wo viel der Weisheit, da auch viel des Grämens.').content
'Wo viel der Weisheit, da auch viel des Grämens.'
>>> str(Grammar(sentence)('.'))  # an empty sentence does not match
' <<< Error on "." | Parser did not match! Invalid source file?\n    Most advanced: None\n    Last match:    None; >>> '
-
-
-

EBNF-Notation: { ... }+

-

EBNF-Example: sentence = { /\w+,?/ }+

-
- -
-
-class Series(*parsers, mandatory: int = 1000, name: str = '') → None[source]
-

Matches if each of a series of parsers matches exactly in the order of -the series.

-

Example:

-
>>> variable_name = RegExp('(?!\d)\w') + RE('\w*')
>>> Grammar(variable_name)('variable_1').content
'variable_1'
>>> str(Grammar(variable_name)('1_variable'))
' <<< Error on "1_variable" | Parser did not match! Invalid source file?\n    Most advanced: None\n    Last match:    None; >>> '
-
-
-

EBNF-Notation: ... ... (sequence of parsers separated by a blank or new line)

-

EBNF-Example: series = letter letter_or_digit

-
-
-static combined_mandatory(left: parse.Parser, right: parse.Parser)[source]
-

Returns the position of the first mandatory element (if any) when -parsers left and right are joined to a sequence.

-
- -
- -
-
-class Alternative(*parsers, name: str = '') → None[source]
-

Matches if one of several alternatives matches. Returns -the first match.

-

This parser represents the EBNF-operator “|” with the qualification -that both the symmetry and the ambiguity of the EBNF-or-operator -are broken by selecting the first match.:

-
# the order of the sub-expression matters!
>>> number = RE('\d+') | RE('\d+') + RE('\.') + RE('\d+')
>>> str(Grammar(number)("3.1416"))
'3 <<< Error on ".141" | Parser stopped before end! trying to recover... >>> '

# the most selective expression should be put first:
>>> number = RE('\d+') + RE('\.') + RE('\d+') | RE('\d+')
>>> Grammar(number)("3.1416").content
'3.1416'
-
-
-

EBNF-Notation: ... | ...

-

EBNF-Example: sentence = /\d+\.\d+/ | /\d+/

-
-
-reset()[source]
-

Initializes or resets any parser variables. If overwritten, -the reset()-method of the parent class must be called from the -reset()-method of the derived class.

-
- -
- -
-
-class AllOf(*parsers, name: str = '') → None[source]
-

Matches if all elements of a list of parsers match. Each parser must -match exactly once. Other than in a sequence, the order in which -the parsers match is arbitrary, however.

-

Example:

-
>>> prefixes = AllOf(Token("A"), Token("B"))
>>> Grammar(prefixes)('A B').content
'A B'
>>> Grammar(prefixes)('B A').content
'B A'
-
-
-

EBNF-Notation: <... ...> (sequence of parsers enclosed by angular brackets)

-

EBNF-Example: set = <letter letter_or_digit>

-
- -
-
-class SomeOf(*parsers, name: str = '') → None[source]
-

Matches if at least one element of a list of parsers matches. No parser may match more than once. Other than in a sequence, the order in which the parsers match is arbitrary, however.

-

Example:

-
>>> prefixes = SomeOf(Token("A"), Token("B"))
>>> Grammar(prefixes)('A B').content
'A B'
>>> Grammar(prefixes)('B A').content
'B A'
>>> Grammar(prefixes)('B').content
'B'
-
-
-

EBNF-Notation: <... ...> (sequence of parsers enclosed by angular brackets)

-

EBNF-Example: set = <letter letter_or_digit>

-
- -
-
-Unordered(parser: parse.NaryOperator, name: str = '') → parse.NaryOperator[source]
-

Returns an AllOf- or SomeOf-parser depending on whether parser -is a Series (AllOf) or an Alternative (SomeOf).

-
- -
-
-class Lookahead(parser: parse.Parser, name: str = '') → None[source]
-

Matches, if the contained parser would match for the following text, -but does not consume any text.

-
- -
-
-class NegativeLookahead(parser: parse.Parser, name: str = '') → None[source]
-

Matches, if the contained parser would not match for the following -text.

-
-
-sign(bool_value) → bool[source]
-

Returns the value. Can be overridden to return the inverted bool.

-
- -
- -
-
-class Lookbehind(parser: parse.Parser, name: str = '') → None[source]
-

Matches, if the contained parser would match backwards. Requires -the contained parser to be a RegExp, RE, PlainText or Token parser.

-

EXPERIMENTAL

-
- -
-
-class NegativeLookbehind(parser: parse.Parser, name: str = '') → None[source]
-

Matches, if the contained parser would not match backwards. Requires -the contained parser to be a RegExp-parser.

-
-
-sign(bool_value) → bool[source]
-

Returns the value. Can be overridden to return the inverted bool.

-
- -
- -
-
-class Capture(parser: parse.Parser, name: str = '') → None[source]
-

Applies the contained parser and, in case of a match, saves the result -in a variable. A variable is a stack of values associated with the -contained parser’s name. This requires the contained parser to be named.

-
- -
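A sketch of how Capture and Pop can cooperate to match symmetric delimiters; the grammar below and its parser names are illustrative assumptions, not code from the library:

class QuotedBlock(Grammar):
    # assignment to a class variable makes delim a named parser, so
    # Capture stores each matched delimiter on the variable stack 'delim'
    delim = Capture(RegExp(r'"+'))
    content = RegExp(r'[^"]+')
    # Pop matches only the very same delimiter string and removes it
    # from the stack again
    root__ = delim + content + Pop(delim)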
-
-class Retrieve(symbol: parse.Parser, rfilter: Callable[List[str], str] = None, name: str = '') → None[source]
-

Matches if the following text starts with the value of a particular -variable. As a variable in this context means a stack of values, -the last value will be compared with the following text. It will not -be removed from the stack! (This is the difference between the -Retrieve and the Pop parser.) -The constructor parameter symbol determines which variable is -used.

-
-
-retrieve_and_match(text: DHParser.stringview.StringView) → Tuple[Union[DHParser.syntaxtree.Node, NoneType], DHParser.stringview.StringView][source]
-

Retrieves variable from stack through the filter function passed to -the class’ constructor and tries to match the variable’s value with -the following text. Returns a Node containing the value or None -accordingly.

-

This functionality has been moved from the __call__ method to an independent method to allow calling it from a subclass's __call__ method without triggering the parser guard a second time.

-
- -
- -
-
-class Pop(symbol: parse.Parser, rfilter: Callable[List[str], str] = None, name: str = '') → None[source]
-

Matches if the following text starts with the value of a particular -variable. As a variable in this context means a stack of values, -the last value will be compared with the following text. Other -than the Retrieve-parser, the Pop-parser removes the value -from the stack in case of a match.

-

The constructor parameter symbol determines which variable is -used.

-
- -
-
-class Forward[source]
-

Forward allows declaring a parser before it is actually defined. Forward declarations are needed for parsers that are recursively nested, e.g.:

-
class Arithmetic(Grammar):
    '''
    expression =  term  { ("+" | "-") term }
    term       =  factor  { ("*" | "/") factor }
    factor     =  INTEGER | "("  expression  ")"
    INTEGER    =  /\d+/~
    '''
    expression = Forward()
    INTEGER    = RE('\\d+')
    factor     = INTEGER | Token("(") + expression + Token(")")
    term       = factor + ZeroOrMore((Token("*") | Token("/")) + factor)
    expression.set(term + ZeroOrMore((Token("+") | Token("-")) + term))
    root__     = expression
-
-
-
-
-apply(func: Callable[_ForwardRef('Parser'), NoneType]) → bool[source]
-

Applies function func(parser) recursively to this parser and all -descendant parsers if any exist. The same function can never -be applied twice between calls of the reset()-method! -Returns True, if function has been applied, False if function -had been applied earlier already and thus has not been applied again.

-
- -
-
-set(parser: parse.Parser)[source]
-

Sets the parser to which the calls to this Forward-object -shall be delegated.

-
- -
- -
-
-

Module transform

-

Module transform contains the functions for transforming the -concrete syntax tree (CST) into an abstract syntax tree (AST).

-

As these functions are very generic, they can in principle be -used for any kind of tree transformations, not necessarily only -for CST -> AST transformations.

-
-
-TransformationDict
-

alias of typing.Dict

-
- -
-
-TransformationProc
-

alias of typing.Callable

-
- -
-
-ConditionFunc
-

alias of typing.Callable

-
- -
-
-KeyFunc
-

alias of typing.Callable

-
- -
-
-transformation_factory(t1=None, t2=None, t3=None, t4=None, t5=None)[source]
-

Creates factory functions from transformation-functions that -dispatch on the first parameter after the context parameter.

-

Decorating a transformation-function that has more than merely the -context-parameter with transformation_factory creates a -function with the same name, which returns a partial-function that -takes just the context-parameter.

-

Additionally, there is some syntactic sugar for transformation-functions that receive a collection as their second parameter and do not have any further parameters. In this case a list of parameters passed to the factory function will be converted into a collection.

-

Main benefit is readability of processing tables.

-

Usage:

-
@transformation_factory(AbstractSet[str])
def remove_tokens(context, tokens):
    ...
-
-
-

or, alternatively:

-
@transformation_factory
def remove_tokens(context, tokens: AbstractSet[str]):
    ...
-
-
-

Example:

-
trans_table = { 'expression': remove_tokens('+', '-') }
-
-
-

instead of:

-
trans_table = { 'expression': partial(remove_tokens, tokens={'+', '-'}) }
-
-
- --- - - - -
Parameters:t1 – type of the second argument of the transformation function, -only necessary if the transformation functions’ parameter list -does not have type annotations.
-
- -
-
-traverse(root_node: DHParser.syntaxtree.Node, processing_table: Dict[str, Union[typing.Sequence[typing.Callable], typing.Dict[str, typing.Sequence[typing.Callable]]]], key_func: Callable[DHParser.syntaxtree.Node, str] = <function key_tag_name>) → None[source]
-

Traverses the syntax tree starting with the given node depth-first and applies the sequences of callback-functions registered in the processing_table-dictionary.

-

The most important use case is the transformation of a concrete -syntax tree into an abstract tree (AST). But it is also imaginable -to employ tree-traversal for the semantic analysis of the AST.

-

In order to assign sequences of callback-functions to nodes, a -dictionary (“processing table”) is used. The keys usually represent -tag names, but any other key function is possible. There exist -three special keys:

-
    -
  • ‘+’: always called (before any other processing function)
  • -
  • ‘*’: called for those nodes for which no (other) processing -function appears in the table
  • -
  • ‘~’: always called (after any other processing function)
  • -
- --- - - - -
Parameters:
    -
  • root_node (Node) – The root-node of the syntax tree to be traversed
  • -
  • processing_table (dict) – node key -> sequence of functions that -will be applied to matching nodes in order. This dictionary -is interpreted as a compact_table. See -expand_table() or EBNFCompiler.EBNFTransTable()
  • -
  • key_func (function) – A mapping key_func(node) -> keystr. The default -key_func yields node.parser.name.
  • -
-
-

Example:

-
table = { "term": [replace_by_single_child, flatten],
          "factor, flowmarker, retrieveop": replace_by_single_child }
traverse(node, table)
-
-
-
- -
-
-is_named(context: List[DHParser.syntaxtree.Node]) → bool[source]
-

Returns True if the current node’s parser is a named parser.

-
- -
-
-replace_by_single_child(context: List[DHParser.syntaxtree.Node])[source]
-

Removes single branch node, replacing it by its immediate descendant. -Replacement only takes place, if the last node in the context has -exactly one child.

-
- -
-
-reduce_single_child(context: List[DHParser.syntaxtree.Node])[source]
-

Reduces a single branch node by transferring the result of its -immediate descendant to this node, but keeping this node’s parser entry. -Reduction only takes place if the last node in the context has -exactly one child.

-
- -
-
-replace_or_reduce(context: List[DHParser.syntaxtree.Node], condition: Callable = <function is_named>)[source]
-

Replaces node by a single child, if condition is met on child, -otherwise (i.e. if the child is anonymous) reduces the child.

-
- -
-
-replace_parser(context: List[DHParser.syntaxtree.Node], name: str)[source]
-

Replaces the parser of a Node with a mock parser with the given -name.

- --- - - - -
Parameters:
    -
  • context – the context where the parser shall be replaced
  • -
  • name – “NAME:PTYPE” of the surrogate. The ptype is optional
  • -
-
-
- -
-
-collapse(context: List[DHParser.syntaxtree.Node])[source]
-

Collapses all sub-nodes of a node by replacing them with the -string representation of the node.

-
- -
-
-merge_children(context: List[DHParser.syntaxtree.Node], tag_names: Tuple[str])[source]
-

Joins all children next to each other and with particular tag-names -into a single child node with a mock-parser with the name of the -first tag-name in the list.

-
- -
-
-replace_content(context: List[DHParser.syntaxtree.Node], func: Callable)[source]
-

Replaces the content of the node. func takes the node's result as an argument and returns the mapped result.

-
- -
-
-replace_content_by(context: List[DHParser.syntaxtree.Node], content: str)[source]
-

Replaces the content of the node with the given text content.

-
- -
-
-apply_if(context: List[DHParser.syntaxtree.Node], transformation: Callable, condition: Callable)[source]
-

Applies a transformation only if a certain condition is met.

-
- -
-
-apply_unless(context: List[DHParser.syntaxtree.Node], transformation: Callable, condition: Callable)[source]
-

Applies a transformation if a certain condition is not met.

-
- -
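In a processing table, these can be combined with other transformations via functools.partial and the documented signature, e.g. (a sketch):

from functools import partial

trans_table = {
    'term': [partial(apply_if,
                     transformation=reduce_single_child,
                     condition=is_anonymous)]
}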
-
-traverse_locally(context: List[DHParser.syntaxtree.Node], processing_table: Dict, key_func: Callable = <function key_tag_name>)[source]
-

Transforms the syntax tree starting from the last node in the context -according to the given processing table. The purpose of this function is -to apply certain transformations locally, i.e. only for those nodes that -have the last node in the context as their parent node.

-
- -
-
-is_anonymous(context: List[DHParser.syntaxtree.Node]) → bool[source]
-

Returns True if the current node’s parser is an anonymous parser.

-
- -
-
-is_whitespace(context: List[DHParser.syntaxtree.Node]) → bool[source]
-

Returns True if the current node is whitespace or a comment as defined with the @comment-directive.

-
- -
-
-is_empty(context: List[DHParser.syntaxtree.Node]) → bool[source]
-

Returns True if the current node’s content is empty.

-
- -
-
-is_expendable(context: List[DHParser.syntaxtree.Node]) → bool[source]
-

Returns True if the current node either is a node containing -whitespace or an empty node.

-
- -
-
-is_token(context: List[DHParser.syntaxtree.Node], tokens: AbstractSet[str] = frozenset()) → bool[source]
-

Checks whether the last node in the context has ptype == TOKEN_PTYPE and its content matches one of the given tokens. Leading and trailing whitespace-tokens will be ignored. In case an empty set of tokens is passed, any token is a match.

-
- -
-
-is_one_of(context: List[DHParser.syntaxtree.Node], tag_name_set: AbstractSet[str]) → bool[source]
-

Returns true, if the node’s tag_name is one of the given tag names.

-
- -
-
-has_content(context: List[DHParser.syntaxtree.Node], regexp: str) → bool[source]
-

Checks a node’s content against a regular expression.

-

In contrast to re.match the regular expression must match the complete -string and not just the beginning of the string to succeed!

-
- -
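E.g., a condition that accepts only nodes consisting entirely of digits (a sketch using the documented signature):

from functools import partial

# True only if the node's complete content is a run of digits
is_number = partial(has_content, regexp=r'\d+')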
-
-has_parent(context: List[DHParser.syntaxtree.Node], tag_name_set: AbstractSet[str]) → bool[source]
-

Checks whether a node with one of the given tag names appears somewhere -in the context before the last node in the context.

-
- -
-
-lstrip(context: List[DHParser.syntaxtree.Node], condition: Callable = <function is_expendable>)[source]
-

Recursively removes all leading child-nodes that fulfill a given condition.

-
- -
-
-rstrip(context: List[DHParser.syntaxtree.Node], condition: Callable = <function is_expendable>)[source]
-

Recursively removes all trailing child-nodes that fulfill a given condition.

-
- -
-
-strip(context: List[DHParser.syntaxtree.Node], condition: Callable = <function is_expendable>)[source]
-

Removes leading and trailing child-nodes that fulfill a given condition.

-
- -
-
-keep_children(context: List[DHParser.syntaxtree.Node], section: slice = slice(None, None, None))[source]
-

Keeps only child-nodes which fall into a slice of the result field.

-
- -
-
-keep_children_if(context: List[DHParser.syntaxtree.Node], condition: Callable)[source]
-

Keeps only those children for which condition() returns True.

-
- -
-
-keep_tokens(context: List[DHParser.syntaxtree.Node], tokens: AbstractSet[str] = frozenset())[source]
-

Keeps only those tokens from a particular set among the immediate descendants of a node, removing all other tokens. If tokens is the empty set, all tokens are kept.

-
- -
-
-keep_nodes(context: List[DHParser.syntaxtree.Node], tag_names: AbstractSet[str])[source]
-

Keeps only those children whose tag name is among the given tag names.

-
- -
-
-keep_content(context: List[DHParser.syntaxtree.Node], regexp: str)[source]
-

Keeps only those children whose string value matches the regular expression.

-
- -
-
-remove_children_if(context: List[DHParser.syntaxtree.Node], condition: Callable)[source]
-

Removes all children for which condition() returns True.

-
- -
-
-remove_nodes(context: List[DHParser.syntaxtree.Node], tag_names: AbstractSet[str])[source]
-

Removes children by tag name.

-
- -
-
-remove_content(context: List[DHParser.syntaxtree.Node], regexp: str)[source]
-

Removes children depending on their string value.

-
- -
-
-remove_tokens(context: List[DHParser.syntaxtree.Node], tokens: AbstractSet[str] = frozenset())[source]
-

Removes any among a particular set of tokens from the immediate -descendants of a node. If tokens is the empty set, all tokens -are removed.

-
- -
-
-flatten(context: List[DHParser.syntaxtree.Node], condition: Callable = <function is_anonymous>, recursive: bool = True)[source]
-

Flattens all children that fulfill the given condition (default: all unnamed children). Flattening means that wherever a node has child nodes, the child nodes are inserted in place of the node.

-

If the parameter recursive is True the same will recursively be -done with the child-nodes, first. In other words, all leaves of -this node and its child nodes are collected in-order as direct -children of this node.

-

Applying flatten recursively will result in these kinds of -structural transformation:

-
(1 (+ 2) (+ 3))    ->   (1 + 2 + 3)
-(1 (+ (2 + (3))))  ->   (1 + 2 + 3)
-
-
-
- -
-
-error_on(context: List[DHParser.syntaxtree.Node], condition: Callable, error_msg: str = '')[source]
-

Checks for condition; adds an error message if condition is not met.

-
- -
-
-warn_on(context: List[DHParser.syntaxtree.Node], condition: Callable, warning: str = '')[source]
-

Checks for condition; adds a warning message if condition is not met.

-
- -
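
The transformation functions above are usually not called directly, but collected in a transformation-table that is handed over to the traverse-function, which applies them while walking the tree depth-first. The following sketch merely illustrates the pattern; the node names “expression” and “term” as well as the token set are assumptions made up for this example:

from DHParser.transform import traverse, flatten, remove_tokens

# hypothetical table: maps tag names to lists of transformations
example_AST_transformation_table = {
    "expression": [remove_tokens("+", "-"), flatten],
    "term": [flatten],
}

def transform_AST(cst):
    # walks the tree depth-first and applies the listed
    # transformations to each node according to its tag name
    traverse(cst, example_AST_transformation_table)
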
-
-

Module compile

-

Module compile contains a skeleton class for syntax driven compilation support. Class Compiler can serve as base class for a compiler. Compiler objects are callable and receive the abstract syntax tree (AST) as argument and yield whatever output the compiler produces. In most Digital Humanities applications this will be XML-code. However, it can also be anything else, like binary code or, as in the case of DHParser’s EBNF-compiler, Python source code.

-

Function compile_source invokes all stages of the compilation -process, i.e. pre-processing, parsing, CST to AST-transformation -and compilation.

-

See module ebnf for a sample of the implementation of a -compiler object.

-
-
-exception CompilerError[source]
-

Exception raised when an error of the compiler itself is detected. -Compiler errors are not to be confused with errors in the source -code to be compiled, which do not raise Exceptions but are merely -reported as an error.

-
- -
-
-class Compiler(grammar_name='', grammar_source='')[source]
-

Class Compiler is the abstract base class for compilers. Compiler -objects are callable and take the root node of the abstract -syntax tree (AST) as argument and return the compiled code in a -format chosen by the compiler itself.

-

Subclasses implementing a compiler must define on_XXX()-methods for each node name that can occur in the AST, where ‘XXX’ is the node’s name (for unnamed nodes it is the node’s ptype without the leading colon ‘:’).

-

These compiler methods take the node on which they are run as argument. Other than in the AST transformation, which runs depth-first, compiler methods are called in top-down order, starting with the root node, and they are responsible for compiling the child nodes themselves. This should be done by invoking the compile(node)-method, which will pick the right on_XXX-method. It is not recommended to call the on_XXX-methods directly.

-
-
-context
-

A list of parent nodes that ends with the currently -compiled node.

-
- -
-
-grammar_name
-

The name of the grammar this compiler is related to

-
- -
-
-grammar_source
-

The source code of the grammar this compiler is -related to.

-
- -
-
-_dirty_flag
-

A flag indicating that the compiler has already been -called at least once and that therefore all compilation -variables must be reset when it is called again.

-
- -
-
-compile(node: DHParser.syntaxtree.Node) → Any[source]
-

Calls the compilation method for the given node and returns the -result of the compilation.

-

The method’s name is derived from either the node’s parser -name or, if the parser is anonymous, the node’s parser’s class -name by adding the prefix on_.

-

Note that compile does not call any compilation functions -for the parsers of the sub nodes by itself. Rather, this should -be done within the compilation methods.

-
- -
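
For illustration, a minimal compiler method might look like the following sketch (the node name “expression” and the XML-style output are assumptions made up for this example):

from DHParser.compile import Compiler

class ExampleCompiler(Compiler):
    def on_expression(self, node):
        # compile the child nodes explicitly; self.compile() dispatches
        # to the matching on_XXX-method for each child node
        inner = ''.join(str(self.compile(child)) for child in node.children)
        return '<expression>' + inner + '</expression>'
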
-
-fallback_compiler(node: DHParser.syntaxtree.Node) → Any[source]
-

This is a generic compiler function which will be called on -all those node types for which no compiler method on_XXX has -been defined.

-
- -
-
-static method_name(node_name: str) → str[source]
-

Returns the method name for node_name, e.g.:

-
>>> Compiler.method_name('expression')
-'on_expression'
-
-
-
- -
-
-set_grammar_name(grammar_name: str = '', grammar_source: str = '')[source]
-

Changes the grammar’s name and the grammar’s source.

-

The grammar name and the source text of the grammar are metadata about the grammar that do not affect the compilation process. Classes inheriting from Compiler can use this information to name and annotate their output. Returns self.

-
- -
- -
-
-compile_source(source: str, preprocessor: Union[typing.Callable[[str], typing.Union[str, typing.Tuple[str, typing.Union[typing.Callable[[int], int], functools.partial]]]], functools.partial, NoneType], parser: DHParser.parse.Grammar, transformer: Union[typing.Callable[[DHParser.syntaxtree.Node], typing.Any], functools.partial], compiler: compile.Compiler) → Tuple[[Any, List[DHParser.error.Error]], DHParser.syntaxtree.Node][source]
-

Compiles a source in four stages:
1. Pre-Processing (if needed)
2. Parsing
3. AST-transformation
4. Compiling.

-

The compilation stage is only invoked if no errors occurred in either of the two previous stages.

Parameters:
    -
  • source (str) – The input text for compilation or the name of a file containing the input text.
  • -
  • preprocessor (function) – text -> text. A preprocessor function -or None, if no preprocessor is needed.
  • -
  • parser (function) – A parsing function or grammar class
  • -
  • transformer (function) – A transformation function that takes -the root-node of the concrete syntax tree as an argument and -transforms it (in place) into an abstract syntax tree.
  • -
  • compiler (function) – A compiler function or compiler class -instance
  • -
-
-
-
Returns (tuple):
-
The result of the compilation as a 3-tuple (result, errors, abstract syntax tree). In detail:
1. The result as returned by the compiler or None in case of failure
2. A list of error or warning messages
3. The root-node of the abstract syntax tree
-
-
- -
-
-

Module error

-

Module error defines class Error and a few helpful functions that are -needed for error reporting of DHParser. Usually, what is of interest are -the string representations of the error objects. For example:

-
from DHParser import compile_source, has_errors
-
-result, errors, ast = compile_source(source, preprocessor, grammar,
-                                     transformer, compiler)
-if errors:
-    for error in errors:
-        print(error)
-
-    if has_errors(errors):
-        print("There have been fatal errors!")
-        sys.exit(1)
-    else:
-        print("There have been warnings, but no errors.")
-
-
-
-
-is_error(code: int) → bool[source]
-

Returns True, if the error code signifies an error, not just a warning.

-
- -
-
-is_warning(code: int) → bool[source]
-

Returns True, if the error code merely signifies a warning.

-
- -
-
-has_errors(messages: Iterable[error.Error], level: int = 1000) → bool[source]
-

Returns True, if at least one entry in messages has at -least the given error level.

-
- -
-
-only_errors(messages: Iterable[error.Error], level: int = 1000) → Iterator[error.Error][source]
-

Returns an Iterator that yields only those messages that have -at least the given error level.

-
- -
-
-linebreaks(text: Union[DHParser.stringview.StringView, str]) → List[int][source]
-

Returns a list of the indices of all line breaks in the text.

-
- -
-
-line_col(lbreaks: List[int], pos: int) → Tuple[int, int][source]
-

Returns the position within a text as (line, column)-tuple based -on a list of all line breaks, including -1 and EOF.

-
- -
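
A small sketch of how these two functions play together (the example text is made up):

from DHParser.error import linebreaks, line_col

text = "alpha\nbeta\ngamma"
lbreaks = linebreaks(text)    # indices of all line breaks, framed by -1 and EOF
# position 11 is the first character of "gamma", i.e. line 3, column 1
print(line_col(lbreaks, text.find("gamma")))    # -> (3, 1)
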
-
-adjust_error_locations(errors: List[error.Error], original_text: Union[DHParser.stringview.StringView, str], source_mapping: Union[typing.Callable[[int], int], functools.partial] = <function <lambda>>) → List[error.Error][source]
-

Adds (or adjusts) line and column numbers of error messages in place.

Parameters:
    -
  • errors – The list of errors as returned by the method -collect_errors() of a Node object
  • -
  • original_text – The source text on which the errors occurred. -(Needed in order to determine the line and column numbers.)
  • -
  • source_mapping – A function that maps error positions to their -positions in the original source file.
  • -
-
Returns:

The list of errors. (Returning the list of errors is just syntactical -sugar. Be aware that the line, col and orig_pos attributes have been -changed in place.)

-
-
- -
-
-
-

Domain Specific Language Modules Reference

-

DHParser contains additional support for domain specific languages. Module ebnf provides a self-hosting parser for EBNF-Grammars as well as an EBNF-compiler that compiles an EBNF-Grammar into a DHParser based Grammar class that can be executed to parse source text conforming to this grammar into concrete syntax trees.

-

Module dsl contains additional functions to support the compilation -of arbitrary domain specific languages (DSL).

-

One very indispensable part of the systematic construction of domain -specific languages is testing. DHParser supports unit testing of -smaller as well as larger components of the Grammar of a DSL.

-
-

Module ebnf

-

Module ebnf provides a self-hosting parser for EBNF-Grammars as well as an EBNF-compiler that compiles an EBNF-Grammar into a DHParser based Grammar class that can be executed to parse source text conforming to this grammar into concrete syntax trees.

-
-
-class EBNFGrammar(root: DHParser.parse.Parser = None) → None[source]
-

Parser for an EBNF source file, with this grammar:

-
# EBNF-Grammar in EBNF
-
-@ comment    = /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
-@ whitespace = /\s*/                            # whitespace includes linefeed
-@ literalws  = right                            # trailing whitespace of literals will be ignored tacitly
-
-syntax     = [~//] { definition | directive } §EOF
-definition = symbol §"=" expression
-directive  = "@" §symbol "=" ( regexp | literal | list_ )
-
-expression = term { "|" term }
-term       = { ["§"] factor }+                       # "§" means all following factors mandatory
-factor     = [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
-           | [flowmarker] literal
-           | [flowmarker] plaintext
-           | [flowmarker] regexp
-           | [flowmarker] whitespace
-           | [flowmarker] oneormore
-           | [flowmarker] group
-           | [flowmarker] unordered
-           | repetition
-           | option
-
-flowmarker = "!"  | "&"                         # '!' negative lookahead, '&' positive lookahead
-           | "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
-retrieveop = "::" | ":"                         # '::' pop, ':' retrieve
-
-group      = "(" §expression ")"
-unordered  = "<" §expression ">"                # elements of expression in arbitrary order
-oneormore  = "{" expression "}+"
-repetition = "{" §expression "}"
-option     = "[" §expression "]"
-
-symbol     = /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
-literal    = /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
-           | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
-plaintext  = /`(?:[^"]|\\")*?`/~                # like literal but does not eat whitespace
-regexp     = /~?\/(?:\\\/|[^\/])*?\/~?/~        # e.g. /\w+/, ~/#.*(?:\n|$)/~
-                                                # '~' is a whitespace-marker, if present leading or trailing
-                                                # whitespace of a regular expression will be ignored tacitly.
-whitespace = /~/~                               # implicit or default whitespace
-list_      = /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
-                                                # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an example
-EOF = !/./
-
-
-
- -
-
-exception EBNFCompilerError[source]
-

Error raised by EBNFCompiler class. (Not compilation errors -in the strict sense, see CompilationError in module dsl.py)

-
- -
-
-class EBNFCompiler(grammar_name='', grammar_source='')[source]
-

Generates a Parser from an abstract syntax tree of a grammar specified -in EBNF-Notation.

-

Instances of this class must be called with the root-node of the -abstract syntax tree from an EBNF-specification of a formal language. -The returned value is the Python-source-code of a Grammar class for -this language that can be used to parse texts in this language. -See classes parser.Compiler and parser.Grammar for more information.

-

Additionally, class EBNFCompiler provides helper methods to generate code-skeletons for a preprocessor, AST-transformation and full compilation of the formal language. These methods’ names start with the prefix gen_.

-
-
-current_symbols
-

During compilation, a list containing the root -node of the currently compiled definition as first element -and then the nodes of the symbols that are referred to in -the currently compiled definition.

-
- -
-
-rules
-

Dictionary that maps rule names to a list of Nodes that contain symbol-references in the definition of the rule. The first item in the list is the node of the rule-definition itself. Example:

-
-
alternative = a | b
-

Now [node.content for node in self.rules['alternative']] yields ['alternative = a | b', 'a', 'b']

-
- -
-
-symbols
-

A mapping of symbol names to their first usage (not -their definition!) in the EBNF source.

-
- -
-
-variables
-

A set of symbol names that are used with the Pop or Retrieve operator. Because the values of these symbols need to be captured, they are called variables. See test_parser.TestPopRetrieve for an example.

-
- -
-
-recursive
-

A set of symbols that are used recursively and -therefore require a Forward-operator.

-
- -
-
-definitions
-

A dictionary of definitions. Unlike rules, this maps the symbols to their compiled definienda.

-
- -
-
-deferred_tasks
-

A list of callables that is filled during compilation, but that will be executed only after compilation has finished. Typically, it contains semantic checks that require information that is only available upon completion of compilation.

-
- -
-
-root
-

The name of the root symbol.

-
- -
-
-directives
-

A dictionary of all directives and their default -values.

-
- -
-
-re_flags
-

A set of regular expression flags to be added to all -regular expressions found in the current parsing process

-
- -
-
-assemble_parser(definitions: List[Tuple[str, str]], root_node: DHParser.syntaxtree.Node) → str[source]
-

Creates the Python code for the parser after compilation of -the EBNF-Grammar

-
- -
-
-gen_compiler_skeleton() → str[source]
-

Returns Python-skeleton-code for a Compiler-class for the -previously compiled formal language.

-
- -
-
-gen_preprocessor_skeleton() → str[source]
-

Returns Python-skeleton-code for a preprocessor-function for -the previously compiled formal language.

-
- -
-
-gen_transformer_skeleton() → str[source]
-

Returns Python-skeleton-code for the AST-transformation for the -previously compiled formal language.

-
- -
-
-non_terminal(node: DHParser.syntaxtree.Node, parser_class: str, custom_args: List[str] = []) → str[source]
-

Compiles any non-terminal, where parser_class indicates the Parser class -name for the particular non-terminal.

-
- -
-
-verify_transformation_table(transtable)[source]
-

Checks for symbols that occur in the transformation-table but have -never been defined in the grammar. Usually, this kind of -inconsistency results from an error like a typo in the transformation -table.

-
- -
- -
-
-grammar_changed(grammar_class, grammar_source: str) → bool[source]
-

Returns True if grammar_class does not reflect the latest -changes of grammar_source

Parameters:
    -
  • grammar_class – the parser class representing the grammar -or the file name of a compiler suite containing the grammar
  • -
  • grammar_source – File name or string representation of the -EBNF code of the grammar
  • -
-
-
-
Returns (bool):
-
True, if the source text of the grammar is different from the -source from which the grammar class was generated
-
-
- -
-
-PreprocessorFactoryFunc
-

alias of typing.Callable

-
- -
-
-ParserFactoryFunc
-

alias of typing.Callable

-
- -
-
-TransformerFactoryFunc
-

alias of typing.Callable

-
- -
-
-CompilerFactoryFunc
-

alias of typing.Callable

-
- -
-
-

Module dsl

-

Module dsl contains various functions to support the -compilation of domain specific languages based on an EBNF-grammar.

-
-
-exception GrammarError(errors, grammar_src)[source]
-

Raised when (already) the grammar of a domain specific language (DSL) -contains errors.

-
- -
-
-exception CompilationError(errors, dsl_text, dsl_grammar, AST, result)[source]
-

Raised when a string or file in a domain specific language (DSL) -contains errors.

-
- -
-
-load_compiler_suite(compiler_suite: str) → Tuple[[Callable[Union[typing.Callable[[str], typing.Union[str, typing.Tuple[str, typing.Union[typing.Callable[[int], int], functools.partial]]]], functools.partial]], Callable[DHParser.parse.Grammar], Callable[Union[typing.Callable[[DHParser.syntaxtree.Node], typing.Any], functools.partial]]], Callable[DHParser.compile.Compiler]][source]
-

Extracts a compiler suite from file or string compiler_suite -and returns it as a tuple (preprocessor, parser, ast, compiler).

Returns:
-
4-tuple (preprocessor function, parser class, ast transformer function, compiler class)
-
-
-
- -
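
For example, a previously generated compiler-script could be loaded like this (the file name “poetryCompiler.py” is hypothetical):

from DHParser.dsl import load_compiler_suite

preprocessor, parser, transformer, compiler = \
    load_compiler_suite('poetryCompiler.py')
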
-
-compileDSL(text_or_file: str, preprocessor: Union[typing.Callable[[str], typing.Union[str, typing.Tuple[str, typing.Union[typing.Callable[[int], int], functools.partial]]]], functools.partial], dsl_grammar: Union[str, DHParser.parse.Grammar], ast_transformation: Union[typing.Callable[[DHParser.syntaxtree.Node], typing.Any], functools.partial], compiler: DHParser.compile.Compiler) → Any[source]
-

Compiles a text in a domain specific language (DSL) with an -EBNF-specified grammar. Returns the compiled text or raises a -compilation error.

Raises: CompilationError if any errors occurred during compilation
-
- -
-
-raw_compileEBNF(ebnf_src: str, branding='DSL') → DHParser.ebnf.EBNFCompiler[source]
-

Compiles an EBNF grammar file and returns the compiler object -that was used and which can now be queried for the result as well -as skeleton code for preprocessor, transformer and compiler objects.

Parameters:
    -
  • ebnf_src (str) – Either the file name of an EBNF grammar or -the EBNF grammar itself as a string.
  • -
  • branding (str) – Branding name for the compiler suite source -code.
  • -
-
Returns:

An instance of class ebnf.EBNFCompiler

-
Raises:

CompilationError if any errors occurred during compilation

-
-
- -
-
-compileEBNF(ebnf_src: str, branding='DSL') → str[source]
-

Compiles an EBNF source file and returns the source code of a -compiler suite with skeletons for preprocessor, transformer and -compiler.

Parameters:
    -
  • ebnf_src (str) – Either the file name of an EBNF grammar or -the EBNF grammar itself as a string.
  • -
  • branding (str) – Branding name for the compiler suite source -code.
  • -
-
Returns:

The complete compiler suite skeleton as Python source code.

-
Raises:

CompilationError if any errors occurred during compilation

-
-
- -
-
-grammar_provider(ebnf_src: str, branding='DSL') → DHParser.parse.Grammar[source]
-

Compiles an EBNF grammar and returns a grammar-parser provider -function for that grammar.

Parameters:
    -
  • ebnf_src (str) – Either the file name of an EBNF grammar or -the EBNF grammar itself as a string.
  • -
  • branding (str or bool) – Branding name for the compiler -suite source code.
  • -
-
Returns:

A provider function for a grammar object for texts in the -language defined by ebnf_src.

-
-
- -
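
A sketch of the typical usage: compile a small EBNF-grammar given as a string and parse a text with the resulting grammar object (grammar and sample text are made up for this example):

from DHParser.dsl import grammar_provider

provider = grammar_provider(r'''
    document = ~ { WORD } §EOF
    WORD     = /\w+/~
    EOF      = !/./
''')
parser = provider()    # the provider function yields a fresh Grammar object
syntax_tree = parser("Life is but a walking shadow")
print(syntax_tree.as_sxpr())
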
-
-compile_on_disk(source_file: str, compiler_suite='', extension='.xml') → Iterable[DHParser.error.Error][source]
-

Compiles a source file with a given compiler and writes the result to a file.

-

If no compiler_suite is given, it is assumed that the source file is an EBNF grammar. In this case the result will be a Python script containing a parser for that grammar as well as the skeletons for a preprocessor, AST transformation table, and compiler. If the Python script already exists, only the parser part of the script will be updated. (For this to work, the different parts need to be delimited by section marker blocks.) compile_on_disk() returns a list of error messages or an empty list if no errors occurred.

Parameters:
    -
  • source_file (str) – The file name of the source text to be -compiled.
  • -
  • compiler_suite (str) – The file name of the compiler suite -(usually ending with ‘Compiler.py’), with which the source -file shall be compiled. If this is left empty, the source -file is assumed to be an EBNF-Grammar that will be compiled -with the internal EBNF-Compiler.
  • -
  • extension (str) – The result of the compilation (if successful) -is written to a file with the same name but a different -extension than the source file. This parameter sets the -extension.
  • -
-
Returns:

A (potentially empty) list of error or warning messages.

-
-
- -
-
-recompile_grammar(ebnf_filename, force=False) → bool[source]
-

Re-compiles an EBNF-grammar if necessary, that is, if either no -corresponding ‘XXXXCompiler.py’-file exists or if that file is -outdated.

Parameters:
    -
  • ebnf_filename (str) – The filename of the ebnf-source of the -grammar. In case this is a directory and not a file, all -files within this directory ending with .ebnf will be -compiled.
  • -
  • force (bool) – If False (default), the grammar will only be -recompiled if it has been changed.
  • -
-
-
- -
-
-

Module testing

-

Module testing contains support for unit-testing domain specific -languages. Tests for arbitrarily small components of the Grammar can -be written into test files with ini-file syntax in order to test -whether the parser matches or fails as expected. It can also be -tested whether it produces an expected concrete or abstract syntax tree. -Usually, however, unexpected failure to match a certain string is the -main cause of trouble when constructing a context free Grammar.

-
-
-unit_from_configfile(config_filename)[source]
-

Reads grammar unit tests contained in a file in config file (.ini) -syntax.

Parameters: config_filename (str) – A config file containing Grammar unit-tests
Returns: A dictionary representing the unit tests.
-
- -
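
A grammar test file in .ini-syntax might, for instance, look like the following sketch (symbol and test names are made up; match-tests check that the parser matches, fail-tests that it does not):

[match:WORD]
M1: word
M2: one_word_with_underscores

[fail:WORD]
F1: two words
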
-
-unit_from_json(json_filename)[source]
-

Reads grammar unit tests from a json file.

-
- -
-
-unit_from_file(filename)[source]
-

Reads a grammar unit test from a file. The format of the file is -determined by the ending of its name.

-
- -
-
-get_report(test_unit)[source]
-

Returns a text-report of the results of a grammar unit test. The report -lists the source of all tests as well as the error messages, if a test -failed or the abstract-syntax-tree (AST) in case of success.

-

If an asterisk has been appended to the test name, then the concrete syntax tree will also be added to the report in this particular case.

-

The purpose of the latter is to help with constructing and debugging AST-transformations. It is better to switch the CST-output on and off with the asterisk marker when needed than to output the CST for all tests, which would unnecessarily bloat the test reports.

-
- -
-
-grammar_unit(test_unit, parser_factory, transformer_factory, report=True, verbose=False)[source]
-

Unit tests for a grammar-parser and ast transformations.

-
- -
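
A sketch of how a test unit can be loaded and run (the file name is hypothetical; the transformer used here is simply the identity function):

from DHParser.dsl import grammar_provider
from DHParser.testing import unit_from_file, grammar_unit

parser_factory = grammar_provider(r'''
    document = ~ { WORD } §EOF
    WORD     = /\w+/~
    EOF      = !/./
''')
transformer_factory = lambda: (lambda tree: tree)    # identity transformation
unit = unit_from_file('grammar_tests/01_test_word.ini')    # hypothetical path
grammar_unit(unit, parser_factory, transformer_factory, report=False)
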
-
-grammar_suite(directory, parser_factory, transformer_factory, fn_patterns=['*test*'], ignore_unknown_filetypes=False, report=True, verbose=True)[source]
-

Runs all grammar unit tests in a directory. A file is considered a test -unit, if it has the word “test” in its name.

-
- -
-
-reset_unit(test_unit)[source]
-

Resets the tests in test_unit by removing all results and error -messages.

-
- -
-
-runner(test_classes, namespace)[source]
-

Runs all or some selected Python unit tests found in the -namespace. To run all tests in a module, call -runner("", globals()) from within that module.

-

Unit tests are either classes whose name starts with “Test”, methods whose name starts with “test” contained in such classes, or functions whose name starts with “test”.

Parameters:
    -
  • tests – Either a string or a list of strings that contains the -names of test or test classes. Each test and, in the case -of a test class, all tests within the test class will be -run.
  • -
  • namespace – The namespace for running the test, usually -globals() should be used.
  • -
-
-

Example

-
-
class TestSomething:
    def setup(self):
        pass

    def teardown(self):
        pass

    def test_something(self):
        pass

if __name__ == "__main__":
    from DHParser.testing import runner
    runner("", globals())
-
-
- -
-
-
-

Supporting Modules Reference

-

Finally, DHParser comprises a number of “toolkit”-modules which define helpful functions and classes that are used at different places throughout the other DHParser-modules.

-
-

Module toolkit

-

Module toolkit contains utility functions that are needed across several of the other DHParser modules or that are just very generic so that they are best defined in a toolkit-module.

-
-
-escape_re(strg: str) → str[source]
-

Returns the string with all regular expression special characters escaped.

-
- -
-
-escape_control_characters(strg: str) → str[source]
-
-
Replaces all control characters (e.g. “\n”) in a string by their backslashed representation.
-
-
- -
-
-is_filename(strg: str) → bool[source]
-

Tries to guess whether string strg is a file name.

-
- -
-
-lstrip_docstring(docstring: str) → str[source]
-

Strips leading whitespace from a docstring.

-
- -
-
-load_if_file(text_or_file) → str[source]
-

Reads and returns content of a text-file if parameter -text_or_file is a file name (i.e. a single line string), -otherwise (i.e. if text_or_file is a multi-line string) -text_or_file is returned.

-
- -
-
-is_python_code(text_or_file: str) → bool[source]
-

Checks whether ‘text_or_file’ is python code or the name of a file that -contains python code.

-
- -
-
-md5(*txt)[source]
-

Returns the md5-checksum for txt. This can be used to test if -some piece of text, for example a grammar source file, has changed.

-
- -
-
-expand_table(compact_table: Dict) → Dict[source]
-

Expands a table by separating keywords that are tuples or strings containing comma separated words into single keyword entries with the same values. Returns the expanded table. Example:
>>> expand_table({"a, b": 1, ('d','e','f'):5, "c":3})
{'a': 1, 'b': 1, 'd': 5, 'e': 5, 'f': 5, 'c': 3}

-
- -
-
-compile_python_object(python_src, catch_obj_regex='')[source]
-

Compiles the python source code and returns the (first) object the name of which is matched by catch_obj_regex. If catch_obj_regex is the empty string, the namespace dictionary will be returned.

-
- -
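
A short sketch of its use:

from DHParser.toolkit import compile_python_object

src = "class MyGrammar:\n    pass\n"
grammar_class = compile_python_object(src, r'\w+Grammar$')
print(grammar_class.__name__)    # -> 'MyGrammar'
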
-
-smart_list(arg: Union[typing.Iterable, typing.Any]) → Union[typing.Sequence, typing.Set][source]
-

Returns the argument as list, depending on its type and content.

-

If the argument is a string, it will be interpreted as a list of comma separated values, trying ';', ',', ' ' as possible delimiters in this order, e.g.
>>> smart_list('1; 2, 3; 4')
['1', '2, 3', '4']
>>> smart_list('2, 3')
['2', '3']
>>> smart_list('a b cd')
['a', 'b', 'cd']

-

If the argument is a collection other than a string, it will be returned as is, e.g.
>>> smart_list((1, 2, 3))
(1, 2, 3)
>>> smart_list({1, 2, 3})
{1, 2, 3}

-

If the argument is another iterable than a collection, it will be converted into a list, e.g.
>>> smart_list(i for i in {1,2,3})
[1, 2, 3]

-

Finally, if none of the above is true, the argument will be wrapped in a list and returned, e.g.
>>> smart_list(125)
[125]

-
- -
-
-sane_parser_name(name) → bool[source]
-

Checks whether the given name is an acceptable parser name. Parser names must neither begin nor end with a double underscore ‘__’!

-
- -
-
-

Module log

-

Module log contains logging and debugging support for the -parsing process.

-

For logging functionality, the global variable LOGGING is defined which -contains the name of a directory where log files shall be placed. By -setting its value to the empty string “” logging can be turned off.

-

To read the directory name, the function log_dir() should be called rather than reading the variable LOGGING. log_dir() makes sure the directory exists and raises an error if a file with the same name already exists.

-

For debugging of the parsing process, the parsing history can be -logged and written to an html-File.

-

For ease of use, module log defines a context-manager logging to which either False (turn off logging), a log directory name or True for the default logging directory is passed as argument. The other components of DHParser check whether logging is on and write log files in the logging directory accordingly. Usually, this will be concrete and abstract syntax trees as well as the full and abbreviated parsing history.

-

Example:

-
from DHParser import compile_source, logging
-
-with logging("LOGS"):
-    result, errors, ast = compile_source(source, preprocessor, grammar,
-                                         transformer, compiler)
-
-
-
-
-log_dir() → str[source]
-

Creates a directory for log files (if it does not exist) and -returns its path.

-

WARNING: Any files in the log dir will eventually be overwritten. -Don’t use a directory name that could be the name of a directory -for other purposes than logging.

Returns: the name of the logging directory
-
- -
-
-logging(dirname='LOGS')[source]
-

Context manager. Log files within this context will be stored in the directory dirname. Logging is turned off, if dirname is the empty string.

Parameters: dirname – the name for the log directory or the empty string to turn logging off
-
- -
-
-is_logging() → bool[source]
-

Returns True, if logging is turned on.

-
- -
-
-logfile_basename(filename_or_text, function_or_class_or_instance) → str[source]
-

Generates a reasonable logfile-name (without extension) based on -the given information.

-
- -
-
-clear_logs(logfile_types=frozenset({'.ast', '.log', '.cst'}))[source]
-

Removes all logs from the log-directory and removes the -log-directory if it is empty.

-
- -
-
-class HistoryRecord(call_stack: List[_ForwardRef('Parser')], node: DHParser.syntaxtree.Node, text: DHParser.stringview.StringView) → None[source]
-

Stores debugging information about one completed step in the -parsing history.

-

A parsing step is “completed” when the last one of a nested sequence of parser-calls returns. The call stack including the last parser call will be frozen in the HistoryRecord-object. In addition, a reference to the generated leaf node (if any) will be stored and the result status of the last parser call, which is either MATCH, FAIL (i.e. no match) or ERROR.

-
-
-class Snapshot(line, column, stack, status, text)
-
-
-column
-

Alias for field number 1

-
- -
-
-line
-

Alias for field number 0

-
- -
-
-stack
-

Alias for field number 2

-
- -
-
-status
-

Alias for field number 3

-
- -
-
-text
-

Alias for field number 4

-
- -
- -
-
-as_csv_line() → str[source]
-

Returns history record formatted as a csv table row.

-
- -
-
-as_html_tr() → str[source]
-

Returns history record formatted as an html table row.

-
- -
-
-as_tuple() → log.Snapshot[source]
-

Returns history record formatted as a snapshot tuple.

-
- -
-
-static last_match(history: List[_ForwardRef('HistoryRecord')]) → Union[_ForwardRef('HistoryRecord'), NoneType][source]
-

Returns the last match from the parsing-history. -:param history: the parsing-history as a list of HistoryRecord objects

Returns: the history record of the last match or None if either the history is empty or no parser could match
-
- -
-
-static most_advanced_match(history: List[_ForwardRef('HistoryRecord')]) → Union[_ForwardRef('HistoryRecord'), NoneType][source]
-

Returns the closest-to-the-end-match from the parsing-history. -:param history: the parsing-history as a list of HistoryRecord objects

Returns: the history record of the closest-to-the-end-match or None if either the history is empty or no parser could match
-
- -
- -
-
-log_ST(syntax_tree, log_file_name)[source]
-

Writes an S-expression-representation of the syntax_tree to a file, -if logging is turned on.

-
- -
-
-log_parsing_history(grammar, log_file_name: str = '', html: bool = True) → None[source]
-

Writes a log of the parsing history of the most recently parsed document.

Parameters:
    -
  • grammar (Grammar) – The Grammar object from which the parsing history -shall be logged.
  • -
  • log_file_name (str) – The (base-)name of the log file to be written. -If no name is given (default), then the class name of the grammar -object will be used.
  • -
  • html (bool) – If true (default), the log will be output as an html-table, otherwise as plain text. (Browsers might take a few seconds or minutes to display the table for long histories.)
  • -
-
-
- -
-
-

Module stringview

-

StringView provides string-slicing without copying. -Slicing Python-strings always yields copies of a segment of the original -string. See: https://mail.python.org/pipermail/python-dev/2008-May/079699.html -However, this becomes costly (in terms of space and as a consequence also -time) when parsing longer documents. Unfortunately, Python’s memoryview -does not work for unicode strings. Hence, the StringView class.

-

It is recommended to compile this module with the Cython-compiler for a speedup. The module comes with a stringview.pxd that contains some type declarations to fully exploit the potential of the Cython-compiler.

-
-
-class StringView
-

A rudimentary StringView class, just enough for the use cases -in parse.py. The difference between a StringView and the python -builtin strings is that StringView-objects do slicing without -copying, i.e. slices are just a view on a section of the sliced -string.

-
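
A brief sketch of the difference from built-in strings:

from DHParser.stringview import StringView

sv = StringView('Life is but a walking shadow')
first_word = sv[:4]       # no copy is made; first_word is just a view
print(str(first_word))    # -> 'Life'; only str() copies the viewed segment
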
-
-count
-

Returns the number of non-overlapping occurrences of substring -sub in StringView S[start:end]. Optional arguments start and end -are interpreted as in slice notation.

-
- -
-
-find
-

Returns the lowest index in S where substring sub is found, -such that sub is contained within S[start:end]. Optional -arguments start and end are interpreted as in slice notation. -Returns -1 on failure.

-
- -
-
-finditer
-

Executes regex.finditer on the StringView object and returns the iterator of match objects. Keep in mind that match.end(), match.span() etc. are mapped to the underlying text, not the StringView-object!
-
- -
-
-index
-

Converts an index for a string watched by a StringView object to an index relative to the string view object, e.g.:
>>> import re
>>> sv = StringView('xxIxx')[2:3]
>>> match = sv.match(re.compile('I'))
>>> match.end()
3
>>> sv.index(match.end())
1

-
- -
-
-indices
-

Converts indices for a string watched by a StringView object to indices relative to the string view object. See also: index()

-
- -
-
-lstrip
-

Returns a copy of self with leading whitespace removed.

-
- -
-
-match
-

Executes regex.match on the StringView object and returns the result, which is either a match-object or None. Keep in mind that match.end(), match.span() etc. are mapped to the underlying text, not the StringView-object!
-
- -
-
-replace
-

Returns a string where old is replaced by new.

-
- -
-
-rfind
-

Returns the highest index in S where substring sub is found, -such that sub is contained within S[start:end]. Optional -arguments start and end are interpreted as in slice notation. -Returns -1 on failure.

-
- -
-
-rstrip
-

Returns a copy of self with trailing whitespace removed.

-
- -
-
-search
-

Executes regex.search on the StringView object and returns the result, which is either a match-object or None. Keep in mind that match.end(), match.span() etc. are mapped to the underlying text, not the StringView-object!
-
- -
-
-split
-

Returns a list of the words in self, using sep as the -delimiter string. If sep is not specified or is None, any -whitespace string is a separator and empty strings are -removed from the result.

-
- -
-
-startswith
-

Return True if S starts with the specified prefix, False otherwise. -With optional start, test S beginning at that position. -With optional end, stop comparing S at that position. -prefix can also be a tuple of strings to try.

-
- -
-
-strip
-

Returns a copy of the StringView self with leading and trailing -whitespace removed.

-
- -
- -
-
-

Module versionnumber

-
-
-
\ No newline at end of file
diff --git a/documentation_sources/ModuleReference.rst b/documentation/ModuleReference.rst
similarity index 100%
rename from documentation_sources/ModuleReference.rst
rename to documentation/ModuleReference.rst
diff --git a/documentation/ReferenceManual.html b/documentation/ReferenceManual.html
deleted file mode 100644
index 24fb9bb2aed7c5c3a0d6e6a299dd04fe917d3033..0000000000000000000000000000000000000000
--- a/documentation/ReferenceManual.html
+++ /dev/null
@@ -1,342 +0,0 @@
-

DHParser Reference Manual

-

This reference manual explains the technology used by DHParser. It is intended for people who would like to extend or contribute to DHParser. The reference manual does not explain how a Domain Specific Language (DSL) is developed (see the User’s Manual for that). It explains the technical approach that DHParser employs for parsing, abstract syntax tree transformation and compilation of a given DSL. And it describes the module and class structure of the DHParser software. The programming guide requires a working knowledge of Python programming and a basic understanding of common parser technology from the reader. Also, it is recommended to read the introduction and the user’s guide first.

-
-

Fundamentals

-

DHParser is a parser generator aimed at, but not restricted to, the creation of domain specific languages in the Digital Humanities (DH), hence the name “DHParser”. In the Digital Humanities, DSLs allow entering annotated texts or data in a human-friendly and readable form with a text editor. In contrast to the prevailing XML-approach, the DSL-approach distinguishes between a human-friendly editing data format and a machine-friendly working data format, which can be XML but does not need to be. Therefore, the DSL-approach requires an additional step to reach the working data format, that is, the compilation of the annotated text or data written in the DSL (editing data format) to the working data format. In the following, a text or data file written in a DSL will simply be called document. The editing data format will also be called source format and the working data format be denoted as target format.

-

Compiling a document specified in a domain specific language involves the following steps:

-
  1. Parsing the document, which results in a representation of the document as a concrete syntax tree.
  2. Transforming the concrete syntax tree (CST) into an abstract syntax tree (AST), i.e. a streamlined and simplified syntax tree ready for compilation.
  3. Compiling the abstract syntax tree into the working data format.
-

All of these steps are carried out by the computer without any user intervention, i.e. without the need of humans to rewrite or enrich the data during any of these steps. A DSL-compiler therefore consists of three components which are applied in sequence: a parser, a transformer and a compiler. Creating, i.e. programming these components is the task of compiler construction. The creation of all of these components is supported by DHParser, albeit to a different degree:

-
  1. Creating a parser: DHParser fully automates parser generation. Once the syntax of the DSL is formally specified, it can be compiled into a Python class that is able to parse any document written in the DSL. DHParser uses Parsing-Expression-Grammars in a variant of the Extended-Backus-Naur-Form (EBNF) for the specification of the syntax. (See examples/EBNF/EBNF.ebnf for an example.)
  2. Specifying the AST-transformations: DHParser supports the AST-transformation with a depth-first tree traversal algorithm (see DHParser.transform.traverse) and a number of stock transformation functions which can also be combined. Most of the AST-transformation is specified in a declarative manner by filling in a transformation-dictionary which associates the node-types of the concrete syntax tree with such combinations of transformations. See DHParser.ebnf.EBNF_AST_transformation_table as an example.
  3. Filling in the compiler class skeleton: Compiler generation cannot be automated like parser generation. It is supported by DHParser merely by generating a skeleton of a compiler class with a method-stub for each definition (or “production” as the definitions are sometimes also called) of the EBNF-specification. (See examples/EBNF/EBNFCompiler.py.) If the target format is XML, there is a chance that the XML can simply be generated by serializing the abstract syntax tree as XML without the need of a dedicated compilation step.
-
-
-

Compiler Creation Workflow

-

TODO: Describe:
- setting up a new project
- invoking the DSL compiler
- conventions and data types
- the flat namespace of DHParser

-
-
-

Component Guide

-
-

Parser

-

Parser creation is supported by DHParser through an EBNF-to-Python compiler which yields a working Python class that parses any document of the EBNF-specified DSL into a tree of Node-objects, which are instances of the class Node defined in DHParser/syntaxtree.py.

-

The EBNF-to-Python compiler is actually a DSL-compiler that has been crafted with DHParser itself. It is located in DHParser/ebnf.py. The formal specification of the EBNF variant used by DHParser can be found in examples/EBNF/EBNF.ebnf. Comparing the automatically generated examples/EBNF/EBNFCompiler.py with DHParser/ebnf.py can give you an idea what additional work is needed to create a DSL-compiler from an autogenerated DSL-parser. In most DH-projects this task will be less complex, however, as the target format is XML, which usually can be derived from the abstract syntax tree with fewer steps than the Python code in the case of DHParser’s EBNF-to-Python compiler.

-
-
-

AST-Transformation

-

Unlike the compiler generation (see the next point below), a functional rather than object-oriented approach has been employed, because it allows for a more concise specification of the AST-transformation, since typically the same combination of transformations can be used for several node types of the AST. It would therefore be tedious to fill in a method for each of these. In a sense, the specification of the AST-transformation constitutes an “internal DSL” realized with the means of the Python language itself.

-
-
-

Compiler

-
-
-
-

Module Structure of DHParser

-
-
-

Class Hierarchy of DHParser

-
-
\ No newline at end of file
diff --git a/documentation_sources/ReferenceManual.rst b/documentation/ReferenceManual.rst
similarity index 100%
rename from documentation_sources/ReferenceManual.rst
rename to documentation/ReferenceManual.rst
diff --git a/documentation/StepByStepGuide.html b/documentation/StepByStepGuide.html
deleted file mode 100644
index 58d5206bfc25f18466ae5d97b3a76cd8e3efcf93..0000000000000000000000000000000000000000
--- a/documentation/StepByStepGuide.html
+++ /dev/null
@@ -1,1024 +0,0 @@
-

DHParser’s Step by Step Guide

-

This step by step guide goes through the whole process of designing and testing a domain specific notation from the very start. (The terms “domain specific notation” and “domain specific language” are used interchangeably in the following. Both will be abbreviated by “DSL”, however.) We will design a simple domain specific notation for poems as a teaching example. On the way we will learn:

-
  1. how to set up a new DHParser project
  2. how to use the test driven development approach to designing a DSL
  3. how to describe the syntax of a DSL with the EBNF-notation
  4. how to specify the transformations for converting the concrete syntax tree that results from parsing a text denoted in our DSL into an abstract syntax tree that represents or comes close to representing our data model.
  5. how to write a compiler that transforms the abstract syntax tree into a target representation which might be a html page, epub or printable pdf in the case of typical Digital-Humanities-projects.
-
-

Setting up a new DHParser project

-

Since DHParser, while quite mature in terms of implemented features, is still -in a pre-first-release state, it is for the time being more recommendable to -clone the most current version of DHParser from the git-repository rather than -installing the packages from the Python Package Index (PyPI).

-

This section takes you from cloning the DHParser git repository to setting up a new DHParser-project in the experimental-subdirectory and testing whether the setup works. Similarly to current web development practices, most of the work with DHParser is done from the shell. In the following, we assume a Unix (Linux) environment. The same can most likely be done on other operating systems in a very similar way, but there might be subtle differences.

-
-

Installing DHParser from the git repository

-

In order to install DHParser from the git repository, open up a shell window -and type:

-
$ git clone git@gitlab.lrz.de:badw-it/DHParser.git
-$ cd DHParser
-
-
-

The second command changes to the DHParser directory. Within this directory -you should recognise the following subdirectories and files. There are more -files and directories for sure, but those will not concern us for now:

-
DHParser/            - the DHParser python packages
-documentation/       - DHParser's documentation in html-form
-documentation_source - DHParser's documentation in reStructuredText-Format
-examples/            - some examples for DHParser (mostly incomplete)
-experimental/        - an empty directory for experimenting
-test/                - DHParser's unit-tests
-dhparser.py          - DHParser's command line tool for setting up projects
-README.md            - General information about DHParser
-LICENSE.txt          - DHParser's license. It's open source (hooray!)
-Introduction.md      - An introduction and appetizer for DHParser
-
-
-

In order to verify that the installation works, you can simply run the “dhparser.py” script and, when asked, choose “3” for the self-test. If the self-test runs through without error, the installation has succeeded.

-
-
-

Starting a new DHParser project

-

In order to set up a new DHParser project, you run the dhparser.py-script with the name of the new project. For the sake of the example, let’s type:

-
$ python dhparser.py experimental/poetry
-$ cd experimental/poetry
-
-
-

This creates a new DHParser-project with the name “poetry” in a directory with the same name within the subdirectory “experimental”. This new directory contains the following files:

-
README.md           - a stub for a readme-file explaining the project
-poetry.ebnf         - a trivial demo grammar for the new project
-example.dsl         - an example file written in this grammar
-tst_poetry_grammar.py - a python script ("test-script") that re-compiles
-                        the grammar (if necessary) and runs the unit tests
-grammar_tests/01_test_word.ini     - a demo unit test
-grammar_tests/02_test_document.ini - another unit test
-
-
-

Now, if you look into the file “example.dsl” you will find that it contains a simple sequence of words, namely “Life is but a walking shadow”. In fact, the demo grammar that comes with a newly created project is nothing but a simple grammar for sequences of words separated by whitespace. Now, since we already have unit tests, our first exercise will be to run the unit tests by starting the script “tst_poetry_grammar.py”:

-
$ python tst_poetry_grammar.py
-
-
-

This will run through the unit-tests in the grammar_tests directory and print -their success or failure on the screen. If you check the contents of your -project directory after running the script, you might notice that there now -exists a new file “poetryCompiler.py” in the project directory. This is an -auto-generated compiler-script for our DSL. You can use this script to compile -any source file of your DSL, like “example.dsl”. Let’s try:

-
$ python poetryCompiler.py example.dsl
-
-
-

The output is a block of pseudo-XML, looking like this:

-
<document>
-  <:ZeroOrMore>
-    <WORD>
-      <:RegExp>Life</:RegExp>
-      <:Whitespace> </:Whitespace>
-    </WORD>
-    <WORD>
-      <:RegExp>is</:RegExp>
-      <:Whitespace> </:Whitespace>
-    </WORD>
- ...
-
-
-

Now, this does not look too helpful yet, partly because it is cluttered with all sorts of seemingly superfluous pseudo-XML-tags like “&lt;:ZeroOrMore&gt;”. However, you might notice that it contains the original sequence of words “Life is but a walking shadow” in a structured form, where each word is (among other things) surrounded by &lt;WORD&gt;-tags. In fact, the output of the compiler script is a pseudo-XML-representation of the concrete syntax tree of our “example.dsl”-document according to the grammar specified in “poetry.ebnf” (which we haven’t looked into yet, but we will do so soon).

-

If you see the pseudo-XML on screen, the setup of the new DHParser-project -has been successful.

-
-
-

Understanding how compilation of DSL-documents with DHParser works

-

Generally speaking, the compilation process consists of three stages:

-
  1. Parsing a document. This yields a concrete syntax tree (CST) of the document.
  2. Transforming. This transforms the CST into the much more concise abstract syntax tree (AST) of the document.
  3. Compiling. This turns the AST into anything you’d like, for example, an XML-representation or a relational database record.
-

Now, DHParser can fully automate the generation of a parser from a syntax-description in EBNF-form, like our “poetry.ebnf”, but it cannot automate the transformation from the concrete into the abstract syntax tree (which for the sake of brevity we will simply call “AST-transformation” in the following), and neither can it automate the compilation of the abstract syntax tree into something more useful. Therefore, the AST-transformation in the autogenerated compile-script is simply left empty, while the compiling stage simply converts the syntax tree into a pseudo-XML-format.

-

The latter two stages have to be coded into the compile-script by hand, with the support of templates within this script. If the grammar of the DSL is changed - as it will be frequently during the development of a DSL - the parser-part of this script will be regenerated by the testing-script before the unit tests are run. The script will notice if the grammar has changed. This also means that the parser part of this script will be overwritten and should never be edited by hand. The other two stages can and should be edited by hand. Stubs for these parts of the compile-script will only be generated if the compile-script does not yet exist, that is, on the very first calling of the test-script.

-

Usually, if you have adjusted the grammar, you will want to run the unit tests -anyway. Therefore, the regeneration of the parser-part of the compile-script -is triggered by the test-script.

-
-
-

The development workflow for DSLs

-

When developing a domain specific notation it is recommendable to first develop the grammar and the parser for that notation, then to specify the abstract syntax tree transformations and finally to implement the compiler. Of course one can always come back and change the grammar later. But in order to avoid revising the AST-transformations and the compiler time and again, it helps if the grammar has been worked out before. A bit of interlocking between these steps does not hurt, though.

-

A reasonable workflow for developing the grammar proceeds like this:

-
  1. Set out by writing down a few example documents for your DSL. It is advisable to start with a few simple examples that use only a subset of the intended features of your DSL.
  2. Next you sketch a grammar for your DSL that is just rich enough to capture those examples.
  3. Right after sketching the grammar you should write test cases for your grammar. The test cases can be small parts or snippets of your example documents. You could also use your example documents as test cases, but usually the test cases should have a smaller granularity to make locating errors easier.
  4. Next, you should run the test script. Usually, some tests will fail at the first attempt. So you’ll keep revising the EBNF-grammar, adjusting and adding test cases until all tests pass.
  5. Now it is time to try and compile the example documents. By this time the test-script should have generated the compile-script, which can be called with the example documents. Don’t worry too much about the output, yet. What is important at this stage is merely whether the parser can handle the examples or not. If not, further test cases and adjustments of the EBNF grammar will be needed - or a revision of the examples in case you decide to use different syntactic constructs.

     If all examples can be parsed, you go back to step one and add further, more complex examples, and continue to do so until you have the feeling that your DSL’s grammar is rich enough for all intended application cases.
-

Let’s try this with the trivial demo example that comes with creating a new project with the “dhparser.py”-script. Now, you have already seen that the “example.dsl”-document merely contains a simple sequence of words: “Life is but a walking shadow”. Now, wouldn’t it be nice if we could end this sequence with a full stop to turn it into a proper sentence. So, open “example.dsl” with a text editor and add a full stop:

-
Life is but a walking shadow.
-
-
-

Now, try to compile “example.dsl” with the compile-script:

-
$ python poetryCompiler.py example.dsl
-example.dsl:1:29: Error: EOF expected; ".\n " found!
-
-
-

Since the grammar, obviously, did not allow full stops so far, the parser returns an error message. The error message is pretty self-explanatory in this case. (Often, you will unfortunately find that the error messages are somewhat difficult to decipher. In particular, because it so happens that an error the parser complains about is just the consequence of an error made at an earlier location that the parser may not have been able to recognize as such. We will learn more about how to avoid such situations later.) EOF is actually the name of a parser that captures the end of the file, thus “EOF”! But instead of the expected end of file an, as of now, unparsable construct, namely a full stop followed by a line feed, signified by “\n”, was found.

-

Let’s have look into the grammar description “poetry.ebnf”. We ignore the -beginning of the file, in particular all lines starting with “@” as these -lines do not represent any grammar rules, but meta rules or so-called -“directives” that determine some general characteristics of the grammar, such -as whitespace-handling or whether the parser is going to be case-sensitive. -Now, there are exactly three rules that make up this grammar:

-
document = ~ { WORD } §EOF
-WORD     =  /\w+/~
-EOF      =  !/./
-
-
-

EBNF-grammars describe the structure of a domain specific notation in top-down fashion. Thus, the first rule in the grammar describes the components out of which a text or document in the domain specific notation is composed as a whole. The following rules then break down the components into even smaller components until, finally, there are only atomic components left which are described by matching rules. Matching rules are rules that do not refer to other rules any more. They consist of string literals or regular expressions that “capture” the sequences of characters which form the atomic components of our DSL. Rules in general always consist of a symbol on the left hand side of a “=”-sign (which in this context can be understood as a definition signifier) and the definition of the rule on the right hand side.

-
-

Note

-

Traditional parser technology for context-free grammars often distinguishes two phases, scanning and parsing, where a lexical scanner would take a stream of characters and yield a sequence of tokens, and the actual parser would then operate on the stream of tokens. DHParser, however, is an instance of a scannerless parser where the functionality of the lexical scanner is seamlessly integrated into the parser. This is done by allowing regular expressions in the definiens of grammar symbols. The regular expressions do the work of the lexical scanner.

-

Theoretically, one could do without scanners or regular expressions. Because regular languages are a subset of context-free languages, parsers for context-free languages can do all the work that regular expressions can do. But it makes things easier - and, in the case of DHParser, also faster - to have them.
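If you wonder how a parser can get by without a separate lexical scanner, the following toy sketch may help. It is emphatically not DHParser’s actual implementation, merely an illustration of the principle: a regular expression is wrapped up as a terminal “parser”, and such terminal parsers can be combined with other parsers directly, so that no separate tokenization phase is needed.

import re

def regexp(pattern):
    """Turns a regular expression into a terminal parser. A parser here
    is simply a function that takes the remaining text and returns a
    pair (matched string or None, rest of the text)."""
    rx = re.compile(pattern)
    def parse(text):
        m = rx.match(text)
        return (m.group(), text[m.end():]) if m else (None, text)
    return parse

word = regexp(r'\w+\s*')       # plays the role of  WORD = /\w+/~

def zero_or_more(parser):      # plays the role of  { ... }
    def parse(text):
        results = []
        node, rest = parser(text)
        while node is not None:
            results.append(node)
            node, rest = parser(rest)
        return results, rest
    return parse

print(zero_or_more(word)("Life is but a walking shadow"))
# (['Life ', 'is ', 'but ', 'a ', 'walking ', 'shadow'], '')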

-
-

In our case the text as a whole, conveniently named “document” (any other name would be allowed, too), consists of a leading whitespace and a possibly empty sequence of an arbitrary number of words that ends only when the end of file has been reached. Whitespace in DHParser-grammars is always denoted by a tilde “~”. Thus the definiens of the rule “document” starts with a “~” on the right hand side of the definition sign (“=”). Next, you find the symbol “WORD” enclosed in braces. “WORD”, like any symbol composed of letters in DHParser, refers to another rule further below that defines what words are. The meaning of the braces is that whatever is enclosed by them may be repeated zero or more times. Thus the expression “{ WORD }” describes a sequence of arbitrarily many repetitions of WORD, whatever WORD may be. Finally, EOF refers to yet another rule defined further below. We do not yet know what EOF is, but we know that when the sequence of words ends, it must be followed by an EOF. The paragraph sign “§” in front of EOF means that it is absolutely mandatory that the sequence of WORDs is followed by an EOF. If it isn’t, the program issues an error message. Without the “§”-sign the parser simply would not match, which in itself is not considered an error.

-

Now, let’s look at our two matching rules. Both of these rules contain regular expressions. If you do not know about regular expressions yet, you should head over to an explanation or tutorial on regular expressions, like https://docs.python.org/3/library/re.html, before continuing, because we are not going to discuss them here. In DHParser-grammars regular expressions are enclosed by simple forward slashes “/”. Everything between two forward slashes is a regular expression as it would be understood by Python’s “re”-module. Thus the rule WORD = /\w+/~ means that a word consists of a sequence of letters, numbers or underscores “_” that must be at least one character long. This is what the regular expression “\w+” inside the slashes means. In regular expressions, “\w” stands for word-characters and “+” means that the previous element can be repeated one or more times. The tilde “~” following the regular expression we already know. It means that a word can be followed by whitespace. Strictly speaking, that whitespace is part of “WORD” as it is defined here.
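If you want to convince yourself of what /\w+/ captures, you can try the pattern with Python’s “re”-module directly. The trailing tilde for optional whitespace is DHParser-specific, so in this little check it is emulated with \s*:

import re

WORD = re.compile(r'\w+\s*')    # \s* stands in for DHParser's optional ~

print(WORD.match('shadow. ').group())  # 'shadow' - the dot is no word-character
print(WORD.match('...'))               # None - no word-character at the start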

-

Similarly, the EOF (for “end of file”) symbol is defined by a rule that consists of a simple regular expression, namely “.”. The dot in regular expressions means “any character”. However, the regular expression itself is preceded by an exclamation mark “!”. In DHParser-grammars, the exclamation mark means “not”. Therefore the whole rule means that no character must follow. Since this is true only for the end of the file, the parser looking for EOF will only match if the very end of the file has been reached.
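The effect of the not-predicate can be emulated in pure Python with a negative lookahead in a regular expression. This is just an illustration, not how DHParser implements it; note the re.DOTALL-flag, without which the dot would stop short of newline characters:

import re

EOF = re.compile(r'(?!.)', re.DOTALL)   # succeeds only if no character follows

print(bool(EOF.match('x')))        # False - a character still lies ahead
print(bool(EOF.match('')))         # True  - the end of the input is reached
print(bool(EOF.match('a\n', 1)))   # False - the newline still lies ahead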

-

Now, what would be the easiest way to allow our sequence of words to be ended like a real sentence with a dot “.”? As always when defining grammars, one can think of different choices to implement this requirement in our grammar. One possible solution is to add a dot-literal before the “§EOF”-component at the end of the definition of the “document”-rule. So let’s do that. Change the line where the “document”-rule is defined to:

-
document = ~ { WORD } "." §EOF
-
-
-

As you can see, string-literals are simply denoted as strings between inverted commas in DHParser’s variant of the EBNF-grammar. Now, before we can compile the file “example.dsl”, we will have to regenerate our parser, because we have changed the grammar. In order to recompile, we simply run the test-script again:

-
$ python tst_poetry_grammar.py
-
-
-

But what is that? A whole lot of error messages? Well, this is not surprising: because we changed the grammar, some of our old test-cases fail with the new grammar. So we will have to update our test-cases accordingly. (Actually, the grammar gets compiled nevertheless, and we could just ignore the test failures and carry on with compiling our “example.dsl”-file again. But, for this time, we’ll follow good practice and adjust the test cases.) So open the file with the failing tests, “grammar_tests/02_test_document.ini”, in the editor and add full stops at the end of the “match”-cases and remove the full stop at the end of the “fail”-case:

-
[match:document]
-M1: """This is a sequence of words
-    extending over several lines."""
-M2: """  This sequence contains leading whitespace."""
-
-[fail:document]
-F1: """This test should fail, because neither
-    comma nor full stop have been defined anywhere"""
-
-
-

The format of the test-files should be pretty self-explanatory. It is a simple ini-file, where the section markers hold the name of the grammar-rule to be tested, preceded by either “match” or “fail”. “match” means that the following examples should be matched by the grammar-rule; “fail” means they should not match. It is just as important that a grammar-rule does not match those strings it should not match as it is that it matches those strings that it should match. The individual test-cases all get a name, in this case M1, M2, F1, but if you prefer more meaningful names this is also possible. (Beware, however, that the names for the match-tests must differ from the names for the fail-tests for the same rule!) Now, run the test-script again and you’ll see that no errors get reported any more.
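Since the test files are plain ini-files, you can, just for illustration, read them with Python’s standard configparser-module. (DHParser brings its own test-runner; this sketch merely serves to make the structure of the format explicit.)

from configparser import ConfigParser

# The test file from above, reconstructed as a string:
ini_text = '''
[match:document]
M1: """This is a sequence of words
    extending over several lines."""
M2: """  This sequence contains leading whitespace."""

[fail:document]
F1: """This test should fail, because neither
    comma nor full stop have been defined anywhere"""
'''

cfg = ConfigParser()
cfg.optionxform = str                   # keep the case of test names like 'M1'
cfg.read_string(ini_text)
for section in cfg.sections():          # 'match:document' and 'fail:document'
    kind, rule = section.split(':')
    for name, snippet in cfg[section].items():
        print(kind, rule, name, repr(snippet[:25]))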

-

Finally, we can recompile our “example.dsl”-file, and by its XML output we can tell that it worked:

-
$ python poetryCompiler.py example.dsl
-
-
-

So far, we have seen in nuce how the development workflow for building up a DSL-grammar goes. Let’s take this a step further by adding more capabilities to our grammar.

-
-
-

Extending the example DSL further

-

A grammar that can only digest single sentences is certainly rather boring. So we’ll extend our grammar a little further so that it can capture paragraphs of sentences. To see where we are heading, let’s first start a new example file, let’s call it “macbeth.dsl”, and enter the following lines:

-
Life’s but a walking shadow, a poor player that struts and frets his hour
-upon the stage and then is heard no more. It is a tale told by an idiot,
-full of sound and fury, signifying nothing.
-
-
-

What have we got there? We’ve got a paragraph that consists of several sentences, each of which ends with a full stop. The sentences themselves can consist of different parts which are separated by a comma. If, so far, we have got a clear idea (in verbal terms) of the structure of texts in our DSL, we can now try to formulate this in the grammar:

-
-
document = ~ { sentence } §EOF
-sentence = part {"," part } "."
-part     = { WORD }        # a subtle mistake, right here!
-WORD     = /\w+/~          # something forgotten, here!
-EOF      = !/./
-

The most important new part is the grammar rule “sentence”. It reads like this: a sentence is a part of a sentence, potentially followed by a repeated sequence of a comma and another part of a sentence, and ultimately ending with a full stop. (Understandable? If you have ever read Russell’s “Introduction to Mathematical Philosophy” you will be used to this kind of prose. Other than that, I find the formal definition easier to understand. However, for learning EBNF or any other formalism, it helps in the beginning to translate the meaning of its statements into plain old English.)

-

There are two subtle mistakes in this grammar. If you can figure them out just by thinking about it, feel free to correct the grammar right now. (Would you really have noticed the mistakes if they hadn’t already been marked in the code above?) For all less intelligent people, like me: let’s be prudent and - since the grammar has become more complex - add a few test cases. This should make it easier to locate any errors. So open up an editor with a new file in the tests subdirectory, say grammar_tests/03_test_sentence.ini (test files should always contain the component “test_” in the filename, otherwise they will be overlooked by DHParser’s unit testing subsystem), and enter a few test-cases like these:

-
[match:sentence]
-M1: """It is a tale told by an idiot,
-   full of sound and fury, signifying nothing."""
-M2: """Plain old sentence."""
-
-[fail:sentence]
-F1: """Ups, a full stop is missing"""
-F2: """No commas at the end,."""
-
-
-

Again, we recompile the grammar and run the tests at the same time by running the testing-script:

-
$ python tst_poetry_grammar.py
-Errors found by unit test "03_test_sentence.ini":
-Fail test "F2" for parser "sentence" yields match instead of expected failure!
-
-
-

Too bad, something went wrong here. But what? Didn’t the definition of the rule “sentence” make sure that parts of sentences are, if at all, only followed by a sequence of a comma and another part of a sentence? So how come that between the last comma and the full stop there is nothing but empty space? Ah, there’s the rub! If we look into our grammar, at how parts of sentences have been defined, we find the rule:

-
part = { WORD }
-
-
-

defines a part of a sentence as a sequence of zero or more WORDs. This means that a string of length zero also counts as a valid part of a sentence. In order to avoid this, we could write:

-
part = WORD { WORD }
-
-
-

This definition makes sure that there is at least one WORD in a part. Since the case that at least one item is needed occurs rather frequently in grammars, DHParser offers a special syntax for this case:

-
part = { WORD }+
-
-
-

(The plus sign “+” must always follow directly after the curly brace “}” without any whitespace in between, otherwise DHParser won’t understand it.) At this point the worry may arise that the same problem could reoccur at another level, if the rule for WORD matched empty strings as well. Let’s quickly add a test case for this to the file grammar_tests/01_test_word.ini:

-
[fail:WORD]
-F1: two words
-F2: ""
-
-
-

Thus, we are sure to be warned in case the definition of rule “WORD” matches the empty string. Luckily, it does not do so now. But if we change this definition again later for some reason, we might have forgotten about this subtlety and introduce the same error again. With a test case we can reduce the risk of such a regression error. This time the tests run through nicely. So let’s try the parser on our new example:

-
$ python poetryCompiler.py macbeth.dsl
-macbeth.dsl:1:1: Error: EOF expected; "Life’s but" found!
-
-
-

That is strange. Obviously, there is an error right at the beginning (line 1, column 1). But what could possibly be wrong with the word “Life”? Now, you might already have guessed what the error is and that the error is not exactly located in the first column of the first line.

-

Unfortunately, DHParser - like almost any other parser out there - is not always very good at spotting the exact location of an error. Because rules refer to other rules, a rule may fail to parse - or, what is just as bad, succeed to parse when it should indeed fail - as a consequence of an error in the definition of one of the rules it refers to. But this means that if the rule for the whole document fails to match, the actual error can be located anywhere in the document! There are different approaches to dealing with this problem. A tool that DHParser offers is to write log-files that document the parsing history. The log-files allow you to spot the location where the parsing error occurred. However, you will have to look for the error manually. A good starting point is usually either the end of the parsing process or the point where the parser reached farthest into the text. In order to receive the parsing history, you need to run the compiler-script again with the debugging option:

-
$ python poetryCompiler.py macbeth.dsl
-
-
-

You will receive the same error messages as before, but this time various kinds of debugging information have been written into a newly created subdirectory “LOGS”. (Beware that any files in the “LOGS” directory may be overwritten or deleted by any of the DHParser scripts upon the next run! So don’t store any important data there.) The most interesting file in the “LOGS”-directory is the full parser log. We’ll ignore the other files and just open the file “macbeth_full_parser.log.html” in an internet-browser. As the parsing history tends to become quite long, this usually takes a while, but luckily not in the case of our short demo example:

-
$ firefox LOGS/macbeth_full_parser.log.html &
-
-
[image: parsing_history.png - the parsing history displayed in the browser]

What you see is a representation of the parsing history. It might look a bit tedious in the beginning, especially the column that contains the parser call sequence. But it is all very straightforward: for every application of a match rule, there is a row in the table. Typically, match rules are applied at the end of a long sequence of parser calls that is displayed in the third column. You will recognise the parsers that represent rules by their names, e.g. “document”, “sentence” etc. Those parsers that merely represent constructs of the EBNF grammar within a rule do not have a name and are represented by their type, which always begins with a colon, like “:ZeroOrMore”. Finally, the regular expression or literal parsers are represented by the regular expression pattern or the string literal themselves. (Arguably, it can be confusing that parsers are represented in three different ways in the parser call sequence. I am still figuring out a better way to display the parser call sequence. Any suggestions welcome!) The first two columns display the position in the text in terms of lines and columns. The second to last column, labeled “success”, shows whether the last parser in the sequence matched, failed or produced an error. In case of an error, the error message is displayed in the third column as well. In case the parser matched, the last column displays exactly that section of the text that the parser did match. If the parser did not match, the last column displays the text that still lies ahead and has not yet been parsed.

-

In our concrete example, we can see that the parser “WORD” matches “Life”, but not “Life’s” or “’s”. And this ultimately leads to the failure of the parsing process as a whole. The simplest solution would be to add the apostrophe to the list of allowed characters in a word by changing the respective line in the grammar definition to WORD = /[\w’]+/~. Now, before we even change the grammar we first add another test case to capture this kind of error. Since we have decided that “Life’s” should be parsed as a single word, let’s open the file “grammar_tests/01_test_word.ini” and add the following test:

-
[match:WORD]
-M3: Life’s
-
-
-

To be sure that the new test captures the error we have found you might want -to run the script “tst_poetry_grammar.py” and verify that it reports the -failure of test “M3” in the suite “01_test_word.ini”. After that, change the -regular expression for the symbol WORD in the grammar file “poetry.ebnf” as -just described. Now both the tests and the compilation of the file -“macbeth.dsl” should run through smoothly.
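You can verify the effect of the changed regular expression directly in a Python shell:

import re

old_word = re.compile(r'\w+')
new_word = re.compile(r'[\w’]+')

print(old_word.match('Life’s').group())   # 'Life' - stops at the apostrophe
print(new_word.match('Life’s').group())   # 'Life’s' - apostrophe included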

-
-

Caution

-

Depending on the purpose of your DSL, the simple solution of allowing apostrophes within words might not be what you want. After all, “Life’s” is but a shorthand for the two-word phrase “Life is”. Now, whatever alternative solution comes to your mind, be aware that there are also cases like Irish names, say “O’Dolan”, where the apostrophe is actually part of a word, and cases like “don’t” which, if expanded, would be two words not separated at the position of the apostrophe.

-

We leave that as an exercise: first, to figure out what different cases for the use of apostrophes in the middle of a word exist; secondly, to make a reasonable decision which of these should be treated as a single word and which as separate words; and finally, if possible, to write a grammar that provides for these cases. These steps are quite typical for the kind of challenges that occur during the design of a DSL for a Digital-Humanities-project.

-
-
-
-

Controlling abstract-syntax-tree generation

-

Compiling the example “macbeth.dsl” with the command python poetryCompiler.py macbeth.dsl, you might find yourself unable to avoid the impression that the output is rather verbose. Just looking at the beginning of the output, we find:

-
<document>
-    <:ZeroOrMore>
-        <sentence>
-            <part>
-                <WORD>
-                    <:RegExp>Life’s</:RegExp>
-                    <:Whitespace> </:Whitespace>
-                </WORD>
-                <WORD>
-                    <:RegExp>but</:RegExp>
-                    <:Whitespace> </:Whitespace>
-                </WORD>
-...
-
-
-

But why do we need to know all those details? Why would we need a “:ZeroOrMore” element inside the “<document>” element, if the “<sentence>”-elements could just as well be direct descendants of the “<document>”-element? Why do we need the information that “Life’s” has been captured by a regular expression parser? Wouldn’t it suffice to know that the word captured is “Life’s”? And is the whitespace really needed at all? If the words in a sequence are by definition separated by whitespace, then it would suffice to have the words without whitespace in our tree, and to add whitespace only later when transforming the tree into some kind of output format. (On the other hand, it might be convenient to have it in the tree nevertheless…)

-

Well, the answer to most of these questions is that what our compilation script yields is more or less the output that the parser yields, which in turn is the concrete syntax tree of the parsed text. Being a concrete syntax tree, it is by its very nature very verbose, because it captures every minute syntactic detail described in the grammar and found in the text, no matter how irrelevant it is if we are primarily interested in the structure of our text. In order for our tree to become handier, we have to transform it into an abstract syntax tree first, which is called thus because it abstracts from all details that seem irrelevant to us. Now, which details we consider irrelevant is almost entirely up to ourselves. And we should think carefully about which features must be included in the abstract syntax tree, because the abstract syntax tree more or less reflects the data model (or is at most one step away from it) with which we want to capture our material.

-

For the sake of our example, let’s assume that we are not interested in whitespace and that we want to get rid of all uninformative nodes, i.e. nodes that merely mark syntactic structures but not semantic entities.

-

DHParser supports the transformation of the concrete syntax tree (CST) into the abstract syntax tree (AST) with a simple technology that (in theory) allows to specify the necessary transformations in an almost declarative fashion: you simply fill in a Python-dictionary of tag-names with transformation operators. Technically, these operators are simply Python-functions. DHParser comes with a rich set of predefined operators. Should these not suffice, you can easily write your own. What does this look like?

-
poetry_AST_transformation_table = {
-    "+": remove_empty,
-    "document": [],
-    "sentence": [],
-    "part": [],
-    "WORD": [],
-    "EOF": [],
-    ":Token, :RE": reduce_single_child,
-    "*": replace_by_single_child
-}
-
-
-

You’ll find this table in the script poetryCompiler.py, which is also the place where you edit it, because then it is automatically used when compiling your DSL-sources. Now, the AST-transformation works as follows: the whole tree is scanned, starting at the deepest level, applying the specified operators and then working its way upward. This means that the operators specified for “WORD”-nodes will be applied before the operators of “part”-nodes and “sentence”-nodes. This has the advantage that when a particular node is reached, the transformations for its descendant nodes have already been applied.
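The following sketch illustrates this traversal strategy. It is much simplified compared to DHParser’s actual implementation and uses a hypothetical minimal node class whose result is either a string (for leaf nodes) or a list of child nodes; the entries of the table are assumed to be plain lists of operators:

class Node:
    """Hypothetical stand-in for DHParser's node class."""
    def __init__(self, tag_name, result):
        self.tag_name = tag_name
        self.result = result        # a string or a list of child nodes

    @property
    def children(self):
        return self.result if isinstance(self.result, list) else []

def traverse(node, table):
    for child in node.children:                        # descend first ...
        traverse(child, table)
    ops = table.get(node.tag_name, table.get('*', []))
    for op in table.get('+', []) + ops:                # '+' always runs first
        op(node)                                       # ... then transform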

-

As you can see, the transformation-table contains an entry for every known parser, i.e. “document”, “sentence”, “part”, “WORD”, “EOF”. (If any of these are missing in the table of your poetryCompiler.py, add them now!) In the template you’ll also find transformations for two anonymous parsers, i.e. “:Token” and “:RE”, as well as some curious entries such as “*” and “+”. The latter are considered to be “jokers”. The transformations related to the “+”-sign will be applied on any node, before any other transformation is applied. In this case, all empty nodes will be removed first (transformation: remove_empty). The “*”-joker contains a list of transformations that will be applied to all those tags that have not been entered explicitly into the transformation table. For example, if the transformation reaches a node with the tag-name “:ZeroOrMore” (i.e. an anonymous node that has been generated by the parser “:ZeroOrMore”), the “*”-joker-operators will be applied. In this case it is just one transformation, namely replace_by_single_child, which replaces a node that has but one child by its child. In contrast, the transformation reduce_single_child eliminates a single child node by attaching the child’s children or content directly to the parent node. We’ll see what this means and how this works shortly.
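To see the difference between the two operators, here is what they do, sketched with the same toy node class as above. (DHParser’s real operators receive a context, i.e. the list of nodes leading from the root to the current node, rather than a single node.)

def replace_by_single_child(node):
    # <a><b>text</b></a>  ->  <b>text</b>  (the parent gives way to its child)
    if len(node.children) == 1:
        child = node.children[0]
        node.tag_name, node.result = child.tag_name, child.result

def reduce_single_child(node):
    # <a><b>text</b></a>  ->  <a>text</a>  (the child is merged into its parent)
    if len(node.children) == 1:
        node.result = node.children[0].result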

-
-

Caution

-

Once the compiler-script “xxxxCompiler.py” has been generated, the only part that is changed after editing and extending the grammar is the parser-part of this script (i.e. the class derived from class Grammar), because this part is completely auto-generated and can therefore be overwritten safely. The other parts of that script, including the AST-transformation-dictionary, are never changed again once they have been generated, because they need to be filled in by hand by the designer of the DSL, and the hand-made changes should not be overwritten. They are therefore left as they are when regenerating the parser. However, this means that if you add symbols to your grammar later, you will not find them as keys in the AST-transformation-table, but will have to add them yourself.

-

The comments in the compiler-script clearly indicate which parts can safely be edited by hand, i.e. without running the risk of being overwritten, and which cannot.

-
-

We can either specify no operator (empty list), a single operator or a list of operators for transforming a node. There is a difference between specifying an empty list for a particular tag-name and leaving out a tag-name completely. In the latter case the “*”-joker is applied, in place of the missing list of operators. In the former case only the “+”-joker is applied. If a list of operators is specified, these operators will be applied in sequence, one after the other. We also call the list of operators (or the single operator, if there is only one) the transformation for a particular tag (or parser name or parser type, for that matter).
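In terms of the sketch above, the three cases can be summed up as follows (remove_empty is roughly re-sketched here as well):

def remove_empty(node):                  # rough sketch of the predefined operator
    if node.children:
        node.result = [child for child in node.children if child.result]

table = {
    '+': [remove_empty],                 # applied to every node, first
    'part': [],                          # explicit empty list: only '+' applies
    'WORD': [reduce_single_child],       # explicit transformation
    '*': [replace_by_single_child],      # fallback for tags not in the table
}
# Because 'part' occurs as a key, the '*'-joker does NOT apply to <part>-nodes;
# a tag left out of the table entirely falls through to '*'.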

-

Because the AST-transformation works through the tree from the inside to the outside, it is reasonable to proceed in the same order when designing the AST-transformations. The innermost nodes that concern us are the nodes captured by the <WORD>-parser, or simply, <WORD>-nodes. As we can see, these nodes usually contain a <:RegExp>-node and a <:Whitespace>-node. As the “WORD” parser is defined as a simple regular expression followed by optional whitespace in our grammar, we know that this must always be the case, although the whitespace may occasionally be empty. Thus, we can eliminate the uninformative child nodes by removing whitespace first and then reducing the single leftover child node. The respective line in the AST-transformation-table in the compiler-script should be changed as follows:

-
"WORD": [remove_whitespace, reduce_single_child],
-
-
-

Running the “poetryCompiler.py”-script on “macbeth.dsl” again, yields:

-
<document>
-  <:ZeroOrMore>
-    <sentence>
-      <part>
-        <WORD>Life’s</WORD>
-        <WORD>but</WORD>
-        <WORD>a</WORD>
-        <WORD>walking</WORD>
-        <WORD>shadow</WORD>
-      </part>
-      <:Series>
-        <:Token>
-          <:PlainText>,</:PlainText>
-          <:Whitespace> </:Whitespace>
-        </:Token>
-        <part>
-          <WORD>a</WORD>
-...
-
-
-

It starts to become more readable and concise, but there are still some oddities. Firstly, the tokens that delimit parts of sentences still contain whitespace. Secondly, if several <part>-nodes follow each other in a <sentence>-node, the <part>-nodes after the first one are enclosed by a <:Series>-node or even a cascade of <:ZeroOrMore>- and <:Series>-nodes. As for the <:Token>-nodes, we can do the same trick as with the WORD-nodes:

-
":Token": [remove_whitespace, reduce_single_child],
-":RE": reduce_single_child,
-
-
-

As to the nested structure of the <part>-nodes within the <sentence>-node, this is a rather typical case of syntactic artefacts that can be found in concrete syntax trees. It is obviously a consequence of the grammar definition:

-
sentence = part {"," part } "."
-
-
-

We’d of course prefer to have a flat structure of parts and punctuation marks following each other within the sentence. Since this is a standard case, DHParser includes a special operator to “flatten” nested structures of this kind:

-
"sentence" = [flatten],
-
-
-

The flatten operator recursively eliminates all intermediary anonymous child nodes. We do not need to do anything in particular for transforming the <part>-nodes, except that we should explicitly assign an empty operator-list to “part”, because we do not want the “*”-joker to step in. The reason is that a <part> with a single <WORD> should still be visible as a part and not be replaced by the <WORD>-node, because we would like our data model to have as regular a form as possible. (This does of course imply a decision we have taken on the form of our data model, which would lead too far to discuss here. Suffice it to say that, depending on the occasion and purpose, such decisions can also be taken otherwise.)
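In terms of our toy node class, the flatten operator can be sketched like this. (DHParser’s real flatten is configurable as to which nodes count as expendable; in this sketch, every anonymous node, i.e. every node whose tag-name starts with a colon, that has children of its own is spliced into its parent.)

def flatten(node):
    # Recursively replace anonymous (':...'-tagged) intermediary nodes by
    # their children, so that e.g. the <part>-nodes hidden inside the
    # <:Series>- and <:ZeroOrMore>-nodes become direct children of <sentence>.
    if node.children:
        flat = []
        for child in node.children:
            if child.tag_name.startswith(':') and child.children:
                flatten(child)
                flat.extend(child.children)
            else:
                flat.append(child)
        node.result = flat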

-

The only kind of nodes left are the <document>-nodes. In the output of the compiler-script (see above), the <document>-node had a single child node “:ZeroOrMore”. Since this child node does not have any particular semantic meaning, it would be reasonable to eliminate it and attach its children directly to “document”. We could do so by entering reduce_single_child in the list of transformations for “document”-nodes. However, when designing the AST-transformations, it is important not only to consider the concrete output that a particular text yields, but all possible outputs. Therefore, before specifying a transformation, we should also take a careful look at the grammar again, where “document” is defined as follows:

-
document = ~ { sentence } §EOF
-
-
-

As we can see, a “document”-node may also contain whitespace and an EOF-marker. The reason why we don’t find these in the output is that empty nodes have been eliminated by the remove_empty-transformation specified in the “+”-joker before. While EOF is always empty (little exercise: explain why!), there could be “:Whitespace”-nodes next to the zero or more sentences in the document node, in which case the reduce_single_child-operator would do nothing, because there would be more than a single child. (We could of course also use the “flatten”-operator instead. Try this as an exercise.) Test cases help to capture those different scenarios, so adding test cases and examining the output in the test report helps to get a grip on this, if just looking at the grammar strains your imagination too much.

-

Since we have decided that we do not want to include whitespace in our data model, we can simply eliminate any whitespace before we apply the reduce_single_child-operator, so we change the “document”-entry in the AST-transformation-table thus:

-
"document": [remove_whitespace, reduce_single_child],
-
-
-

Now that everything is set, let’s have a look at the result:

-
<document>
-  <sentence>
-    <part>
-      <WORD>Life’s</WORD>
-      <WORD>but</WORD>
-      <WORD>a</WORD>
-      <WORD>walking</WORD>
-      <WORD>shadow</WORD>
-    </part>
-    <:Token>,</:Token>
-    <part>
-      <WORD>a</WORD>
-      <WORD>poor</WORD>
-      <WORD>player</WORD>
-...
-
-
-

That is much better. There is but one slight blemish in the output: while all nodes left are named nodes, i.e. nodes associated with a named parser, there are a few anonymous <:Token>-nodes. Here is a little exercise: do away with those <:Token>-nodes by replacing them with something semantically more meaningful. Hint: add a new symbol “delimiter” to the grammar definition “poetry.ebnf”. An alternative strategy to extending the grammar would be to use the replace_parser operator. Which of the two strategies is the better one? Explain why.

-
-
-
diff --git a/documentation_sources/StepByStepGuide.rst b/documentation/StepByStepGuide.rst
similarity index 88%
rename from documentation_sources/StepByStepGuide.rst
rename to documentation/StepByStepGuide.rst
index c911bfa7f07883f0fbdc144ed38b85dd04de9845..bde4ca0bfbc48b927eacc41b8ecc00ab70261601 100644
diff --git a/documentation/UserGuide.html b/documentation/UserGuide.html
deleted file mode 100644
index f19bcb7044d1a740ec9a96ca0f2a8bb77c9d6b8c..0000000000000000000000000000000000000000

DHParser User’s Guide

-

This user’s guide explains how to create, test and employ a domain specific language with DHParser for encoding text or data in a Digital Humanities project.

-
-

Introduction

-

Most Digital Humanities projects, or at least most text-centered DH projects, involve in some way or other the entering and encoding of annotated text or data into a computer. And the systems that scientists use for that purpose consist of an input surface (or “redaction system”) for entering the data, a storage system to keep the data, and a presentation system for providing the data, and possibly also functionality for working with the data, to human or machine recipients. A typical example of this type of system is Berlin’s Ediarum-System, which consists of an XML-editor for entering data, an XML-database for storing the data and a web application for providing the data to human readers or other web services via an application programming interface (API). Ediarum is also typical because, like many DH-projects, it assumes an XML-based workflow.

-
-
diff --git a/documentation_sources/UserGuide.rst b/documentation/UserGuide.rst
similarity index 100%
rename from documentation_sources/UserGuide.rst
rename to documentation/UserGuide.rst
diff --git a/documentation/_modules/DHParser/syntaxtree.html b/documentation/_modules/DHParser/syntaxtree.html
deleted file mode 100644
index 3ff81b315bafa3a83e2b2d5ba288238ee150da1e..0000000000000000000000000000000000000000

Source code for DHParser.syntaxtree

-# syntaxtree.py - syntax tree classes for DHParser
-#
-# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
-#                 Bavarian Academy of Sciences and Humanities (badw.de)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-# implied.  See the License for the specific language governing
-# permissions and limitations under the License.
-
-
-"""
-Module ``syntaxtree`` defines the ``Node``-class for syntax trees as well
-as an abstract base class for parser-objects. The latter is defined
-here, because node-objects refer to parser-objects. All concrete
-parser classes are defined in the ``parse`` module.
-"""
-
-
-import collections.abc
-from collections import OrderedDict
-import copy
-
-from DHParser.error import Error, linebreaks, line_col
-from DHParser.stringview import StringView
-from DHParser.toolkit import re, typing
-from typing import Callable, cast, Iterator, List, AbstractSet, Set, Union, Tuple, Optional
-
-
-__all__ = ('ParserBase',
-           'WHITESPACE_PTYPE',
-           'PLAINTEXT_PTYPE',
-           'TOKEN_PTYPE',
-           'MockParser',
-           'ZombieParser',
-           'ZOMBIE_PARSER',
-           'ZOMBIE_NODE',
-           'Node',
-           'RootNode',
-           'parse_sxpr',
-           'parse_xml',
-           'flatten_sxpr',
-           'flatten_xml')
-
-
-#######################################################################
-#
-# parser base and mock parsers
-#
-#######################################################################
-
-
-class ParserBase:
-    """
-    ParserBase is the base class for all real and mock parser classes.
-    It is defined here, because Node objects require a parser object
-    for instantiation.
-    """
-    def __init__(self, name=''):  # , pbases=frozenset()):
-        self._name = name  # type: str
-        self._ptype = ':' + self.__class__.__name__  # type: str
-
-    def __repr__(self):
-        return self.name + self.ptype
-
-    def __str__(self):
-        return self.name + (' = ' if self.name else '') + repr(self)
-
-    def __call__(self, text: StringView) -> Tuple[Optional['Node'], StringView]:
-        return None, text
-
-    @property
-    def name(self):
-        """Returns the name of the parser or the empty string '' for unnamed
-        parsers."""
-        return self._name
-
-    @property
-    def ptype(self) -> str:
-        """Returns the type of the parser. By default this is the parser's
-        class name preceded by a colon, e.g. ':ZeroOrMore'."""
-        return self._ptype
-
-    @property
-    def repr(self) -> str:
-        """Returns the parser's name if it has a name and repr()"""
-        return self.name if self.name else repr(self)
-
-    def reset(self):
-        """Resets any parser variables. (Should be overridden.)"""
-        pass
-
-    def grammar(self) -> Optional[object]:
-        """Returns the Grammar object to which the parser belongs. If not
-        yet connected to any Grammar object, None is returned."""
-        return None
-
-    def apply(self, func: Callable) -> bool:
-        """Applies the function `func` to the parser. Returns False, if
-        - for whatever reason - the functions has not been applied, True
-        otherwise."""
-        return False
-
-
-WHITESPACE_PTYPE = ':Whitespace'
-PLAINTEXT_PTYPE = ':PlainText'
-TOKEN_PTYPE = ':Token'
-
-
-class MockParser(ParserBase):
-    """
-    MockParser objects can be used to reconstruct syntax trees from a
-    serialized form like S-expressions or XML. Mock objects can mimic
-    different parser types by assigning them a ptype on initialization.
-
-    Mock objects should not be used for anything other than
-    syntax tree (re-)construction. In all other cases where a parser
-    object substitute is needed, chose the singleton ZOMBIE_PARSER.
-    """
-    def __init__(self, name='', ptype=''):  # , pbases=frozenset()):
-        assert not ptype or ptype[0] == ':'
-        super().__init__(name)
-        self._ptype = ptype or ':' + self.__class__.__name__
-
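As an editorial aside, a minimal sketch of how a MockParser can be used to build nodes by hand, e.g. in tests (the names `word` and `n` are illustrative, not part of DHParser):

    # mimic a named RegExp-parser; a non-empty ptype must start with a colon
    word = MockParser('word', ':RegExp')
    n = Node(word, 'hello')
    assert n.tag_name == 'word' and n.content == 'hello'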
-
-class ZombieParser(MockParser):
-    """
-    Serves as a substitute for a Parser instance.
-
-    ``ZombieParser`` is the class of the singleton object
-    ``ZOMBIE_PARSER``. The ``ZOMBIE_PARSER`` has a name and can be
-    called, but it never matches. It serves as a substitute where only
-    these properties (or one of them) are needed, but no real Parser-
-    object is instantiated.
-    """
-    alive = False
-
-    def __init__(self):
-        super(ZombieParser, self).__init__("__ZOMBIE__")
-        assert not self.__class__.alive, "There can be only one!"
-        assert self.__class__ == ZombieParser, "No derivatives, please!"
-        self.__class__.alive = True
-
-    def __copy__(self):
-        return self
-
-    def __deepcopy__(self, memo):
-        return self
-
-    def __call__(self, text):
-        """Better call Saul ;-)"""
-        return None, text
-
-
-ZOMBIE_PARSER = ZombieParser()
-
-
-#######################################################################
-#
-# syntaxtree nodes
-#
-#######################################################################
-
-
-ChildrenType = Tuple['Node', ...]
-NoChildren = cast(ChildrenType, ())  # type: ChildrenType
-StrictResultType = Union[ChildrenType, StringView, str]
-ResultType = Union[ChildrenType, 'Node', StringView, str, None]
-
-
-def flatten_sxpr(sxpr: str) -> str:
-    """Returns S-expression ``sxpr`` as a one-liner without unnecessary
-    whitespace.
-
-    Example:
-    >>> flatten_sxpr('(a\\n    (b\\n        c\\n    )\\n)\\n')
-    '(a (b c))'
-    """
-    return re.sub(r'\s(?=\))', '', re.sub(r'\s+', ' ', sxpr)).strip()
-
-
-def flatten_xml(xml: str) -> str:
-    """Returns an XML-tree as a one linter without unnecessary whitespace,
-    i.e. only whitespace within leaf-nodes is preserved.
-    """
-    return re.sub(r'\s+(?=<\w)', '', re.sub(r'(?<=</\w+>)\s+', '', xml))
-
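Since `flatten_xml` carries no doctest of its own, a hedged example of its effect (note that the variable-length lookbehind in the regex requires the `regex` module, which DHParser's toolkit prefers, rather than the stdlib `re`):

    >>> flatten_xml('<a>\n  <b>X</b>\n</a>')
    '<a><b>X</b></a>'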
-
-class Node(collections.abc.Sized):
-    """
-    Represents a node in the concrete or abstract syntax tree.
-
-    Attributes:
-        tag_name (str):  The name of the node, which is either its
-            parser's name or, if that is empty, the parser's class name
-
-        result (str or tuple):  The result of the parser which
-            generated this node, which can be either a string or a
-            tuple of child nodes.
-
-        children (tuple):  The tuple of child nodes or an empty tuple
-            if there are no child nodes. READ ONLY!
-
-        content (str):  Yields the contents of the tree as string. The
-            difference to ``str(node)`` is that ``node.content`` does
-            not add the error messages to the returned string.
-
-        parser (Parser):  The parser which generated this node.
-            WARNING: In case you use mock syntax trees for testing or
-            parser replacement during the AST-transformation: DO NOT
-            rely on this being a real parser object in any phase after
-            parsing (i.e. AST-transformation and compiling), for
-            example by calling ``isinstance(node.parser, ...)``.
-
-        len (int):  The full length of the node's string result if the
-            node is a leaf node or, otherwise, the concatenated string
-            results of its descendants. The figure always represents
-            the length before AST-transformation and will never change
-            through AST-transformation. READ ONLY!
-
-        pos (int):  the position of the node within the parsed text.
-
-            By default, the value of ``pos`` is -1, which marks it as invalid.
-            Setting this value will set the positions of all child
-            nodes relative to this value.
-
-            To set the pos values of all nodes in a syntax tree, the
-            pos value of the root node should be set to 0 right
-            after parsing.
-
-            Other than that, this value should be considered READ ONLY.
-            At any rate, it should only be reassigned during the parsing
-            stage and never during or after the AST-transformation.
-
-        errors (list):  A list of all errors that occurred on this node.
-
-        attributes (dict): An optional dictionary of XML-attributes. This
-            dictionary is created lazily upon first usage. The attributes
-            will only be shown in the XML-Representation, not in the
-            S-Expression-output.
-    """
-
-    __slots__ = ['_result', 'children', '_len', '_pos', 'parser', 'errors', '_xml_attr', '_content']
-
-    def __init__(self, parser, result: ResultType, leafhint: bool = False) -> None:
-        """
-        Initializes the ``Node``-object with the ``Parser``-Instance
-        that generated the node and the parser's result.
-        """
-        self.errors = []               # type: List[Error]
-        self._pos = -1                  # type: int
-        # Assignment to self.result initializes the attributes _result, children and _len
-        # The following if-clause is merely an optimization, i.e. a fast-path for leaf-Nodes
-        if leafhint:
-            self._result = result       # type: StrictResultType
-            self._content = None        # type: Optional[str]
-            self.children = NoChildren  # type: ChildrenType
-            self._len = -1              # type: int  # lazy evaluation
-        else:
-            self.result = result
-        self.parser = parser or ZOMBIE_PARSER
-
-
-    def __str__(self):
-        s = "".join(str(child) for child in self.children) if self.children else self.content
-        if self.errors:
-            return ' <<< Error on "%s" | %s >>> ' % \
-                   (s, '; '.join(e.message for e in self.errors))
-        return s
-
-
-    def __repr__(self):
-        mpargs = {'name': self.parser.name, 'ptype': self.parser.ptype}
-        parg = "MockParser({name}, {ptype})".format(**mpargs)
-        rarg = str(self) if not self.children else \
-               "(" + ", ".join(repr(child) for child in self.children) + ")"
-        return "Node(%s, %s)" % (parg, rarg)
-
-
-    def __len__(self):
-        if self._len < 0:
-            self._len = sum(len(child) for child in self.children) \
-                if self.children else len(self._result)
-        return self._len
-
-
-    def __bool__(self):
-        # A node that is not None is always True, even if it's empty
-        return True
-
-
-    def __eq__(self, other):
-        """
-        Equality of nodes: Two nodes are considered as equal, if their tag
-        name is the same and if their results are equal.
-        """
-        return self.tag_name == other.tag_name and self.result == other.result
-
-
-    def __hash__(self):
-        return hash(self.tag_name)
-
-
-    def __deepcopy__(self, memodict={}):
-        result = copy.deepcopy(self.result)
-        other = Node(self.parser, result)
-        other._pos = self._pos
-        return other
-
-
-    def __getitem__(self, index_or_tagname: Union[int, str]) -> Union['Node', Iterator['Node']]:
-        """
-        Returns the child node with the given index if ``index_or_tagname`` is
-        an integer or the first child node with the given tag name. Examples::
-
-            >>> tree = parse_sxpr('(a (b "X") (X (c "d")) (e (X "F")))')
-            >>> flatten_sxpr(tree[0].as_sxpr())
-            '(b "X")'
-            >>> flatten_sxpr(tree["X"].as_sxpr())
-            '(X (c "d"))'
-
-        Args:
-            index_or_tagname(str): Either an index of a child node or a
-                tag name.
-        Returns:
-            Node: All nodes which have a given tag name.
-        """
-        if self.children:
-            if isinstance(index_or_tagname, int):
-                return self.children[index_or_tagname]
-            else:
-                for child in self.children:
-                    if child.tag_name == index_or_tagname:
-                        return child
-                raise KeyError(index_or_tagname)
-        raise ValueError('Leaf nodes have no children that can be indexed!')
-
-
-    def __contains__(self, tag_name: str) -> bool:
-        """
-        Returns true if a child with the given tag name exists.
-        Args:
-            tag_name (str): tag_name which will be searched for among the immediate
-                descendants of this node.
-        Returns:
-            bool:  True, if at least one descendant node with the given tag
-                name exists, False otherwise
-        """
-        # assert isinstance(tag_name, str)
-        if self.children:
-            for child in self.children:
-                if child.tag_name == tag_name:
-                    return True
-            return False
-        raise ValueError('Leaf nodes cannot contain other nodes')
-        # generator = self.select_by_tag(tag_name, False)
-        # try:
-        #     generator.__next__()
-        #     return True
-        # except StopIteration:
-        #     return False
-
-
-    @property   # this needs to be a (dynamic) property, in case self.parser gets updated
-    def tag_name(self) -> str:
-        """
-        Returns the tag name of the Node, i.e. the name used for XML or
-        S-expression representation. By default the tag name is the
-        name of the node's parser or, if the node's parser is unnamed, the
-        node's parser's `ptype`.
-        """
-        return self.parser.name or self.parser.ptype
-
-
-    @property
-    def result(self) -> StrictResultType:
-        """
-        Returns the result from the parser that created the node.
-        Error messages are not included in the result. Use ``str(node)``
-        if the content including any error messages is needed.
-        """
-        return self._result
-
-
-    @result.setter
-    def result(self, result: ResultType):
-        # # made obsolete by static type checking with mypy
-        # assert ((isinstance(result, tuple) and all(isinstance(child, Node) for child in result))
-        #         or isinstance(result, Node)
-        #         or isinstance(result, str)), str(result)
-        # Possible optimization: Do not allow single nodes as argument:
-        # assert not isinstance(result, Node)
-        self._len = -1        # lazy evaluation
-        self._content = None
-        if isinstance(result, Node):
-            self.children = (result,)
-            self._result = self.children
-        else:
-            if isinstance(result, tuple):
-                self.children = result
-                self._result = result or ''
-            else:
-                self.children = NoChildren
-                self._result = result
-
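The setter normalizes its argument: a single Node becomes a one-element children tuple, a tuple becomes the children, and anything else is treated as a leaf value. A small illustrative sketch (names are hypothetical):

    n = Node(MockParser('n'), '')
    n.result = Node(MockParser('child'), 'x')   # single Node -> one child
    assert n.children[0].tag_name == 'child'
    n.result = 'leaf text'                      # string -> leaf, no children
    assert n.children == () and n.result == 'leaf text'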
-
-    @property
-    def content(self) -> str:
-        """
-        Returns content as string, omitting error messages.
-        """
-        if self._content is None:
-            if self.children:
-                self._content = "".join(child.content for child in self.children)
-            else:
-                # self._content = self._result
-                self._content = str(self._result)
-                self._result = self._content  # self._result might be more efficient as a string!?
-        return self._content
-
-
-    @property
-    def structure(self) -> str:
-        """
-        Return structure (and content) as S-expression on a single line
-        without any line breaks.
-        """
-        return flatten_sxpr(self.as_sxpr(showerrors=False))
-
-
-    @property
-    def pos(self) -> int:
-        """Returns the position of the Node's content in the source text."""
-        if self._pos < 0:
-            raise AssertionError("Position value not initialized!")
-        return self._pos
-
-
-    def init_pos(self, pos: int) -> 'Node':
-        """
-        (Re-)initialize position value. Usually, the parser guard
-        (`parsers.add_parser_guard()`) takes care of assigning the
-        position in the document to newly created nodes. However,
-        where Nodes are created outside the reach of the parser
-        guard, their document-position must be assigned manually.
-        This function recursively reassigns the position values
-        of the child nodes, too.
-        """
-        assert self._pos < 0 or self.pos == pos, str("pos mismatch %i != %i" % (self._pos, pos))
-        self._pos = pos
-        # recursively adjust pos-values of all children
-        offset = self.pos
-        for child in self.children:
-            child.init_pos(offset)
-            offset = child.pos + len(child)
-        return self
-
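A minimal sketch: after deserializing a tree, positions can be assigned with `init_pos`; each child's position follows from the lengths of its preceding siblings:

    tree = parse_sxpr('(a (b "xx") (c "yyy"))')
    tree.init_pos(0)
    assert tree['b'].pos == 0 and tree['c'].pos == 2   # len('xx') == 2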
-
-    @property
-    def attributes(self):
-        """
-        Returns a dictionary of XML-Attributes attached to the Node.
-        """
-        if not hasattr(self, '_xml_attr'):
-            self._xml_attr = OrderedDict()
-        return self._xml_attr
-
-
-    def _tree_repr(self, tab, open_fn, close_fn, data_fn=lambda i: i, density=0) -> str:
-        """
-        Generates a tree representation of this node and its children
-        in string form.
-
-        The kind of tree-representation is determined by several
-        function parameters. This could be an XML-representation or a
-        lisp-like S-expression.
-
-        Args:
-            tab (str):  The indentation string, e.g. '\t' or '    '
-            open_fn:   (Node->str) A function that returns an opening
-                string (e.g. an XML-tag_name) for a given node
-            close_fn:  (Node->str) A function that returns a closing
-                string (e.g. an XML-tag_name) for a given node.
-            data_fn:   (str->str) A function that filters the data string
-                before printing, e.g. to add quotation marks
-
-        Returns (str):
-            A string that contains a (serialized) tree representation
-            of the node and its children.
-        """
-        head = open_fn(self)
-        tail = close_fn(self)
-
-        if not self.result:
-            return head.rstrip() + tail.lstrip()
-
-        tail = tail.lstrip(None if density & 2 else '')
-
-        if self.children:
-            content = []
-            for child in self.children:
-                subtree = child._tree_repr(tab, open_fn, close_fn, data_fn, density).split('\n')
-                content.append('\n'.join((tab + s) for s in subtree))
-            return head + '\n'.join(content) + tail
-
-        res = cast(str, self.result)  # safe, because if there are no children, result is a string
-        if density & 1 and res.find('\n') < 0:  # and head[0] == "<":
-            # except for XML, add a gap between opening statement and content
-            gap = ' ' if head.rstrip()[-1] != '>' else ''
-            return head.rstrip() + gap + data_fn(self.result) + tail.lstrip()
-        else:
-            return head + '\n'.join([tab + data_fn(s) for s in res.split('\n')]) + tail
-
-
-    def as_sxpr(self, src: str = None, compact: bool = False, showerrors: bool = True,
-                indentation: int = 2) -> str:
-        """
-        Returns content as S-expression, i.e. in lisp-like form.
-
-        Args:
-            src:  The source text or `None`. In case the source text is
-                given the position of the element in the text will be
-                reported as line and column.
-            compact:  If True a compact representation is returned where
-                brackets are omitted and only the indentation indicates the
-                tree structure.
-        """
-
-        left_bracket, right_bracket, density = ('', '', 1) if compact else ('(', '\n)', 0)
-        lbreaks = linebreaks(src) if src else []  # type: List[int]
-
-        def opening(node) -> str:
-            """Returns the opening string for the representation of `node`."""
-            txt = [left_bracket,  node.tag_name]
-            # s += " '(pos %i)" % node.add_pos
-            if hasattr(node, '_xml_attr'):
-                txt.extend(' `(%s "%s")' % (k, v) for k, v in node.attributes.items())
-            if src:
-                txt.append(" `(pos %i %i %i)" % (node.pos, *line_col(lbreaks, node.pos)))
-            # if node.error_flag:   # just for debugging error collecting
-            #     txt += " HAS ERRORS"
-            if showerrors and node.errors:
-                txt.append(" `(err `%s)" % ' '.join(str(err) for err in node.errors))
-            return "".join(txt) + '\n'
-
-        def closing(node) -> str:
-            """Returns the closing string for the representation of `node`."""
-            return right_bracket
-
-        def pretty(strg):
-            """Encloses `strg` with the right kind of quotation marks."""
-            return '"%s"' % strg if strg.find('"') < 0 \
-                else "'%s'" % strg if strg.find("'") < 0 \
-                else '"%s"' % strg.replace('"', r'\"')
-
-        return self._tree_repr(' ' * indentation, opening, closing, pretty, density=density)
-
-
-    def as_xml(self, src: str = None, showerrors: bool = True, indentation: int = 2) -> str:
-        """
-        Returns content as XML-tree.
-
-        Args:
-            src:  The source text or `None`. In case the source text is
-                given the position will also be reported as line and
-                column.
-        """
-
-        def opening(node) -> str:
-            """Returns the opening string for the representation of `node`."""            
-            txt = ['<', node.tag_name]
-            has_reserved_attrs = hasattr(node, '_xml_attr') \
-                and any (r in node.attributes for r in {'err', 'line', 'col'})
-            if hasattr(node, '_xml_attr'):
-                txt.extend(' %s="%s"' % (k, v) for k, v in node.attributes.items())
-            if src and not has_reserved_attrs:
-                txt.append(' line="%i" col="%i"' % line_col(line_breaks, node.pos))
-            if showerrors and node.errors and not has_reserved_attrs:
-                txt.append(' err="%s"' % ''.join(str(err).replace('"', r'\"')
-                                                 for err in node.errors))
-            return "".join(txt + [">\n"])
-
-        def closing(node):
-            """Returns the closing string for the representation of `node`."""            
-            return '\n</' + node.tag_name + '>'
-
-        line_breaks = linebreaks(src) if src else []
-        return self._tree_repr(' ' * indentation, opening, closing, density=1)
-
-
-    def select(self, match_function: Callable, include_root: bool=True) -> Iterator['Node']:
-        """
-        Finds nodes in the tree that fulfill a given criterion.
-
-        `select` is a generator that yields all nodes for which the
-        given `match_function` evaluates to True. The tree is
-        traversed pre-order.
-
-        See function `Node.select_by_tag` for some examples.
-
-        Args:
-            match_function (function): A function  that takes as Node
-                object as argument and returns True or False
-            include_root (bool): If False, only descendant nodes will be
-                checked for a match.
-        Yields:
-            Node: All nodes of the tree for which
-            ``match_function(node)`` returns True
-        """
-        if include_root and match_function(self):
-            yield self
-        else:
-            for child in self.children:
-                for node in child.select(match_function, True):
-                    yield node
-
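Note that, because of the `else`-clause, the descendants of a matching node are not traversed further. An illustrative sketch:

    tree = parse_sxpr('(a (b "X") (X (c "d")) (e (X "F")))')
    found = [nd.tag_name for nd in tree.select(lambda nd: nd.content == 'd', True)]
    assert found == ['X']   # the inner (c "d") is skipped, because its parent matched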
-
-    def select_by_tag(self, tag_names: Union[str, AbstractSet[str]],
-                      include_root: bool=True) -> Iterator['Node']:
-        """
-        Returns an iterator that runs through all descendants that have one
-        of the given tag names.
-
-        Examples::
-
-            >>> tree = parse_sxpr('(a (b "X") (X (c "d")) (e (X "F")))')
-            >>> list(flatten_sxpr(item.as_sxpr()) for item in tree.select_by_tag("X", False))
-            ['(X (c "d"))', '(X "F")']
-            >>> list(flatten_sxpr(item.as_sxpr()) for item in tree.select_by_tag({"X", "b"}, False))
-            ['(b "X")', '(X (c "d"))', '(X "F")']
-            >>> any(tree.select_by_tag('a', False))
-            False
-            >>> list(flatten_sxpr(item.as_sxpr()) for item in tree.select_by_tag('a', True))
-            ['(a (b "X") (X (c "d")) (e (X "F")))']
-            >>> flatten_sxpr(next(tree.select_by_tag("X", False)).as_sxpr())
-            '(X (c "d"))'
-
-        Args:
-            tag_name(set): A tag name or set of tag names that is being
-                searched for
-            include_root (bool): If False, only descendant nodes will be
-                checked for a match.
-        Yields:
-            Node: All nodes which have a given tag name.
-        """
-        if isinstance(tag_names, str):
-            tag_names = frozenset({tag_names})
-        return self.select(lambda node: node.tag_name in tag_names, include_root)
-
-
-    def pick(self, tag_names: Union[str, Set[str]]) -> Optional['Node']:
-        """
-        Picks the first descendant with one of the given tag_names.
-
-        This function is just syntactic sugar for
-        ``next(node.select_by_tag(tag_names, False))``. However, rather than
-        raising a StopIterationError if no descendant with the given tag-name
-        exists, it returns None.
-        """
-        try:
-            return next(self.select_by_tag(tag_names, False))
-        except StopIteration:
-            return None
-
-
-    def tree_size(self) -> int:
-        """
-        Recursively counts the number of nodes in the tree including the root node.
-        """
-        return sum(child.tree_size() for child in self.children) + 1
-
-
-class RootNode(Node):
-    """TODO: Add Documentation!!!
-
-        errors (list):  A list of all errors that have occured so far during
-                processing (i.e. parsing, AST-transformation, compiling)
-                of this tree.
-
-        error_flag (int):  the highest warning or error level of all errors
-                that occurred.
-    """
-    def __init__(self, node: Optional[Node] = None) -> None:
-        super().__init__(ZOMBIE_PARSER, '')
-        self.all_errors = []
-        self.err_nodes_keep = []
-        self.error_flag = 0
-        if node is not None:
-            self.swallow(node)
-
-    # def _propagate_errors(self):
-    #     if not self.all_errors or not self.error_propagation:
-    #         return
-    #     self.all_errors.sort(key=lambda e: e.pos)
-    #     i = 0
-    #     for leaf in self.select(lambda nd: not nd.children, False):
-    #         leaf.errors = []
-    #         while i < len(self.all_errors) \
-    #                 and leaf.pos <= self.all_errors[i].add_pos < leaf.add_pos + leaf.len:
-    #             leaf._errors.append(self.all_errors[i])
-    #             i += 1
-    #         if i >= len(self.all_errors):
-    #             break
-    #
-    # def _propagate_new_error(self, error):
-    #     if self.error_propagation:
-    #         for leaf in self.select(lambda nd: not nd.children, True):
-    #             if leaf.pos <= error.add_pos < leaf.add_pos + leaf.len:
-    #                 leaf._errors.append(error)
-    #                 break
-    #         else:
-    #             assert False, "Error %s at pos %i out of bounds" % (str(error), error.add_pos)
-
-    def swallow(self, node: Node) -> 'RootNode':
-        self._result = node._result
-        self.children = node.children
-        self._len = node._len
-        self._pos = node._pos
-        self.parser = node.parser
-        if hasattr(node, '_xml_attr'):
-            self._xml_attr = node._xml_attr
-        self._content = node._content
-        return self
-
-    def add_error(self, node: Node, error: Error) -> 'RootNode':
-        """Adds an Error object to the tree, locating it at a specific node."""
-        self.all_errors.append(error)
-        self.error_flag = max(self.error_flag, error.code)
-        node.errors.append(error)
-        self.err_nodes_keep.append(node)
-        return self
-
-    def new_error(self,
-                  node: Node,
-                  message: str,
-                  code: int = Error.ERROR) -> 'RootNode':
-        """
-        Adds an error to this tree, locating it at a specific node.
-        Parameters:
-            node(Node):   The node where the error occurred
-            message(str): A string with the error message
-            code(int):    An error code to identify the kind of error
-        """
-        error = Error(message, code, node=node)
-        self.add_error(node, error)
-        return self
-
-    def collect_errors(self) -> List[Error]:
-        """Returns the list of errors, ordered bv their position.
-        """
-        # for node in self.err_nodes:  # lazy evaluation of positions
-        #     for err in node.errors:  # moved to error.Error.pos
-        #         err.pos = node.pos
-        self.all_errors.sort(key=lambda e: e.pos)
-        for node in self.err_nodes_keep:  # redundant: consider removing Error.Error._node_keep
-            for error in node.errors:
-                assert error._pos < 0 or node.pos <= error._pos <= node.pos + len(node)
-                if error._pos < 0:
-                    error._pos = node.pos
-        self.err_nodes_keep = []
-        errors = self.all_errors
-        # for error in self.all_errors:
-        #     _ = error.pos
-        return errors
-
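A hedged sketch of the error workflow (the `Error` API is assumed only from the calls above; positions must be initialized before errors are collected):

    tree = parse_sxpr('(doc (word "example"))').init_pos(0)
    root = RootNode(tree)
    root.new_error(root.pick('word'), 'an illustrative error')
    assert root.error_flag == Error.ERROR
    print(root.collect_errors())   # errors sorted by position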
-
-ZOMBIE_NODE = Node(ZOMBIE_PARSER, '')
-
-
-def parse_sxpr(sxpr: str) -> Node:
-    """
-    Generates a tree of nodes from an S-expression.
-
-    This can - among other things - be used for deserialization of trees that
-    have been serialized with `Node.as_sxpr()` or as a convenient way to
-    generate test data.
-
-    Example:
-    >>> parse_sxpr("(a (b c))").as_sxpr()
-    '(a\\n    (b\\n        "c"\\n    )\\n)'
-    """
-    sxpr = StringView(sxpr).strip()
-    mock_parsers = dict()
-
-    def next_block(s: StringView):
-        """Generator that yields all characters until the next closing bracket
-        that does not match an opening bracket matched earlier within the same
-        package."""
-        s = s.strip()
-        try:
-            while s[0] != ')':
-                if s[0] != '(':
-                    raise ValueError('"(" expected, not ' + s[:10])
-                # assert s[0] == '(', s
-                level = 1
-                k = 1
-                while level > 0:
-                    if s[k] == '(':
-                        level += 1
-                    elif s[k] == ')':
-                        level -= 1
-                    k += 1
-                yield s[:k]
-                s = s[k:].strip()
-        except IndexError:
-            errmsg = ('Malformed S-expression. Unprocessed part: "%s"' % s) if s \
-                else 'Malformed S-expression. Closing bracket(s) ")" missing.'
-            raise AssertionError(errmsg)
-
-    def inner_parser(sxpr: StringView) -> Node:
-        if sxpr[0] != '(':
-            raise ValueError('"(" expected, not ' + sxpr[:10])
-        # assert sxpr[0] == '(', sxpr
-        sxpr = sxpr[1:].strip()
-        match = sxpr.match(re.compile(r'[\w:]+'))
-        if match is None:
-            raise AssertionError('Malformed S-expression. Node-tagname or identifier expected, '
-                                 'not "%s"' % sxpr[:40].replace('\n', ''))
-        end = match.end() - sxpr.begin
-        tagname = sxpr[:end]
-        name, class_name = (tagname.split(':') + [''])[:2]
-        sxpr = sxpr[end:].strip()
-        attributes = OrderedDict()
-        if sxpr[0] == '(':
-            result = tuple(inner_parser(block) for block in next_block(sxpr))
-        else:
-            lines = []
-            while sxpr and sxpr[0:1] != ')':
-                # parse attributes
-                while sxpr[:2] == "`(":
-                    i = sxpr.find('"')
-                    k = sxpr.find(')')
-                    # read very special attribute pos
-                    if sxpr[2:5] == "pos" and 0 < i < k:
-                        pos = int(sxpr[5:k].strip().split(' ')[0])
-                    # ignore very special attribute err
-                    elif sxpr[2:5] == "err" and 0 <= sxpr.find('`', 5) < k:
-                        m = sxpr.find('(', 5)
-                        while m >= 0 and m < k:
-                            m = sxpr.find('(', k)
-                            k = max(k, sxpr.find(')', max(m, 0)))
-                    # read attributes
-                    else:
-                        attr = sxpr[2:i].strip()
-                        value = sxpr[i:k].strip()[1:-1]
-                        attributes[attr] = value
-                    sxpr = sxpr[k+1:].strip()
-                # parse content
-                for qtmark in ['"""', "'''", '"', "'"]:
-                    match = sxpr.match(re.compile(qtmark + r'.*?' + qtmark, re.DOTALL))
-                    if match:
-                        end = match.end() - sxpr.begin
-                        i = len(qtmark)
-                        lines.append(str(sxpr[i:end - i]))
-                        sxpr = sxpr[end:].strip()
-                        break
-                else:
-                    match = sxpr.match(re.compile(r'(?:(?!\)).)*', re.DOTALL))
-                    end = match.end() - sxpr.begin
-                    lines.append(str(sxpr[:end]))
-                    sxpr = sxpr[end:]
-            result = "\n".join(lines)
-        node = Node(mock_parsers.setdefault(tagname, MockParser(name, ':' + class_name)), result)
-        if attributes:
-            node.attributes.update(attributes)
-        return node
-
-    return inner_parser(sxpr)
-
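Attributes written in the backtick notation that `as_sxpr` emits are restored as well; a short sketch:

    tree = parse_sxpr('(a `(class "heading") "text")')
    assert tree.attributes['class'] == 'heading' and tree.content == 'text'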
-
-RX_WHITESPACE_TAIL = re.compile(r'\s*$')
-
-
-def parse_xml(xml: str) -> Node:
-    """
-    Generates a tree of nodes from a (Pseudo-)XML-source.
-    """
-    xml = StringView(xml)
-    PlainText = MockParser('', PLAINTEXT_PTYPE)
-    mock_parsers = {PLAINTEXT_PTYPE: PlainText}
-
-    def parse_attributes(s: StringView) -> Tuple[StringView, OrderedDict]:
-        """Parses a sqeuence of XML-Attributes. Returns the string-slice
-        beginning after the end of the attributes."""
-        attributes = OrderedDict()
-        restart = 0
-        for match in s.finditer(re.compile(r'\s*(?P<attr>\w+)\s*=\s*"(?P<value>.*)"\s*')):
-            d = match.groupdict()
-            attributes[d['attr']] = d['value']
-            restart = match.end() - s.begin
-        return (s[restart:], attributes)
-
-    def parse_opening_tag(s: StringView) -> Tuple[StringView, str, OrderedDict, bool]:
-        """Parses an opening tag. Returns the string segment following the
-        the opening tag, the tag name, a dictionary of attributes and
-        a flag indicating whether the tag is actually a solitary tag as
-        indicated by a slash at the end, i.e. <br/>."""
-        match = s.match(re.compile(r'<\s*(?P<tagname>[\w:]+)\s*'))
-        assert match
-        tagname = match.groupdict()['tagname']
-        section = s[match.end() - s.begin:]
-        s, attributes = parse_attributes(section)
-        i = s.find('>')
-        assert i >= 0
-        return s[i+1:], tagname, attributes, s[i-1] == "/"
-
-    def parse_closing_tag(s: StringView) -> Tuple[StringView, str]:
-        """Parses a closing tag and returns the string segment, just after
-        the closing tag."""
-        match = s.match(re.compile(r'</\s*(?P<tagname>[\w:]+)>'))
-        assert match
-        tagname = match.groupdict()['tagname']
-        return s[match.end() - s.begin:], tagname
-
-    def parse_leaf_content(s: StringView) -> Tuple[StringView, str]:
-        """Parses a piece of the content of a tag, just until the next opening,
-        closing or solitary tag is reached."""
-        i = 0
-        while s[i] != "<" or s[max(0, i-1)] == "\\":
-            i = s.find("<", i)
-        return s[i:], s[:i]
-
-    def parse_full_content(s: StringView) -> Tuple[StringView, Node]:
-        """Parses the full content of a tag, starting right at the beginning
-        of the opening tag and ending right after the closing tag.
-        """
-        result = []
-        s, tagname, attributes, solitary = parse_opening_tag(s)
-        name, class_name = (tagname.split(":") + [''])[:2]
-        if not solitary:
-            while s and not s[:2] == "</":
-                s, leaf = parse_leaf_content(s)
-                if not leaf.match(RX_WHITESPACE_TAIL):
-                    result.append(Node(PlainText, leaf))
-                if s[:1] == "<" and s[:2] != "</":
-                    s, child = parse_full_content(s)
-                    result.append(child)
-            s, closing_tagname = parse_closing_tag(s)
-            assert tagname == closing_tagname
-        if len(result) == 1 and result[0].parser.ptype == PLAINTEXT_PTYPE:
-            result = result[0].result
-        else:
-            result = tuple(result)
-        return s, Node(mock_parsers.setdefault(tagname, MockParser(name, ":" + class_name)), result)
-
-    match_header = xml.search(re.compile(r'<(?!\?)'))
-    start = match_header.start() if match_header else 0
-    _, tree = parse_full_content(xml[start:])
-    assert _.match(RX_WHITESPACE_TAIL)
-    return tree
-
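A short round-trip sketch:

    tree = parse_xml('<a><b>X</b><c>Y</c></a>')
    assert flatten_sxpr(tree.as_sxpr()) == '(a (b "X") (c "Y"))'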
-# if __name__ == "__main__":
-#     st = parse_sxpr("(alpha (beta (gamma i\nj\nk) (delta y)) (epsilon z))")
-#     print(st.as_sxpr())
-#     print(st.as_xml())
-
\ No newline at end of file
diff --git a/documentation/_modules/compile.html b/documentation/_modules/compile.html
deleted file mode 100644
index 5a75595350dda809cce5989e11c0c082f8994b6e..0000000000000000000000000000000000000000
--- a/documentation/_modules/compile.html
+++ /dev/null
@@ -1,481 +0,0 @@
-compile — DHParser 0.8 documentation

Source code for compile

-# compile.py - Syntax driven compilation support for DHParser
-#
-# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
-#                 Bavarian Academy of Sciences and Humanities (badw.de)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-# implied.  See the License for the specific language governing
-# permissions and limitations under the License.
-
-"""
-Module ``compile`` contains a skeleton class for syntax
-driven compilation support. Class ``Compiler`` can serve as base
-class for a compiler. Compiler objects
-are callable and receive the abstract syntax tree (AST)
-as argument and yield whatever output the compiler produces. In
-most Digital Humanities applications this will be
-XML-code. However, it can also be anything else, like binary
-code or, as in the case of DHParser's EBNF-compiler, Python
-source code.
-
-Function ``compile_source`` invokes all stages of the compilation
-process, i.e. pre-processing, parsing, CST to AST-transformation
-and compilation.
-
-See module ``ebnf`` for a sample of the implementation of a
-compiler object.
-"""
-
-import os
-import re
-
-from DHParser.preprocess import strip_tokens, with_source_mapping, PreprocessorFunc
-from DHParser.syntaxtree import Node, RootNode
-from DHParser.transform import TransformationFunc
-from DHParser.parse import Grammar
-from DHParser.error import adjust_error_locations, is_error, Error
-from DHParser.log import log_parsing_history, log_ST, is_logging, logfile_basename
-from DHParser.toolkit import typing, sane_parser_name, load_if_file
-from typing import Any, Optional, Tuple, List, Callable
-
-
-__all__ = ('CompilerError', 'Compiler', 'compile_source')
-
-
-
-class CompilerError(Exception):
-    """Exception raised when an error of the compiler itself is detected.
-    Compiler errors are not to be confused with errors in the source
-    code to be compiled, which do not raise Exceptions but are merely
-    reported as an error."""
-    pass
-
-
-class Compiler:
-    """
-    Class Compiler is the abstract base class for compilers. Compiler
-    objects are callable and take the root node of the abstract
-    syntax tree (AST) as argument and return the compiled code in a
-    format chosen by the compiler itself.
-
-    Subclasses implementing a compiler must define `on_XXX()`-methods
-    for each node name that can occur in the AST where 'XXX' is the
-    node's name (for unnamed nodes it is the node's ptype without the
-    leading colon ':').
-
-    These compiler methods take the node on which they are run as
-    argument. Other than in the AST transformation, which runs depth-first,
-    compiler methods are called forward moving starting with the root
-    node, and they are responsible for compiling the child nodes
-    themselves. This should be done by invoking the `compile(node)`-
-    method which will pick the right `on_XXX`-method. It is not
-    recommended to call the `on_XXX`-methods directly.
-
-    Attributes:
-        context:  A list of parent nodes that ends with the currently
-            compiled node.
-        grammar_name:  The name of the grammar this compiler is related to
-        grammar_source:  The source code of the grammar this compiler is
-            related to.
-        _dirty_flag:  A flag indicating that the compiler has already been
-            called at least once and that therefore all compilation
-            variables must be reset when it is called again.
-    """
-
-    def __init__(self, grammar_name="", grammar_source=""):
-        self._reset()
-        self.set_grammar_name(grammar_name, grammar_source)
-
-    def _reset(self):
-        self.tree = None   # type: Optional[RootNode]
-        self.context = []  # type: List[Node]
-        self._dirty_flag = False
-
-    def __call__(self, root: RootNode) -> Any:
-        """
-        Compiles the abstract syntax tree with the root node `node` and
-        returns the compiled code. It is up to subclasses implementing
-        the compiler to determine the format of the returned data.
-        (This very much depends on the kind and purpose of the
-        implemented compiler.)
-        """
-        if self._dirty_flag:
-            self._reset()
-        self._dirty_flag = True
-        self.tree = root
-        result = self.compile(root)
-        return result
-
-    def set_grammar_name(self, grammar_name: str="", grammar_source: str=""):
-        """
-        Changes the grammar's name and the grammar's source.
-
-        The grammar name and the source text of the grammar are
-        metadata about the grammar that do not affect the compilation
-        process. Classes inheriting from `Compiler` can use this
-        information to name and annotate its output. Returns `self`.
-        """
-        assert grammar_name == "" or re.match(r'\w+\Z', grammar_name)
-        if not grammar_name and re.fullmatch(r'[\w/:\\]+', grammar_source):
-            grammar_name = os.path.splitext(os.path.basename(grammar_source))[0]
-        self.grammar_name = grammar_name
-        self.grammar_source = load_if_file(grammar_source)
-        return self
-
-    # @staticmethod
-    # def propagate_error_flags(node: Node, lazy: bool = True) -> None:
-    #     # See test_parser.TestCompilerClass.test_propagate_error()..
-    #     """Propagates error flags from children to parent nodes to make sure
-    #     that the parent's error flag is always greater or equal the maximum
-    #     of the children's error flags."""
-    #     if not lazy or node.error_flag < Error.HIGHEST:
-    #         for child in node.children:
-    #             Compiler.propagate_error_flags(child)
-    #             node.error_flag = max(node.error_flag, child.error_flag)
-    #             if lazy and node.error_flag >= Error.HIGHEST:
-    #                 return
-
-    @staticmethod
-    def method_name(node_name: str) -> str:
-        """
-        Returns the method name for `node_name`, e.g.::
-
-            >>> Compiler.method_name('expression')
-            'on_expression'
-        """
-        return 'on_' + node_name
-
-    def fallback_compiler(self, node: Node) -> Any:
-        """This is a generic compiler function which will be called on
-        all those node types for which no compiler method `on_XXX` has
-        been defined."""
-        if node.children:
-            result = tuple(self.compile(nd) for nd in node.children)
-            node.result = result
-        return node
-
-    def compile(self, node: Node) -> Any:
-        """
-        Calls the compilation method for the given node and returns the
-        result of the compilation.
-
-        The method's name is derived from either the node's parser
-        name or, if the parser is anonymous, the node's parser's class
-        name by adding the prefix ``on_``.
-
-        Note that ``compile`` does not call any compilation functions
-        for the parsers of the sub nodes by itself. Rather, this should
-        be done within the compilation methods.
-        """
-        elem = node.parser.name or node.parser.ptype[1:]
-        if not sane_parser_name(elem):
-            node.add_error("Reserved name '%s' not allowed as parser "
-                           "name! " % elem + "(Any name starting with "
-                           "'_' or '__' or ending with '__' is reserved.)")
-            return None
-        else:
-            try:
-                compiler = self.__getattribute__(self.method_name(elem))
-            except AttributeError:
-                compiler = self.fallback_compiler
-            self.context.append(node)
-            result = compiler(node)
-            self.context.pop()
-            if result is None:
-                raise CompilerError('Method on_%s returned `None` instead of a '
-                                    'valid compilation result!' % elem)
-            # # the following statement makes sure that the error_flag
-            # # is propagated early on. Otherwise it is redundant, because
-            # # the __call__ method globally propagates the node's error_flag
-            # # later anyway. So, maybe it could be removed here.
-            # for child in node.children:
-            #     node.error_flag = node.error_flag or child.error_flag
-            return result
-
-
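To illustrate the `on_XXX`-convention, a minimal hypothetical subclass (not part of DHParser) that renders a two-level AST as XML-like text:

    class ToyCompiler(Compiler):
        """Illustrative only: assumes ASTs with 'document' and 'word' nodes."""
        def on_document(self, node: Node) -> str:
            # compile the children explicitly, as the docstring of compile() demands
            inner = ''.join(self.compile(child) for child in node.children)
            return '<document>' + inner + '</document>'
        def on_word(self, node: Node) -> str:
            return '<w>' + node.content + '</w>'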
-def compile_source(source: str,
-                   preprocessor: Optional[PreprocessorFunc],  # str -> str
-                   parser: Grammar,  # str -> Node (concrete syntax tree (CST))
-                   transformer: TransformationFunc,  # Node -> Node (abstract syntax tree (AST))
-                   compiler: Compiler) -> Tuple[Any, List[Error], Node]:  # Node (AST) -> Any
-    """
-    Compiles a source in four stages:
-    1. Pre-Processing (if needed)
-    2. Parsing
-    3. AST-transformation
-    4. Compiling.
-
-    The compilation stage is only invoked if no errors occurred in
-    either of the two previous stages.
-
-    Args:
-        source (str): The input text for compilation or the name of a
-            file containing the input text.
-        preprocessor (function): text -> text. A preprocessor function
-            or None, if no preprocessor is needed.
-        parser (function): A parsing function or grammar class
-        transformer (function): A transformation function that takes
-            the root-node of the concrete syntax tree as an argument and
-            transforms it (in place) into an abstract syntax tree.
-        compiler (function): A compiler function or compiler class
-            instance
-
-    Returns (tuple):
-        The result of the compilation as a 3-tuple
-        (result, errors, abstract syntax tree). In detail:
-        1. The result as returned by the compiler or ``None`` in case of failure
-        2. A list of error or warning messages
-        3. The root-node of the abstract syntax tree
-    """
-    original_text = load_if_file(source)
-    log_file_name = logfile_basename(source, compiler)
-    if preprocessor is None:
-        source_text = original_text
-        source_mapping = lambda i: i
-    else:
-        source_text, source_mapping = with_source_mapping(preprocessor(original_text))
-    syntax_tree = parser(source_text)
-    if is_logging():
-        log_ST(syntax_tree, log_file_name + '.cst')
-        log_parsing_history(parser, log_file_name)
-
-    assert is_error(syntax_tree.error_flag) or str(syntax_tree) == strip_tokens(source_text)
-    # only compile if there were no syntax errors, for otherwise it is
-    # likely that the error list gets littered with compile error messages
-    result = None
-    # efl = syntax_tree.error_flag
-    # messages = syntax_tree.collect_errors(clear_errors=True)
-    if not is_error(syntax_tree.error_flag):
-        transformer(syntax_tree)
-        # efl = max(efl, syntax_tree.error_flag)
-        # messages.extend(syntax_tree.collect_errors(clear_errors=True))
-        if is_logging():
-            log_ST(syntax_tree, log_file_name + '.ast')
-        if not is_error(syntax_tree.error_flag):
-            result = compiler(syntax_tree)
-            # print(syntax_tree.as_sxpr())
-        # messages.extend(syntax_tree.collect_errors())
-    # syntax_tree.error_flag = max(syntax_tree.error_flag, efl)
-
-    messages = syntax_tree.collect_errors()
-    adjust_error_locations(messages, original_text, source_mapping)
-    return result, messages, syntax_tree
-
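For orientation, a hedged usage sketch that mirrors how the function is invoked elsewhere in DHParser, here with the EBNF factories as a concrete example (the file path is hypothetical):

    # assuming: from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler
    result, messages, ast = compile_source(
        'path/to/grammar.ebnf',     # or the source text itself
        None,                       # no preprocessor
        get_ebnf_grammar(),         # Grammar instance
        get_ebnf_transformer(),     # AST transformation
        get_ebnf_compiler())        # Compiler instance
    if not messages:
        print(result)               # here: generated Python parser code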
\ No newline at end of file
diff --git a/documentation/_modules/dsl.html b/documentation/_modules/dsl.html
deleted file mode 100644
index b8339e62743bc1518a9c32b85badb830eb923dab..0000000000000000000000000000000000000000
--- a/documentation/_modules/dsl.html
+++ /dev/null
@@ -1,806 +0,0 @@
-dsl — DHParser 0.8 documentation

Source code for dsl

-# dsl.py - Support for domain specific notations for DHParser
-#
-# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
-#                 Bavarian Academy of Sciences and Humanities (badw.de)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-# implied.  See the License for the specific language governing
-# permissions and limitations under the License.
-
-
-"""
-Module ``dsl`` contains various functions to support the
-compilation of domain specific languages based on an EBNF-grammar.
-"""
-
-
-import os
-import platform
-import stat
-
-from DHParser.compile import Compiler, compile_source
-from DHParser.ebnf import EBNFCompiler, grammar_changed, \
-    get_ebnf_preprocessor, get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler, \
-    PreprocessorFactoryFunc, ParserFactoryFunc, TransformerFactoryFunc, CompilerFactoryFunc
-from DHParser.error import Error, is_error, has_errors, only_errors
-from DHParser.log import logging
-from DHParser.parse import Grammar
-from DHParser.preprocess import nil_preprocessor, PreprocessorFunc
-from DHParser.syntaxtree import Node
-from DHParser.transform import TransformationFunc
-from DHParser.toolkit import load_if_file, is_python_code, compile_python_object, \
-    re, typing
-from typing import Any, cast, List, Tuple, Union, Iterator, Iterable
-
-
-__all__ = ('DHPARSER_IMPORTS',
-           'GrammarError',
-           'CompilationError',
-           'load_compiler_suite',
-           'compileDSL',
-           'raw_compileEBNF',
-           'compileEBNF',
-           'grammar_provider',
-           'compile_on_disk',
-           'recompile_grammar')
-
-
-SECTION_MARKER = """\n
-#######################################################################
-#
-# {marker}
-#
-#######################################################################
-\n"""
-
-RX_SECTION_MARKER = re.compile(SECTION_MARKER.format(marker=r'.*?SECTION.*?'))
-RX_WHITESPACE = re.compile(r'\s*')
-
-SYMBOLS_SECTION = "SYMBOLS SECTION - Can be edited. Changes will be preserved."
-PREPROCESSOR_SECTION = "PREPROCESSOR SECTION - Can be edited. Changes will be preserved."
-PARSER_SECTION = "PARSER SECTION - Don't edit! CHANGES WILL BE OVERWRITTEN!"
-AST_SECTION = "AST SECTION - Can be edited. Changes will be preserved."
-COMPILER_SECTION = "COMPILER SECTION - Can be edited. Changes will be preserved."
-END_SECTIONS_MARKER = "END OF DHPARSER-SECTIONS"
-
-
-dhparserdir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
-
-
-DHPARSER_IMPORTS = '''
-from functools import partial
-import os
-import sys
-
-sys.path.append(r'{dhparserdir}')
-
-try:
-    import regex as re
-except ImportError:
-    import re
-from DHParser import logging, is_filename, load_if_file, \\
-    Grammar, Compiler, nil_preprocessor, PreprocessorToken, Whitespace, \\
-    Lookbehind, Lookahead, Alternative, Pop, Token, Synonym, AllOf, SomeOf, Unordered, \\
-    Option, NegativeLookbehind, OneOrMore, RegExp, Retrieve, Series, RE, Capture, \\
-    ZeroOrMore, Forward, NegativeLookahead, Required, mixin_comment, compile_source, \\
-    grammar_changed, last_value, counterpart, accumulate, PreprocessorFunc, \\
-    Node, TransformationFunc, TransformationDict, \\
-    traverse, remove_children_if, merge_children, is_anonymous, \\
-    reduce_single_child, replace_by_single_child, replace_or_reduce, remove_whitespace, \\
-    remove_expendables, remove_empty, remove_tokens, flatten, is_whitespace, \\
-    is_empty, is_expendable, collapse, replace_content, WHITESPACE_PTYPE, TOKEN_PTYPE, \\
-    remove_nodes, remove_content, remove_brackets, replace_parser, \\
-    keep_children, is_one_of, has_content, apply_if, remove_first, remove_last, \\
-    remove_anonymous_empty, keep_nodes, traverse_locally, strip, lstrip, rstrip
-'''.format(dhparserdir=dhparserdir)
-
-
-DHPARSER_MAIN = '''
-def compile_src(source, log_dir=''):
-    """Compiles ``source`` and returns (result, errors, ast).
-    """
-    with logging(log_dir):
-        compiler = get_compiler()
-        cname = compiler.__class__.__name__
-        log_file_name = os.path.basename(os.path.splitext(source)[0]) \\
-            if is_filename(source) else cname[:cname.find('.')] + '_out'
-        result = compile_source(source, get_preprocessor(),
-                                get_grammar(),
-                                get_transformer(), compiler)
-    return result
-
-
-if __name__ == "__main__":
-    if len(sys.argv) > 1:
-        try:
-            grammar_file_name = os.path.basename(__file__).replace('Compiler.py', '.ebnf')
-            if grammar_changed({NAME}Grammar, grammar_file_name):
-                print("Grammar has changed. Please recompile Grammar first.")
-                sys.exit(1)
-        except FileNotFoundError:
-            print('Could not check for changed grammar, because grammar file "%s" was not found!'
-                  % grammar_file_name)    
-        file_name, log_dir = sys.argv[1], ''
-        if file_name in ['-d', '--debug'] and len(sys.argv) > 2:
-            file_name, log_dir = sys.argv[2], 'LOGS'
-        result, errors, ast = compile_src(file_name, log_dir)
-        if errors:
-            cwd = os.getcwd()
-            rel_path = file_name[len(cwd):] if file_name.startswith(cwd) else file_name
-            for error in errors:
-                print(rel_path + ':' + str(error))
-            sys.exit(1)
-        else:
-            print(result.as_xml() if isinstance(result, Node) else result)
-    else:
-        print("Usage: {NAME}Compiler.py [FILENAME]")
-'''
-
-
-class DSLException(Exception):
-    """
-    Base class for DSL-exceptions.
-    """
-    def __init__(self, errors):
-        assert isinstance(errors, Iterator) or isinstance(errors, list) \
-               or isinstance(errors, tuple)
-        self.errors = errors
-
-    def __str__(self):
-        return '\n'.join(str(err) for err in self.errors)
-
-
-
-class GrammarError(DSLException):
-    """
-    Raised when (already) the grammar of a domain specific language (DSL)
-    contains errors.
-    """
-    def __init__(self, errors, grammar_src):
-        super().__init__(errors)
-        self.grammar_src = grammar_src
-
-
-class CompilationError(DSLException):
-    """
-    Raised when a string or file in a domain specific language (DSL)
-    contains errors.
-    """
-    def __init__(self, errors, dsl_text, dsl_grammar, AST, result):
-        super().__init__(errors)
-        self.dsl_text = dsl_text
-        self.dsl_grammar = dsl_grammar
-        self.AST = AST
-        self.result = result
-
-
-def error_str(messages: Iterable[Error]) -> str:
-    """
-    Returns all true errors (i.e. not just warnings) from the
-    `messages` as a concatenated multiline string.
-    """
-    return '\n\n'.join(str(m) for m in messages if is_error(m.code))
-
-
-def grammar_instance(grammar_representation) -> Tuple[Grammar, str]:
-    """
-    Returns a grammar object and the source code of the grammar, from
-    the given `grammar`-data which can be either a file name, ebnf-code,
-    python-code, a Grammar-derived grammar class or an instance of
-    such a class (i.e. a grammar object already).
-    """
-    if isinstance(grammar_representation, str):
-        # read grammar
-        grammar_src = load_if_file(grammar_representation)
-        if is_python_code(grammar_src):
-            parser_py, messages = grammar_src, []  # type: str, List[Error]
-        else:
-            with logging(False):
-                parser_py, messages, _ = compile_source(
-                    grammar_src, None,
-                    get_ebnf_grammar(), get_ebnf_transformer(), get_ebnf_compiler())
-        if has_errors(messages):
-            raise GrammarError(only_errors(messages), grammar_src)
-        parser_root = compile_python_object(DHPARSER_IMPORTS + parser_py, r'\w+Grammar$')()
-    else:
-        # assume that dsl_grammar is a ParserHQ-object or Grammar class
-        grammar_src = ''
-        if isinstance(grammar_representation, Grammar):
-            parser_root = grammar_representation
-        else:
-            # assume ``grammar_representation`` is a grammar class and get the root object
-            parser_root = grammar_representation()
-    return parser_root, grammar_src
-
-def compileDSL(text_or_file: str,
-               preprocessor: PreprocessorFunc,
-               dsl_grammar: Union[str, Grammar],
-               ast_transformation: TransformationFunc,
-               compiler: Compiler) -> Any:
-    """
-    Compiles a text in a domain specific language (DSL) with an
-    EBNF-specified grammar. Returns the compiled text or raises a
-    compilation error.
-
-    Raises:
-        CompilationError if any errors occurred during compilation
-    """
-    assert isinstance(text_or_file, str)
-    assert isinstance(compiler, Compiler)
-
-    parser, grammar_src = grammar_instance(dsl_grammar)
-    result, messages, AST = compile_source(text_or_file, preprocessor, parser,
-                                           ast_transformation, compiler)
-    if has_errors(messages):
-        src = load_if_file(text_or_file)
-        raise CompilationError(only_errors(messages), src, grammar_src, AST, result)
-    return result
-
-
[docs]def raw_compileEBNF(ebnf_src: str, branding="DSL") -> EBNFCompiler: - """ - Compiles an EBNF grammar file and returns the compiler object - that was used and which can now be queried for the result as well - as skeleton code for preprocessor, transformer and compiler objects. - - Args: - ebnf_src(str): Either the file name of an EBNF grammar or - the EBNF grammar itself as a string. - branding (str): Branding name for the compiler suite source - code. - Returns: - An instance of class ``ebnf.EBNFCompiler`` - Raises: - CompilationError if any errors occurred during compilation - """ - grammar = get_ebnf_grammar() - compiler = get_ebnf_compiler(branding, ebnf_src) - transformer = get_ebnf_transformer() - compileDSL(ebnf_src, nil_preprocessor, grammar, transformer, compiler) - return compiler
- - -
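A sketch of querying the returned compiler object (same assumed grammar file as above):

# Sketch: the returned EBNFCompiler can be asked for the generated
# parser code and for the code skeletons.
ebnf_compiler = raw_compileEBNF('arithmetic.ebnf', branding='Arithmetic')
parser_py = ebnf_compiler.result                     # generated Grammar class
table_py = ebnf_compiler.gen_transformer_skeleton()  # AST-table skeleton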
[docs]def compileEBNF(ebnf_src: str, branding="DSL") -> str: - """ - Compiles an EBNF source file and returns the source code of a - compiler suite with skeletons for preprocessor, transformer and - compiler. - - Args: - ebnf_src(str): Either the file name of an EBNF grammar or - the EBNF grammar itself as a string. - branding (str): Branding name for the compiler suite source - code. - Returns: - The complete compiler suite skeleton as Python source code. - Raises: - CompilationError if any errors occurred during compilation - """ - compiler = raw_compileEBNF(ebnf_src, branding) - src = ["#!/usr/bin/python\n", - SECTION_MARKER.format(marker=SYMBOLS_SECTION), DHPARSER_IMPORTS, - SECTION_MARKER.format(marker=PREPROCESSOR_SECTION), compiler.gen_preprocessor_skeleton(), - SECTION_MARKER.format(marker=PARSER_SECTION), compiler.result, - SECTION_MARKER.format(marker=AST_SECTION), compiler.gen_transformer_skeleton(), - SECTION_MARKER.format(marker=COMPILER_SECTION), compiler.gen_compiler_skeleton(), - SECTION_MARKER.format(marker=SYMBOLS_SECTION), DHPARSER_MAIN.format(NAME=branding)] - return '\n'.join(src)
- - -
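A sketch of saving the generated suite (the file names are assumptions):

# Sketch: write the compiler-suite skeleton to disk.
suite_py = compileEBNF('arithmetic.ebnf', branding='Arithmetic')
with open('ArithmeticCompiler.py', 'w', encoding='utf-8') as f:
    f.write(suite_py)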
[docs]def grammar_provider(ebnf_src: str, branding="DSL") -> Grammar: - """ - Compiles an EBNF grammar and returns a grammar-parser provider - function for that grammar. - - Args: - ebnf_src(str): Either the file name of an EBNF grammar or - the EBNF grammar itself as a string. - branding (str or bool): Branding name for the compiler - suite source code. - - Returns: - A provider function for a grammar object for texts in the - language defined by ``ebnf_src``. - """ - grammar_src = compileDSL(ebnf_src, nil_preprocessor, get_ebnf_grammar(), - get_ebnf_transformer(), get_ebnf_compiler(branding, ebnf_src)) - grammar_obj = compile_python_object(DHPARSER_IMPORTS + grammar_src, r'get_(?:\w+_)?grammar$') - grammar_obj.python_src__ = grammar_src - return grammar_obj
- - -
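A sketch with a toy grammar (the grammar string and the parsed text are assumptions, not part of the module):

# Sketch: obtain a parser for a one-rule key/value grammar and use it.
get_grammar = grammar_provider(r'key = /\w+/~ "=" /\w+/~', branding='KeyValue')
parser = get_grammar()               # Grammar object
syntax_tree = parser('answer = 42')  # concrete syntax tree (a Node)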
[docs]def load_compiler_suite(compiler_suite: str) -> \ - Tuple[PreprocessorFactoryFunc, ParserFactoryFunc, - TransformerFactoryFunc, CompilerFactoryFunc]: - """ - Extracts a compiler suite from file or string `compiler_suite` - and returns it as a tuple (preprocessor, parser, ast, compiler). - - Returns: - 4-tuple (preprocessor function, parser class, - ast transformer function, compiler class) - """ - global RX_SECTION_MARKER - assert isinstance(compiler_suite, str) - source = load_if_file(compiler_suite) - imports = DHPARSER_IMPORTS - if is_python_code(compiler_suite): - try: - _, imports, preprocessor_py, parser_py, ast_py, compiler_py, _ = \ - RX_SECTION_MARKER.split(source) - except ValueError: - raise AssertionError('File "' + compiler_suite + '" seems to be corrupted. ' - 'Please delete or repair file manually.') - # TODO: Compile in one step and pick parts from namespace later ? - preprocessor = compile_python_object(imports + preprocessor_py, - r'get_(?:\w+_)?preprocessor$') - parser = compile_python_object(imports + parser_py, r'get_(?:\w+_)?grammar$') - ast = compile_python_object(imports + ast_py, r'get_(?:\w+_)?transformer$') - else: - # Assume source is an ebnf grammar. - # Is there really any reasonable application case for this? - with logging(False): - compiler_py, messages, n = compile_source(source, None, get_ebnf_grammar(), - get_ebnf_transformer(), - get_ebnf_compiler(compiler_suite, source)) - if has_errors(messages): - raise GrammarError(only_errors(messages), source) - preprocessor = get_ebnf_preprocessor - parser = get_ebnf_grammar - ast = get_ebnf_transformer - compiler = compile_python_object(imports + compiler_py, r'get_(?:\w+_)?compiler$') - - return preprocessor, parser, ast, compiler
- - -def is_outdated(compiler_suite: str, grammar_source: str) -> bool: - """ - Returns ``True`` if the ``compiler_suite`` needs to be updated. - - An update is needed, if either the grammar in the compiler suite - does not reflect the latest changes of ``grammar_source`` or if - sections from the compiler suite have deliberately been overwritten - with whitespace in order to trigger their recreation. Note: Do not - delete or overwrite the section marker itself. - - Args: - compiler_suite: the parser class representing the grammar - or the file name of a compiler suite containing the grammar - grammar_source: File name or string representation of the - EBNF code of the grammar - - Returns (bool): - True, if ``compiler_suite`` seems to be out of date. - """ - try: - n1, grammar, n2, n3 = load_compiler_suite(compiler_suite) - return grammar_changed(grammar(), grammar_source) - except ValueError: - return True - - -def run_compiler(text_or_file: str, compiler_suite: str) -> Any: - """Compiles a source with a given compiler suite. - - Args: - text_or_file (str): Either the file name of the source code or - the source code directly. (Which is determined by - heuristics. If ``text_or_file`` contains at least one - linefeed then it is always assumed to be a source text and - not a file name.) - compiler_suite(str): File name of the compiler suite to be - used. - - Returns: - The result of the compilation, the form and type of which - depends entirely on the compiler. - - Raises: - CompilationError - """ - preprocessor, parser, ast, compiler = load_compiler_suite(compiler_suite) - return compileDSL(text_or_file, preprocessor(), parser(), ast(), compiler()) - - -
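A usage sketch (both file names are assumptions):

# Sketch: compile a source file with a previously generated compiler suite.
result = run_compiler('document.dsl', 'ExampleCompiler.py')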
[docs]def compile_on_disk(source_file: str, compiler_suite="", extension=".xml") -> Iterable[Error]: - """ - Compiles a source file with a given compiler and writes the - result to a file. - - If no ``compiler_suite`` is given it is assumed that the source - file is an EBNF grammar. In this case the result will be a Python - script containing a parser for that grammar as well as the - skeletons for a preprocessor, AST transformation table, and compiler. - If the Python script already exists, only the parser part of the - script will be updated. (For this to work, the different parts - need to be delimited by section marker blocks.) `compile_on_disk()` - returns a list of error messages or an empty list if no errors - occurred. - - Parameters: - source_file(str): The file name of the source text to be - compiled. - compiler_suite(str): The file name of the compiler suite - (usually ending with 'Compiler.py'), with which the source - file shall be compiled. If this is left empty, the source - file is assumed to be an EBNF-Grammar that will be compiled - with the internal EBNF-Compiler. - extension(str): The result of the compilation (if successful) - is written to a file with the same name but a different - extension than the source file. This parameter sets the - extension. - - Returns: - A (potentially empty) list of error or warning messages. - """ - filepath = os.path.normpath(source_file) - with open(source_file, encoding="utf-8") as f: - source = f.read() - rootname = os.path.splitext(filepath)[0] - compiler_name = os.path.basename(rootname) - if compiler_suite: - sfactory, pfactory, tfactory, cfactory = load_compiler_suite(compiler_suite) - else: - sfactory = get_ebnf_preprocessor - pfactory = get_ebnf_grammar - tfactory = get_ebnf_transformer - cfactory = get_ebnf_compiler - compiler1 = cfactory() - compiler1.set_grammar_name(compiler_name, source_file) - result, messages, AST = compile_source(source, sfactory(), pfactory(), tfactory(), compiler1) - if has_errors(messages): - return messages - - elif cfactory == get_ebnf_compiler: - # trans == get_ebnf_transformer or trans == EBNFTransformer: - # either an EBNF- or no compiler suite given - ebnf_compiler = cast(EBNFCompiler, compiler1) - global SECTION_MARKER, RX_SECTION_MARKER, PREPROCESSOR_SECTION, PARSER_SECTION, \ - AST_SECTION, COMPILER_SECTION, END_SECTIONS_MARKER, RX_WHITESPACE, \ - DHPARSER_MAIN, DHPARSER_IMPORTS - f = None - try: - f = open(rootname + 'Compiler.py', 'r', encoding="utf-8") - source = f.read() - sections = RX_SECTION_MARKER.split(source) - intro, imports, preprocessor, parser, ast, compiler, outro = sections - # TODO: Verify transformation table - ast_trans_table = compile_python_object(DHPARSER_IMPORTS + ast, - r'(?:\w+_)?AST_transformation_table$') - messages.extend(ebnf_compiler.verify_transformation_table(ast_trans_table)) - except (PermissionError, FileNotFoundError, IOError) as error: - intro, imports, preprocessor, parser, ast, compiler, outro = '', '', '', '', '', '', '' - except ValueError as error: - name = '"' + rootname + 'Compiler.py"' - raise ValueError('Could not identify all required sections in ' + name - + '. Please delete or repair ' + name + ' manually!') - finally: - if f: - f.close() - f = None - - if RX_WHITESPACE.fullmatch(intro): - intro = '#!/usr/bin/python' - if RX_WHITESPACE.fullmatch(outro): - outro = DHPARSER_MAIN.format(NAME=compiler_name) - if RX_WHITESPACE.fullmatch(imports): - imports = DHPARSER_IMPORTS - if RX_WHITESPACE.fullmatch(preprocessor): - preprocessor = ebnf_compiler.gen_preprocessor_skeleton() - if RX_WHITESPACE.fullmatch(ast): - ast = ebnf_compiler.gen_transformer_skeleton() - if RX_WHITESPACE.fullmatch(compiler): - compiler = ebnf_compiler.gen_compiler_skeleton() - - compilerscript = rootname + 'Compiler.py' - try: - f = open(compilerscript, 'w', encoding="utf-8") - f.write(intro) - f.write(SECTION_MARKER.format(marker=SYMBOLS_SECTION)) - f.write(imports) - f.write(SECTION_MARKER.format(marker=PREPROCESSOR_SECTION)) - f.write(preprocessor) - f.write(SECTION_MARKER.format(marker=PARSER_SECTION)) - f.write(result) - f.write(SECTION_MARKER.format(marker=AST_SECTION)) - f.write(ast) - f.write(SECTION_MARKER.format(marker=COMPILER_SECTION)) - f.write(compiler) - f.write(SECTION_MARKER.format(marker=END_SECTIONS_MARKER)) - f.write(outro) - except (PermissionError, FileNotFoundError, IOError) as error: - print('# Could not write file "' + compilerscript + '" because of: ' - + "\n# ".join(str(error).split('\n'))) - print(result) - finally: - if f: - f.close() - - if platform.system() != "Windows": - # set file permissions so that the compilerscript can be executed - st = os.stat(compilerscript) - os.chmod(compilerscript, st.st_mode | stat.S_IEXEC) - - else: - f = None - try: - f = open(rootname + extension, 'w', encoding="utf-8") - if isinstance(result, Node): - if extension.lower() == '.xml': - f.write(result.as_xml()) - else: - f.write(result.as_sxpr()) - else: - f.write(result) - except (PermissionError, FileNotFoundError, IOError) as error: - print('# Could not write file "' + rootname + extension + '" because of: ' - + "\n# ".join(str(error).split('\n'))) - print(result) - finally: - if f: - f.close() - - return messages
- - -
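A usage sketch (the grammar file name is an assumption):

# Sketch: compile a grammar on disk; any returned messages signal
# errors or warnings.
for message in compile_on_disk('arithmetic.ebnf'):
    print(message)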
[docs]def recompile_grammar(ebnf_filename, force=False) -> bool: - """ - Re-compiles an EBNF-grammar if necessary, that is, if either no - corresponding 'XXXXCompiler.py'-file exists or if that file is - outdated. - - Parameters: - ebnf_filename(str): The filename of the ebnf-source of the - grammar. In case this is a directory and not a file, all - files within this directory ending with .ebnf will be - compiled. - force(bool): If False (default), the grammar will only be - recompiled if it has been changed. - """ - if os.path.isdir(ebnf_filename): - success = True - for entry in os.listdir(ebnf_filename): - entry_path = os.path.join(ebnf_filename, entry) # listdir() yields bare names - if entry.lower().endswith('.ebnf') and os.path.isfile(entry_path): - # recurse first, lest short-circuiting skips the remaining grammars - success = recompile_grammar(entry_path, force) and success - return success - - base, ext = os.path.splitext(ebnf_filename) - compiler_name = base + 'Compiler.py' - error_file_name = base + '_ebnf_ERRORS.txt' - messages = [] # type: Iterable[Error] - if (not os.path.exists(compiler_name) or force or - grammar_changed(compiler_name, ebnf_filename)): - # print("recompiling parser for: " + ebnf_filename) - messages = compile_on_disk(ebnf_filename) - if messages: - # print("Errors while compiling: " + ebnf_filename + '!') - with open(error_file_name, 'w', encoding="utf-8") as f: - for e in messages: - f.write(str(e)) - f.write('\n') - if has_errors(messages): - return False - - if not messages and os.path.exists(error_file_name): - os.remove(error_file_name) - return True
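A sketch of the typical call (the grammar file name is an assumption); on failure, the error file written by recompile_grammar() can be inspected:

# Sketch: recompile only if the grammar has changed; report errors otherwise.
if not recompile_grammar('arithmetic.ebnf'):
    with open('arithmetic_ebnf_ERRORS.txt', encoding='utf-8') as f:
        print(f.read())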
\ No newline at end of file diff --git a/documentation/_modules/ebnf.html b/documentation/_modules/ebnf.html deleted file mode 100644 index 8018280a72c7213fc6083d35555896816a7406bb..0000000000000000000000000000000000000000 --- a/documentation/_modules/ebnf.html +++ /dev/null @@ -1,1189 +0,0 @@ - ebnf — DHParser 0.8 documentation
Source code for ebnf

-# ebnf.py - EBNF -> Python-Parser compilation for DHParser
-#
-# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
-#                 Bavarian Academy of Sciences and Humanities (badw.de)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-# implied.  See the License for the specific language governing
-# permissions and limitations under the License.
-
-
-"""
-Module ``ebnf`` provides a self-hosting parser for EBNF-Grammars as
-well as an EBNF-compiler that compiles an EBNF-Grammar into a
-DHParser-based Grammar class that can be executed to parse source text
-conforming to this grammar into concrete syntax trees.
-"""
-
-
-import keyword
-from collections import OrderedDict
-from functools import partial
-
-from DHParser.compile import CompilerError, Compiler
-from DHParser.error import Error
-from DHParser.parse import Grammar, mixin_comment, Forward, RegExp, Whitespace, RE, \
-    NegativeLookahead, Alternative, Series, Option, OneOrMore, ZeroOrMore, Token
-from DHParser.preprocess import nil_preprocessor, PreprocessorFunc
-from DHParser.syntaxtree import Node, WHITESPACE_PTYPE, TOKEN_PTYPE
-from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name, re, expand_table, \
-    typing
-from DHParser.transform import TransformationFunc, traverse, remove_brackets, \
-    reduce_single_child, replace_by_single_child, remove_expendables, \
-    remove_tokens, flatten, forbid, assert_content, remove_infix_operator
-from DHParser.versionnumber import __version__
-from typing import Callable, Dict, List, Set, Tuple
-
-
-__all__ = ('get_ebnf_preprocessor',
-           'get_ebnf_grammar',
-           'get_ebnf_transformer',
-           'get_ebnf_compiler',
-           'EBNFGrammar',
-           'EBNFTransform',
-           'EBNFCompilerError',
-           'EBNFCompiler',
-           'grammar_changed',
-           'PreprocessorFactoryFunc',
-           'ParserFactoryFunc',
-           'TransformerFactoryFunc',
-           'CompilerFactoryFunc')
-
-
-########################################################################
-#
-# EBNF scanning
-#
-########################################################################
-
-
-def get_ebnf_preprocessor() -> PreprocessorFunc:
-    return nil_preprocessor
-
-
-########################################################################
-#
-# EBNF parsing
-#
-########################################################################
-
-
-
[docs]class EBNFGrammar(Grammar): - r""" - Parser for an EBNF source file, with this grammar:: - - # EBNF-Grammar in EBNF - - @ comment = /#.*(?:\n|$)/ # comments start with '#' and eat all chars up to and including '\n' - @ whitespace = /\s*/ # whitespace includes linefeed - @ literalws = right # trailing whitespace of literals will be ignored tacitly - - syntax = [~//] { definition | directive } §EOF - definition = symbol §"=" expression - directive = "@" §symbol "=" ( regexp | literal | list_ ) - - expression = term { "|" term } - term = { ["§"] factor }+ # "§" means all following factors mandatory - factor = [flowmarker] [retrieveop] symbol !"=" # negative lookahead to be sure it's not a definition - | [flowmarker] literal - | [flowmarker] plaintext - | [flowmarker] regexp - | [flowmarker] whitespace - | [flowmarker] oneormore - | [flowmarker] group - | [flowmarker] unordered - | repetition - | option - - flowmarker = "!" | "&" # '!' negative lookahead, '&' positive lookahead - | "-!" | "-&" # '-' negative lookbehind, '-&' positive lookbehind - retrieveop = "::" | ":" # '::' pop, ':' retrieve - - group = "(" §expression ")" - unordered = "<" §expression ">" # elements of expression in arbitrary order - oneormore = "{" expression "}+" - repetition = "{" §expression "}" - option = "[" §expression "]" - - symbol = /(?!\d)\w+/~ # e.g. expression, factor, parameter_list - literal = /"(?:[^"]|\\")*?"/~ # e.g. "(", '+', 'while' - | /'(?:[^']|\\')*?'/~ # whitespace following literals will be ignored tacitly. - plaintext = /`(?:[^"]|\\")*?`/~ # like literal but does not eat whitespace - regexp = /~?\/(?:\\\/|[^\/])*?\/~?/~ # e.g. /\w+/, ~/#.*(?:\n|$)/~ - # '~' is a whitespace-marker, if present leading or trailing - # whitespace of a regular expression will be ignored tacitly. - whitespace = /~/~ # implicit or default whitespace - list_ = /\w+/~ { "," /\w+/~ } # comma separated list of symbols, e.g. 
BEGIN_LIST, END_LIST, - # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an example - EOF = !/./ - """ - expression = Forward() - source_hash__ = "3fc9f5a340f560e847d9af0b61a68743" - parser_initialization__ = "upon instantiation" - COMMENT__ = r'#.*(?:\n|$)' - WHITESPACE__ = r'\s*' - WSP__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__) - wspL__ = '' - wspR__ = WSP__ - whitespace__ = Whitespace(WSP__) - EOF = NegativeLookahead(RegExp('.')) - list_ = Series(RE('\\w+'), ZeroOrMore(Series(Token(","), RE('\\w+')))) - whitespace = RE('~') - regexp = RE('~?/(?:\\\\/|[^/])*?/~?') - plaintext = RE('`(?:[^"]|\\\\")*?`') - literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'")) - symbol = RE('(?!\\d)\\w+') - option = Series(Token("["), expression, Token("]"), mandatory=1) - repetition = Series(Token("{"), expression, Token("}"), mandatory=1) - oneormore = Series(Token("{"), expression, Token("}+")) - unordered = Series(Token("<"), expression, Token(">"), mandatory=1) - group = Series(Token("("), expression, Token(")"), mandatory=1) - retrieveop = Alternative(Token("::"), Token(":")) - flowmarker = Alternative(Token("!"), Token("&"), Token("-!"), Token("-&")) - factor = Alternative(Series(Option(flowmarker), Option(retrieveop), symbol, NegativeLookahead(Token("="))), - Series(Option(flowmarker), literal), Series(Option(flowmarker), plaintext), - Series(Option(flowmarker), regexp), Series(Option(flowmarker), whitespace), - Series(Option(flowmarker), oneormore), Series(Option(flowmarker), group), - Series(Option(flowmarker), unordered), repetition, option) - term = OneOrMore(Series(Option(Token("§")), factor)) - expression.set(Series(term, ZeroOrMore(Series(Token("|"), term)))) - directive = Series(Token("@"), symbol, Token("="), Alternative(regexp, literal, list_), mandatory=1) - definition = Series(symbol, Token("="), expression, mandatory=1) - syntax = Series(Option(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)), EOF, mandatory=2) - root__ = syntax
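A sketch of using this parser on a one-rule grammar (the rule is an assumption; Grammar objects are callable with the source text):

# Sketch: parse EBNF source into a concrete syntax tree.
syntax_tree = get_ebnf_grammar()(r'word = /\w+/~')
print(syntax_tree.as_sxpr())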
- - -
[docs]def grammar_changed(grammar_class, grammar_source: str) -> bool: - """ - Returns ``True`` if ``grammar_class`` does not reflect the latest - changes of ``grammar_source`` - - Parameters: - grammar_class: the parser class representing the grammar - or the file name of a compiler suite containing the grammar - grammar_source: File name or string representation of the - EBNF code of the grammar - - Returns (bool): - True, if the source text of the grammar is different from the - source from which the grammar class was generated - """ - grammar = load_if_file(grammar_source) - chksum = md5(grammar, __version__) - if isinstance(grammar_class, str): - # grammar_class = load_compiler_suite(grammar_class)[1] - with open(grammar_class, 'r', encoding='utf8') as f: - pycode = f.read() - m = re.search(r'class \w*\(Grammar\)', pycode) - if m: - m = re.search(' source_hash__ *= *"([a-z0-9]*)"', - pycode[m.span()[1]:]) - return not (m and m.groups() and m.groups()[-1] == chksum) - else: - return True - else: - return chksum != grammar_class.source_hash__
- - -def get_ebnf_grammar() -> EBNFGrammar: - global thread_local_ebnf_grammar_singleton - try: - grammar = thread_local_ebnf_grammar_singleton - return grammar - except NameError: - thread_local_ebnf_grammar_singleton = EBNFGrammar() - return thread_local_ebnf_grammar_singleton - - -######################################################################## -# -# EBNF concrete to abstract syntax tree transformation and validation -# -######################################################################## - - -EBNF_AST_transformation_table = { - # AST Transformations for EBNF-grammar - "+": - remove_expendables, - "syntax": - [], # otherwise '"*": replace_by_single_child' would be applied - "directive, definition": - remove_tokens('@', '='), - "expression": - [replace_by_single_child, flatten, remove_tokens('|')], # remove_infix_operator], - "term": - [replace_by_single_child, flatten], # supports both idioms: - # "{ factor }+" and "factor { factor }" - "factor, flowmarker, retrieveop": - replace_by_single_child, - "group": - [remove_brackets, replace_by_single_child], - "unordered": - remove_brackets, - "oneormore, repetition, option": - [reduce_single_child, remove_brackets, - forbid('repetition', 'option', 'oneormore'), assert_content(r'(?!§)(?:.|\n)*')], - "symbol, literal, regexp": - reduce_single_child, - (TOKEN_PTYPE, WHITESPACE_PTYPE): - reduce_single_child, - "list_": - [flatten, remove_infix_operator], - "*": - replace_by_single_child -} - - -def EBNFTransform() -> TransformationFunc: - return partial(traverse, processing_table=EBNF_AST_transformation_table.copy()) - -def get_ebnf_transformer() -> TransformationFunc: - global thread_local_EBNF_transformer_singleton - try: - transformer = thread_local_EBNF_transformer_singleton - except NameError: - thread_local_EBNF_transformer_singleton = EBNFTransform() - transformer = thread_local_EBNF_transformer_singleton - return transformer - - -######################################################################## -# -# EBNF abstract syntax tree to Python parser compilation -# -######################################################################## - - -PreprocessorFactoryFunc = Callable[[], PreprocessorFunc] -ParserFactoryFunc = Callable[[], Grammar] -TransformerFactoryFunc = Callable[[], TransformationFunc] -CompilerFactoryFunc = Callable[[], Compiler] - -PREPROCESSOR_FACTORY = ''' -def get_preprocessor() -> PreprocessorFunc: - return {NAME}Preprocessor -''' - - -GRAMMAR_FACTORY = ''' -def get_grammar() -> {NAME}Grammar: - global thread_local_{NAME}_grammar_singleton - try: - grammar = thread_local_{NAME}_grammar_singleton - except NameError: - thread_local_{NAME}_grammar_singleton = {NAME}Grammar() - grammar = thread_local_{NAME}_grammar_singleton - return grammar -''' - - -TRANSFORMER_FACTORY = ''' -def {NAME}Transform() -> TransformationDict: - return partial(traverse, processing_table={NAME}_AST_transformation_table.copy()) - -def get_transformer() -> TransformationFunc: - global thread_local_{NAME}_transformer_singleton - try: - transformer = thread_local_{NAME}_transformer_singleton - except NameError: - thread_local_{NAME}_transformer_singleton = {NAME}Transform() - transformer = thread_local_{NAME}_transformer_singleton - return transformer -''' - - -COMPILER_FACTORY = ''' -def get_compiler(grammar_name="{NAME}", grammar_source="") -> {NAME}Compiler: - global thread_local_{NAME}_compiler_singleton - try: - compiler = thread_local_{NAME}_compiler_singleton - compiler.set_grammar_name(grammar_name, grammar_source) - except 
NameError: - thread_local_{NAME}_compiler_singleton = \\ - {NAME}Compiler(grammar_name, grammar_source) - compiler = thread_local_{NAME}_compiler_singleton - return compiler -''' - - -
[docs]class EBNFCompilerError(CompilerError): - """Error raised by `EBNFCompiler` class. (Not compilation errors - in the strict sense, see `CompilationError` in module ``dsl.py``)""" - pass
- - -
[docs]class EBNFCompiler(Compiler): - """ - Generates a Parser from an abstract syntax tree of a grammar specified - in EBNF-Notation. - - Instances of this class must be called with the root-node of the - abstract syntax tree from an EBNF-specification of a formal language. - The returned value is the Python-source-code of a Grammar class for - this language that can be used to parse texts in this language. - See classes `parser.Compiler` and `parser.Grammar` for more information. - - Additionally, class EBNFCompiler provides helper methods to generate - code-skeletons for a preprocessor, AST-transformation and full - compilation of the formal language. These methods' names start with - the prefix `gen_`. - - Attributes: - current_symbols: During compilation, a list containing the root - node of the currently compiled definition as first element - and then the nodes of the symbols that are referred to in - the currently compiled definition. - - rules: Dictionary that maps rule names to a list of Nodes that - contain symbol-references in the definition of the rule. - The first item in the list is the node of the rule- - definition itself. Example: - - `alternative = a | b` - - Now `[node.content for node in self.rules['alternative']]` - yields `['alternative = a | b', 'a', 'b']` - - symbols: A mapping of symbol names to their first usage (not - their definition!) in the EBNF source. - - variables: A set of symbol names that are used with the - Pop or Retrieve operator. Because the values of these - symbols need to be captured they are called variables. - See `test_parser.TestPopRetrieve` for an example. - - recursive: A set of symbols that are used recursively and - therefore require a `Forward`-operator. - - definitions: A dictionary of definitions. Unlike `rules`, - this maps the symbols to their compiled definienda. - - deferred_tasks: A list of callables that is filled during - compilation, but that will be executed only after - compilation has finished. Typically, it contains - semantic checks that require information that - is only available upon completion of compilation. - - root_symbol: The name of the root symbol. - - directives: A dictionary of all directives and their default - values. - - re_flags: A set of regular expression flags to be added to all - regular expressions found in the current parsing process - """ - COMMENT_KEYWORD = "COMMENT__" - WHITESPACE_KEYWORD = "WSP__" - RAW_WS_KEYWORD = "WHITESPACE__" - WHITESPACE_PARSER_KEYWORD = "whitespace__" - RESERVED_SYMBOLS = {WHITESPACE_KEYWORD, RAW_WS_KEYWORD, COMMENT_KEYWORD} - AST_ERROR = "Badly structured syntax tree. " \ - "Potentially due to erroneous AST transformation."
- PREFIX_TABLE = {'§': 'Required', - '&': 'Lookahead', '!': 'NegativeLookahead', - '-&': 'Lookbehind', '-!': 'NegativeLookbehind', - '::': 'Pop', ':': 'Retrieve'} - WHITESPACE = {'horizontal': r'[\t ]*', # default: horizontal - 'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*', - 'vertical': r'\s*'} - REPEATABLE_DIRECTIVES = {'tokens'} - - - def __init__(self, grammar_name="", grammar_source=""): - super(EBNFCompiler, self).__init__(grammar_name, grammar_source) - self._reset() - - - def _reset(self): - super(EBNFCompiler, self)._reset() - self._result = '' # type: str - self.re_flags = set() # type: Set[str] - self.rules = OrderedDict() # type: OrderedDict[str, List[Node]] - self.current_symbols = [] # type: List[Node] - self.symbols = {} # type: Dict[str, Node] - self.variables = set() # type: Set[str] - self.recursive = set() # type: Set[str] - self.definitions = {} # type: Dict[str, str] - self.deferred_tasks = [] # type: List[Callable] - self.root_symbol = "" # type: str - self.directives = {'whitespace': self.WHITESPACE['vertical'], - 'comment': '', - 'literalws': {'right'}, - 'tokens': set(), # alt. 'preprocessor_tokens' - 'filter': dict()} # alt. 'filter' - # self.directives['ignorecase']: False - self.defined_directives = set() # type: Set[str] - - @property - def result(self) -> str: - return self._result - - # methods for generating skeleton code for preprocessor, transformer, and compiler - -
[docs] def gen_preprocessor_skeleton(self) -> str: - """ - Returns Python-skeleton-code for a preprocessor-function for - the previously compiled formal language. - """ - name = self.grammar_name + "Preprocessor" - return "def %s(text):\n return text, lambda i: i\n" % name \ - + PREPROCESSOR_FACTORY.format(NAME=self.grammar_name)
- - -
[docs] def gen_transformer_skeleton(self) -> str: - """ - Returns Python-skeleton-code for the AST-transformation for the - previously compiled formal language. - """ - if not self.rules: - raise EBNFCompilerError('Compiler must be run before calling ' - '"gen_transformer_skeleton()"!') - tt_name = self.grammar_name + '_AST_transformation_table' - transtable = [tt_name + ' = {', - ' # AST Transformations for the ' + self.grammar_name + '-grammar'] - transtable.append(' "+": remove_empty,') - for name in self.rules: - transformations = '[]' - rule = self.definitions[name] - if rule.startswith('Alternative'): - transformations = '[replace_or_reduce]' - elif rule.startswith('Synonym'): - transformations = '[reduce_single_child]' - transtable.append(' "' + name + '": %s,' % transformations) - transtable.append(' ":Token, :RE": reduce_single_child,') - transtable += [' "*": replace_by_single_child', '}', ''] - transtable += [TRANSFORMER_FACTORY.format(NAME=self.grammar_name)] - return '\n'.join(transtable)
- - -
[docs] def gen_compiler_skeleton(self) -> str: - """ - Returns Python-skeleton-code for a Compiler-class for the - previously compiled formal language. - """ - if not self.rules: - raise EBNFCompilerError('Compiler has not been run before calling ' - '"gen_compiler_skeleton()"!') - compiler = ['class ' + self.grammar_name + 'Compiler(Compiler):', - ' """Compiler for the abstract-syntax-tree of a ' + - self.grammar_name + ' source file.', - ' """', '', - ' def __init__(self, grammar_name="' + - self.grammar_name + '", grammar_source=""):', - ' super(' + self.grammar_name + - 'Compiler, self).__init__(grammar_name, grammar_source)', - r" assert re.match('\w+\Z', grammar_name)", ''] - for name in self.rules: - method_name = Compiler.method_name(name) - if name == self.root_symbol: - compiler += [' def ' + method_name + '(self, node):', - ' return self.fallback_compiler(node)', ''] - else: - compiler += [' # def ' + method_name + '(self, node):', - ' # return node', ''] - compiler += [COMPILER_FACTORY.format(NAME=self.grammar_name)] - return '\n'.join(compiler)
- -
[docs] def verify_transformation_table(self, transtable): - """ - Checks for symbols that occur in the transformation-table but have - never been defined in the grammar. Usually, this kind of - inconsistency results from an error like a typo in the transformation - table. - """ - assert self._dirty_flag - table_entries = set(expand_table(transtable).keys()) - {'*', '+', '~'} - symbols = self.rules.keys() - messages = [] - for entry in table_entries: - if entry not in symbols and not entry.startswith(":"): - messages.append(Error(('Symbol "%s" is not defined in grammar %s but appears in ' - 'the transformation table!') % (entry, self.grammar_name), - Error.UNDEFINED_SYMBOL_IN_TRANSFORMATION_TABLE, 0)) - return messages
- - -
[docs] def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str: - """ - Creates the Python code for the parser after compilation of - the EBNF-Grammar. - """ - - # execute deferred tasks, for example semantic checks that cannot - # be done before the symbol table is complete - - for task in self.deferred_tasks: - task() - - # provide for capturing of symbols that are variables, i.e. the - # value of which will be retrieved at some point during the parsing process - - if self.variables: - for i in range(len(definitions)): - if definitions[i][0] in self.variables: - definitions[i] = (definitions[i][0], 'Capture(%s)' % definitions[i][1]) - - # add special fields for Grammar class - - definitions.append((self.WHITESPACE_PARSER_KEYWORD, - 'Whitespace(%s)' % self.WHITESPACE_KEYWORD)) - definitions.append(('wspR__', self.WHITESPACE_KEYWORD - if 'right' in self.directives['literalws'] else "''")) - definitions.append(('wspL__', self.WHITESPACE_KEYWORD - if 'left' in self.directives['literalws'] else "''")) - definitions.append((self.WHITESPACE_KEYWORD, - ("mixin_comment(whitespace=" + self.RAW_WS_KEYWORD + - ", comment=" + self.COMMENT_KEYWORD + ")"))) - definitions.append((self.RAW_WS_KEYWORD, "r'{whitespace}'".format(**self.directives))) - definitions.append((self.COMMENT_KEYWORD, "r'{comment}'".format(**self.directives))) - - # prepare parser class header and docstring and - # add EBNF grammar to the doc string of the parser class - - article = 'an ' if self.grammar_name[0:1] in "AaEeIiOoUu" else 'a ' # what about 'hour', 'universe' etc.? - declarations = ['class ' + self.grammar_name + - 'Grammar(Grammar):', - 'r"""Parser for ' + article + self.grammar_name + - ' source file' + - (', with this grammar:' if self.grammar_source else '.')] - definitions.append(('parser_initialization__', '"upon instantiation"')) - if self.grammar_source: - definitions.append(('source_hash__', - '"%s"' % md5(self.grammar_source, __version__))) - declarations.append('') - declarations += [line for line in self.grammar_source.split('\n')] - while declarations[-1].strip() == '': - declarations = declarations[:-1] - declarations.append('"""') - - # turn definitions into declarations in reverse order - - self.root_symbol = definitions[0][0] if definitions else "" - definitions.reverse() - declarations += [symbol + ' = Forward()' - for symbol in sorted(list(self.recursive))] - for symbol, statement in definitions: - if symbol in self.recursive: - declarations += [symbol + '.set(' + statement + ')'] - else: - declarations += [symbol + ' = ' + statement] - - # check for symbols used but never defined - - defined_symbols = set(self.rules.keys()) | self.RESERVED_SYMBOLS - for symbol in self.symbols: - if symbol not in defined_symbols: - self.tree.new_error(self.symbols[symbol], - "Missing definition for symbol '%s'" % symbol) - # root_node.error_flag = True - - # check for unconnected rules - - defined_symbols.difference_update(self.RESERVED_SYMBOLS) - - def remove_connections(symbol): - """Recursively removes all symbols which appear in the - definiens of a particular symbol.""" - if symbol in defined_symbols: - defined_symbols.remove(symbol) - for related in self.rules[symbol][1:]: - remove_connections(str(related)) - - remove_connections(self.root_symbol) - for leftover in defined_symbols: - self.tree.new_error(self.rules[leftover][0], - ('Rule "%s" is not connected to parser root "%s" !') % - (leftover, self.root_symbol), Error.WARNING) - - # set root_symbol parser and assemble python grammar
definition - - if self.root_symbol and 'root__' not in self.rules: - declarations.append('root__ = ' + self.root_symbol) - declarations.append('') - self._result = '\n '.join(declarations) \ - + GRAMMAR_FACTORY.format(NAME=self.grammar_name) - return self._result
- - - ## compilation methods - - def on_syntax(self, node: Node) -> str: - definitions = [] # type: List[Tuple[str, str]] - - # drop the wrapping sequence node - if len(node.children) == 1 and not node.children[0].parser.name: - node = node.children[0] - - # compile definitions and directives and collect definitions - for nd in node.children: - if nd.parser.name == "definition": - definitions.append(self.compile(nd)) - else: - assert nd.parser.name == "directive", nd.as_sxpr() - self.compile(nd) - # node.error_flag = max(node.error_flag, nd.error_flag) - self.definitions.update(definitions) - - return self.assemble_parser(definitions, node) - - - def on_definition(self, node: Node) -> Tuple[str, str]: - rule = node.children[0].content - if rule in self.rules: - first = self.rules[rule][0] - if not first.errors: - self.tree.new_error(first, 'First definition of rule "%s" ' - 'followed by illegal redefinitions.' % rule) - self.tree.new_error(node, 'A rule "%s" has already been defined earlier.' % rule) - elif rule in EBNFCompiler.RESERVED_SYMBOLS: - self.tree.new_error(node, 'Symbol "%s" is a reserved symbol.' % rule) - elif not sane_parser_name(rule): - self.tree.new_error(node, 'Illegal symbol "%s". Symbols must not start or ' - 'end with a double underscore "__".' % rule) - elif rule in self.directives['tokens']: - self.tree.new_error(node, 'Symbol "%s" has already been defined as ' - 'a preprocessor token.' % rule) - elif keyword.iskeyword(rule): - self.tree.new_error(node, 'Python keyword "%s" may not be used as a symbol. ' - % rule + '(This may change in the future.)') - try: - self.current_symbols = [node] - self.rules[rule] = self.current_symbols - defn = self.compile(node.children[1]) - if rule in self.variables: - defn = 'Capture(%s)' % defn - self.variables.remove(rule) - elif defn.find("(") < 0: - # assume it's a synonym, like 'page = REGEX_PAGE_NR' - defn = 'Synonym(%s)' % defn - except TypeError as error: - from traceback import extract_tb - trace = str(extract_tb(error.__traceback__)[-1]) - errmsg = "%s (TypeError: %s; %s)\n%s" \ - % (EBNFCompiler.AST_ERROR, str(error), trace, node.as_sxpr()) - self.tree.new_error(node, errmsg) - rule, defn = rule + ':error', '"' + errmsg + '"' - return rule, defn - - - def _check_rx(self, node: Node, rx: str) -> str: - """ - Checks whether the string `rx` represents a valid regular - expression. Makes sure that multiline regular expressions are - prepended by the verbose-flag ('x'). Returns the regular expression string. - """ - flags = self.re_flags | {'x'} if rx.find('\n') >= 0 else self.re_flags - if flags: rx = "(?%s)%s" % ("".join(flags), rx) - try: - re.compile(rx) - except Exception as re_error: - self.tree.new_error(node, "malformed regular expression %s: %s" % - (repr(rx), str(re_error))) - return rx - - - def on_directive(self, node: Node) -> str: - key = node.children[0].content.lower() - assert key not in self.directives['tokens'] - - if key not in self.REPEATABLE_DIRECTIVES: - if key in self.defined_directives: - self.tree.new_error(node, 'Directive "%s" has already been defined earlier. ' - % key + 'Later definition will be ignored!', - code=Error.REDEFINED_DIRECTIVE_WARNING) - return "" - self.defined_directives.add(key) - - if key in {'comment', 'whitespace'}: - if node.children[1].parser.name == "list_": - if len(node.children[1].result) != 1: - self.tree.new_error(node, 'Directive "%s" must have exactly one value, not %i.' - % (key, len(node.children[1].result))) - value = self.compile(node.children[1]).pop() - if key == 'whitespace' and value in EBNFCompiler.WHITESPACE: - value = EBNFCompiler.WHITESPACE[value] # replace whitespace-name by regex - else: - self.tree.new_error(node, 'Value "%s" not allowed for directive "%s".' - % (value, key)) - else: - value = node.children[1].content.strip("~") # cast(str, node.children[ - # 1].result).strip("~") - if value != node.children[1].content: # cast(str, node.children[1].result): - self.tree.new_error(node, "Whitespace marker '~' not allowed in definition " - "of %s regular expression." % key) - if value[0] + value[-1] in {'""', "''"}: - value = escape_re(value[1:-1]) - elif value[0] + value[-1] == '//': - value = self._check_rx(node, value[1:-1]) - if key == 'whitespace' and not re.match(value, ''): - self.tree.new_error(node, "Implicit whitespace should always " - "match the empty string, /%s/ does not." % value) - self.directives[key] = value - - elif key == 'ignorecase': - if node.children[1].content.lower() not in {"off", "false", "no"}: - self.re_flags.add('i') - - # elif key == 'testing': - # value = node.children[1].content - # self.directives['testing'] = value.lower() not in {"off", "false", "no"} - - elif key == 'literalws': - value = {item.lower() for item in self.compile(node.children[1])} - if ((value - {'left', 'right', 'both', 'none'}) - or ('none' in value and len(value) > 1)): - self.tree.new_error(node, 'Directive "literalws" allows only `left`, `right`, ' - '`both` or `none`, not `%s`' % ", ".join(value)) - wsp = {'left', 'right'} if 'both' in value \ - else {} if 'none' in value else value - self.directives[key] = list(wsp) - - elif key in {'tokens', 'preprocessor_tokens'}: - tokens = self.compile(node.children[1]) - redeclared = self.directives['tokens'] & tokens - if redeclared: - self.tree.new_error(node, 'Tokens %s have already been declared earlier. ' - % str(redeclared) + 'Later declaration will be ignored', - code=Error.REDECLARED_TOKEN_WARNING) - self.directives['tokens'] |= tokens - redeclared - - elif key.endswith('_filter'): - filter_set = self.compile(node.children[1]) - if not isinstance(filter_set, set) or len(filter_set) != 1: - self.tree.new_error(node, 'Directive "%s" accepts exactly one symbol, not %s' - % (key, str(filter_set))) - self.directives['filter'][key[:-7]] = filter_set.pop() - - else: - self.tree.new_error(node, 'Unknown directive %s! (Known ones are: %s.)' % - (key, ', '.join(list(self.directives.keys())))) - return "" - - -
[docs] def non_terminal(self, node: Node, parser_class: str, custom_args: List[str]=[]) -> str: - """ - Compiles any non-terminal, where `parser_class` indicates the Parser class - name for the particular non-terminal. - """ - arguments = [self.compile(r) for r in node.children] + custom_args - # node.error_flag = max(node.error_flag, max(t.error_flag for t in node.children)) - return parser_class + '(' + ', '.join(arguments) + ')'
- - - def on_expression(self, node) -> str: - # TODO: Add check for errors like "a" | "ab" (which will always yield a, even for ab) - return self.non_terminal(node, 'Alternative') - - - def on_term(self, node) -> str: - # Basically, the following code does only this: - # return self.non_terminal(node, 'Series') - # What makes it (look) more complicated is the handling of the - # mandatory §-operator - mandatory_marker = [] - filtered_children = [] # type: List[Node] - for nd in node.children: - if nd.parser.ptype == TOKEN_PTYPE and nd.content == "§": - mandatory_marker.append(len(filtered_children)) - # if len(filtered_children) == 0: - # self.tree.new_error(nd.pos, 'First item of a series should not be mandatory.', - # Error.WARNING) - if len(mandatory_marker) > 1: - self.tree.new_error(nd, 'One mandatory marker (§) is sufficient to declare ' - 'the rest of the series as mandatory.', Error.WARNING) - else: - filtered_children.append(nd) - saved_result = node.result - node.result = tuple(filtered_children) - if len(filtered_children) == 1: - compiled = self.non_terminal(node, 'Required') - else: - custom_args = ['mandatory=%i' % mandatory_marker[0]] if mandatory_marker else [] - compiled = self.non_terminal(node, 'Series', custom_args) - node.result = saved_result - return compiled - - - def on_factor(self, node: Node) -> str: - assert node.children - assert len(node.children) >= 2, node.as_sxpr() - prefix = node.children[0].content - custom_args = [] # type: List[str] - - if prefix in {'::', ':'}: - assert len(node.children) == 2 - arg = node.children[-1] - if arg.parser.name != 'symbol': - self.tree.new_error(node, ('Retrieve Operator "%s" requires a symbol, ' - 'and not a %s.') % (prefix, str(arg.parser))) - return str(arg.result) - if str(arg) in self.directives['filter']: - custom_args = ['rfilter=%s' % self.directives['filter'][str(arg)]] - self.variables.add(str(arg)) # cast(str, arg.result) - - elif len(node.children) > 2: - # shift = (Node(node.parser, node.result[1].result),) - # node.result[1].result = shift + node.result[2:] - node.children[1].result = (Node(node.children[1].parser, node.children[1].result),) \ - + node.children[2:] - node.children[1].parser = node.parser - node.result = (node.children[0], node.children[1]) - - node.result = node.children[1:] - try: - parser_class = self.PREFIX_TABLE[prefix] - result = self.non_terminal(node, parser_class, custom_args) - if prefix[:1] == '-': - def check(node): - nd = node - if len(nd.children) >= 1: - nd = nd.children[0] - while nd.parser.name == "symbol": - symlist = self.rules.get(nd.content, []) - if len(symlist) == 2: - nd = symlist[1] - else: - if len(symlist) == 1: - nd = symlist[0].children[1] - break - if (nd.parser.name != "regexp" or nd.content[:1] != '/' - or nd.content[-1:] != '/'): - self.tree.new_error(node, "Lookbehind-parser can only be used with RegExp" - "-parsers, not: " + nd.parser.name + nd.parser.ptype) - - if not result.startswith('RegExp('): - self.deferred_tasks.append(lambda: check(node)) - return result - except KeyError: - self.tree.new_error(node, 'Unknown prefix "%s".' % prefix) - return "" - - - def on_option(self, node) -> str: - return self.non_terminal(node, 'Option') - - - def on_repetition(self, node) -> str: - return self.non_terminal(node, 'ZeroOrMore') - - - def on_oneormore(self, node) -> str: - return self.non_terminal(node, 'OneOrMore') - - - def on_group(self, node) -> str: - raise EBNFCompilerError("Group nodes should have been eliminated by " - "AST transformation!") - - def on_unordered(self, node) -> str: - # return self.non_terminal(node, 'Unordered') - assert len(node.children) == 1 - nd = node.children[0] - for child in nd.children: - if child.parser.ptype == TOKEN_PTYPE and child.content == "§": # check the child, not the enclosing node - self.tree.new_error(node, "No mandatory items § allowed in Unordered sequences.") - args = ', '.join(self.compile(child) for child in nd.children) - if nd.parser.name == "term": - return "AllOf(" + args + ")" - elif nd.parser.name == "expression": - return "SomeOf(" + args + ")" - else: - self.tree.new_error(node, "Unordered sequence or alternative " - "requires at least two elements.") - return "" - - def on_symbol(self, node: Node) -> str: # called only for symbols on the right hand side! - symbol = node.content # ; assert result == cast(str, node.result) - if symbol in self.directives['tokens']: - return 'PreprocessorToken("' + symbol + '")' - else: - self.current_symbols.append(node) - if symbol not in self.symbols: - self.symbols[symbol] = node # remember first use of symbol - if symbol in self.rules: - self.recursive.add(symbol) - if symbol in EBNFCompiler.RESERVED_SYMBOLS: - # (EBNFCompiler.WHITESPACE_KEYWORD, EBNFCompiler.COMMENT_KEYWORD): - return "RegExp(%s)" % symbol - return symbol - - - def on_literal(self, node: Node) -> str: - return 'Token(' + node.content.replace('\\', r'\\') + ')' - - - def on_plaintext(self, node: Node) -> str: - return 'Token(' + node.content.replace('\\', r'\\').replace('`', '"') \ - + ", wL='', wR='')" - - - def on_regexp(self, node: Node) -> str: - rx = node.content - name = [] # type: List[str] - if rx[0] == '/' and rx[-1] == '/': - parser = 'RegExp(' - else: - parser = 'RE(' - if rx[:2] == '~/': - if not 'left' in self.directives['literalws']: - name = ['wL=' + self.WHITESPACE_KEYWORD] + name - rx = rx[1:] - elif 'left' in self.directives['literalws']: - name = ["wL=''"] + name - if rx[-2:] == '/~': - if 'right' not in self.directives['literalws']: - name = ['wR=' + self.WHITESPACE_KEYWORD] + name - rx = rx[:-1] - elif 'right' in self.directives['literalws']: - name = ["wR=''"] + name - try: - arg = repr(self._check_rx(node, rx[1:-1].replace(r'\/', '/'))) - except AttributeError as error: - from traceback import extract_tb - trace = str(extract_tb(error.__traceback__)[-1]) - errmsg = "%s (AttributeError: %s; %s)\n%s" \ - % (EBNFCompiler.AST_ERROR, str(error), trace, node.as_sxpr()) - self.tree.new_error(node, errmsg) - return '"' + errmsg + '"' - return parser + ', '.join([arg] + name) + ')' - - - def on_whitespace(self, node: Node) -> str: - return 'whitespace__' - - - def on_list_(self, node) -> Set[str]: - assert node.children - return set(item.result.strip() for item in node.children)
- - -def get_ebnf_compiler(grammar_name="", grammar_source="") -> EBNFCompiler: - global thread_local_ebnf_compiler_singleton - try: - compiler = thread_local_ebnf_compiler_singleton - compiler.set_grammar_name(grammar_name, grammar_source) - return compiler - except NameError: - thread_local_ebnf_compiler_singleton = EBNFCompiler(grammar_name, grammar_source) - return thread_local_ebnf_compiler_singleton -
\ No newline at end of file diff --git a/documentation/_modules/error.html b/documentation/_modules/error.html deleted file mode 100644 index ac108466d5eeca38e1e9dd36db5ba41dffe7dad0..0000000000000000000000000000000000000000 --- a/documentation/_modules/error.html +++ /dev/null @@ -1,435 +0,0 @@ - error — DHParser 0.8 documentation

Source code for error

-# error.py - error handling for DHParser
-#
-# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
-#                 Bavarian Academy of Sciences and Humanities (badw.de)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-# implied.  See the License for the specific language governing
-# permissions and limitations under the License.
-
-"""
-Module ``error`` defines class Error and a few helpful functions that are
-needed for error reporting of DHParser. Usually, what is of interest are
-the string representations of the error objects. For example::
-
-    from DHParser import compile_source, has_errors
-
-    result, errors, ast = compile_source(source, preprocessor, grammar,
-                                         transformer, compiler)
-    if errors:
-        for error in errors:
-            print(error)
-
-        if has_errors(errors):
-            print("There have been fatal errors!")
-            sys.exit(1)
-        else:
-            print("There have been warnings, but no errors.")
-"""
-
-
-import bisect
-
-from DHParser.preprocess import SourceMapFunc
-from DHParser.stringview import StringView
-from DHParser.toolkit import typing
-from typing import Iterable, Iterator, Union, Tuple, List, Optional
-
-__all__ = ('Error',
-           'is_error',
-           'is_warning',
-           'has_errors',
-           'only_errors',
-           'linebreaks',
-           'line_col',
-           'adjust_error_locations')
-
-
-class Error:
-    __slots__ = ['message', 'level', 'code', '_pos', 'orig_pos', 'line', 'column', '_node_keep']
-
-    # error levels
-
-    NO_ERROR  = 0
-    MESSAGE   = 1
-    WARNING   = 10
-    ERROR     = 1000
-    HIGHEST   = ERROR
-
-    # warning codes
-
-    REDEFINED_DIRECTIVE_WARNING = 101
-    REDECLARED_TOKEN_WARNING = 102
-
-    UNDEFINED_SYMBOL_IN_TRANSFORMATION_TABLE = 601
-
-    # error codes
-
-    MANDATORY_CONTINUATION = 1001
-
-    def __init__(self, message: str, code: int = ERROR, pos: int = -1,
-                 orig_pos: int = -1, line: int = -1, column: int = -1,
-                 node: Optional['Node'] = None) -> None:
-        self.message = message
-        assert code >= 0
-        self.code = code
-        self._pos = pos
-        self.orig_pos = orig_pos
-        self.line = line
-        self.column = column
-        if node is not None and node._pos >= 0:
-            assert self._pos < 0 or self._pos == node._pos
-            self._pos = node._pos
-            self._node_keep = None  # node is not needed, since pos has already been set
-        else:
-            self._node_keep = node  # redundant: consider removing, see RootNode.collect_errors
-
-    def __str__(self):
-        prefix = ''
-        if self.line > 0:
-            prefix = "%i:%i: " % (max(self.line, 0), max(self.column, 0))
-        return prefix + "%s: %s" % (self.severity, self.message)
-
-    def __repr__(self):
-        return 'Error("%s", %s, %i, %i, %i, %i)' \
-               % (self.message, repr(self.code), self.pos, self.orig_pos, self.line, self.column)
-
-    @property
-    def pos(self):
-        if self._pos < 0:
-            assert self._node_keep and self._node_keep.pos >= 0, "pos value not ready yet"
-            self._pos = self._node_keep.pos   # lazy evaluation of position
-        self._node_keep = None  # forget node to allow GC to free memory
-        return self._pos
-
-    @property
-    def severity(self):
-        """Returns a string representation of the error level, e.g. "warning"."""
-        return "Warning" if is_warning(self.code) else "Error"
-
-    def visualize(self, document: str) -> str:
-        """Shows the line of the document and the position where the error
-        occurred."""
-        start = document.rfind('\n', 0, self.pos) + 1
-        stop = document.find('\n', self.pos)
-        return document[start:stop] + '\n' + ' ' * (self.pos - start) + '^\n'
-
-
-
[docs]def is_warning(code: int) -> bool: - """Returns True, if error is merely a warning.""" - return code < Error.ERROR
- - -
[docs]def is_error(code: int) -> bool: - """Returns True, if error is an error, not just a warning.""" - return code >= Error.ERROR
- - -
[docs]def has_errors(messages: Iterable[Error], level: int = Error.ERROR) -> bool: - """ - Returns True, if at least one entry in `messages` has at - least the given error `level`. - """ - for err_obj in messages: - if err_obj.code >= level: - return True - return False
- - -
[docs]def only_errors(messages: Iterable[Error], level: int = Error.ERROR) -> Iterator[Error]: - """ - Returns an Iterator that yields only those messages that have - at least the given error level. - """ - return (err for err in messages if err.code >= level)
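A sketch of the two helpers together (`messages` stands for any list of Error objects):

# Sketch: print only the true errors, if there are any.
if has_errors(messages):
    for err in only_errors(messages):
        print(err)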
- - -####################################################################### -# -# Setting of line, column and position properties of error messages. -# -####################################################################### - - -
[docs]def linebreaks(text: Union[StringView, str]) -> List[int]: - """ - Returns a list of the indices of all line breaks in the text. - """ - lbr = [-1] - i = text.find('\n', 0) - while i >= 0: - lbr.append(i) - i = text.find('\n', i + 1) - lbr.append(len(text)) - return lbr
- - -
[docs]def line_col(lbreaks: List[int], pos: int) -> Tuple[int, int]: - """ - Returns the position within a text as (line, column)-tuple based - on a list of all line breaks, including -1 and EOF. - """ - if not lbreaks and pos >= 0: - return 0, pos - if pos < 0 or pos > lbreaks[-1]: # one character behind EOF is still an allowed position! - raise ValueError('Position %i outside text of length %s !' % (pos, lbreaks[-1])) - line = bisect.bisect_left(lbreaks, pos) - column = pos - lbreaks[line - 1] - return line, column
- - -# def line_col(text: Union[StringView, str], pos: int) -> Tuple[int, int]: -# """ -# Returns the position within a text as (line, column)-tuple. -# """ -# if pos < 0 or pos > len(text): # one character behind EOF is still an allowed position! -# raise ValueError('Position %i outside text of length %s !' % (pos, len(text))) -# line = text.count("\n", 0, pos) + 1 -# column = pos - text.rfind("\n", 0, pos) -# return line, column - - -
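A sketch of linebreaks() and line_col() together (the text is an arbitrary example):

# Sketch: map a flat string position to a (line, column) pair.
text = "alpha\nbeta\ngamma"
lbreaks = linebreaks(text)              # [-1, 5, 10, 16]
assert line_col(lbreaks, 7) == (2, 2)   # position 7 is the 'e' of "beta"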
[docs]def adjust_error_locations(errors: List[Error], - original_text: Union[StringView, str], - source_mapping: SourceMapFunc=lambda i: i) -> List[Error]: - """Adds (or adjusts) line and column numbers of error messages in place. - - Args: - errors: The list of errors as returned by the method - ``collect_errors()`` of a Node object - original_text: The source text on which the errors occurred. - (Needed in order to determine the line and column numbers.) - source_mapping: A function that maps error positions to their - positions in the original source file. - - Returns: - The list of errors. (Returning the list of errors is just syntactic - sugar. Be aware that the line, col and orig_pos attributes have been - changed in place.) - """ - line_breaks = linebreaks(original_text) - for err in errors: - assert err.pos >= 0 - err.orig_pos = source_mapping(err.pos) - err.line, err.column = line_col(line_breaks, err.orig_pos) - return errors
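A usage sketch (`errors` and `source_text` stand for the results of a compilation run; the default source mapping is the identity function):

# Sketch: fill in line and column numbers before printing the errors.
for err in adjust_error_locations(errors, source_text):
    print('%i:%i: %s' % (err.line, err.column, err.message))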
-
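A hedged usage sketch of ``adjust_error_locations``; the names ``errors``,
``source_text`` and ``mapping`` stand in for values produced by a real parse
run and preprocessor::

    from DHParser.error import adjust_error_locations

    adjust_error_locations(errors, source_text)            # identity source mapping
    adjust_error_locations(errors, source_text, mapping)   # mapping from a preprocessor
    for err in errors:
        print(err.line, err.column)   # line/column now refer to source_text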
\ No newline at end of file
diff --git a/documentation/_modules/index.html b/documentation/_modules/index.html
deleted file mode 100644
index d41414f026dabbe799076f5c4242570f0464d4c3..0000000000000000000000000000000000000000
--- a/documentation/_modules/index.html
+++ /dev/null
@@ -1,224 +0,0 @@
-Overview: module code — DHParser 0.8 documentation

All modules for which code is available

\ No newline at end of file
diff --git a/documentation/_modules/log.html b/documentation/_modules/log.html
deleted file mode 100644
index b4e3cfe10d3c7bb99483092c67b5e7e6572c41eb..0000000000000000000000000000000000000000
--- a/documentation/_modules/log.html
+++ /dev/null
@@ -1,670 +0,0 @@
-log — DHParser 0.8 documentation

Source code for log

-# logging.py - logging and debugging for DHParser
-#
-# Copyright 2018  by Eckhart Arnold (arnold@badw.de)
-#                 Bavarian Academy of Sciences and Humanities (badw.de)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-# implied.  See the License for the specific language governing
-# permissions and limitations under the License.
-
-"""
-Module ``log`` contains logging and debugging support for the
-parsing process.
-
-For logging functionality, the global variable LOGGING is defined which
-contains the name of a directory where log files shall be placed. By
-setting its value to the empty string "" logging can be turned off.
-
-To read the directory name, the function ``log_dir()`` should be called
-rather than reading the variable LOGGING. ``log_dir()`` makes sure
-the directory exists and raises an error if a file with the same name
-already exists.
-
-For debugging of the parsing process, the parsing history can be
-logged and written to an HTML file.
-
-For ease of use module ``log`` defines a context-manager ``logging``
-to which either ``False`` (turn off logging), a log directory name or
-``True`` for the default logging directory is passed as argument.
-The other components of DHParser check whether logging is on and
-write log files in the logging directory accordingly. Usually,
-this will be concrete and abstract syntax trees as well as the full
-and abbreviated parsing history.
-
-Example::
-
-    from DHParser import compile_source, logging
-
-    with logging("LOGS"):
-        result, errors, ast = compile_source(source, preprocessor, grammar,
-                                             transformer, compiler)
-"""
-
-import collections
-import contextlib
-import html
-import os
-
-from DHParser.error import line_col
-from DHParser.stringview import StringView
-from DHParser.syntaxtree import Node, WHITESPACE_PTYPE
-from DHParser.toolkit import is_filename, escape_control_characters, typing
-from typing import List, Tuple, Union
-
-__all__ = ('log_dir',
-           'logging',
-           'is_logging',
-           'logfile_basename',
-           'clear_logs',
-           'HistoryRecord',
-           'log_ST',
-           'log_parsing_history')
-
-
-#######################################################################
-#
-# logging context manager and logfile support
-#
-#######################################################################
-
-
-
[docs]def log_dir() -> str: - """Creates a directory for log files (if it does not exist) and - returns its path. - - WARNING: Any files in the log dir will eventually be overwritten. - Don't use a directory name that could be the name of a directory - for other purposes than logging. - - Returns: - name of the logging directory - """ - # the try-except clauses in the following are precautions for multiprocessing - global LOGGING - try: - dirname = LOGGING # raises a name error if LOGGING is not defined - if not dirname: - raise NameError # raise a name error if LOGGING evaluates to False - except NameError: - raise NameError("No access to log directory before logging has been " - "turned on within the same thread/process.") - if os.path.exists(dirname) and not os.path.isdir(dirname): - raise IOError('"' + dirname + '" cannot be used as log directory, ' - 'because it is not a directory!') - else: - try: - os.mkdir(dirname) - except FileExistsError: - pass - info_file_name = os.path.join(dirname, 'info.txt') - if not os.path.exists(info_file_name): - with open(info_file_name, 'w', encoding="utf-8") as f: - f.write("This directory has been created by DHParser to store log files from\n" - "parsing. ANY FILE IN THIS DIRECTORY CAN BE OVERWRITTEN! Therefore,\n" - "do not place any files here and do not bother editing files in this\n" - "directory as any changes will get lost.\n") - return dirname
- - -
[docs]@contextlib.contextmanager -def logging(dirname="LOGS"): - """Context manager. Log files within this context will be stored in - directory ``dirname``. Logging is turned off if name is empty. - - Args: - dirname: the name for the log directory or the empty string to - turn logging off - """ - global LOGGING - if dirname and not isinstance(dirname, str): - dirname = "LOGS"  # be fail tolerant here... - try: - save = LOGGING - except NameError: - save = "" - LOGGING = dirname or "" - yield - LOGGING = save
- - -
[docs]def is_logging() -> bool: - """-> True, if logging is turned on.""" - global LOGGING - try: - return bool(LOGGING) - except NameError: - return False
- - -
[docs]def logfile_basename(filename_or_text, function_or_class_or_instance) -> str: - """Generates a reasonable logfile-name (without extension) based on - the given information. - """ - if is_filename(filename_or_text): - return os.path.basename(os.path.splitext(filename_or_text)[0]) - else: - try: - name = function_or_class_or_instance.__qualname__ - except AttributeError: - name = function_or_class_or_instance.__class__.__name__ - i = name.find('.') - return name[:i] + '_out' if i >= 0 else name
- - -
[docs]def clear_logs(logfile_types=frozenset(['.cst', '.ast', '.log'])): - """Removes all logs from the log-directory and removes the - log-directory if it is empty. - """ - log_dirname = log_dir() - files = os.listdir(log_dirname) - only_log_files = True - for file in files: - path = os.path.join(log_dirname, file) - if os.path.splitext(file)[1] in logfile_types or file == 'info.txt': - os.remove(path) - else: - only_log_files = False - if only_log_files: - os.rmdir(log_dirname)
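A hedged sketch of the logging-state helpers defined above (directory name
as in the module example)::

    from DHParser.log import logging, is_logging, log_dir, clear_logs

    with logging("LOGS"):
        assert is_logging()
        print(log_dir())     # creates ./LOGS (plus an info.txt) on first use
        clear_logs()         # removes log files and the directory, if empty
    assert not is_logging()  # the previous state is restored on exit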
- - -####################################################################### -# -# parsing history -# -####################################################################### - - -
[docs]class HistoryRecord: - """ - Stores debugging information about one completed step in the - parsing history. - - A parsing step is "completed" when the last one of a nested - sequence of parser-calls returns. The call stack including - the last parser call will be frozen in the ``HistoryRecord``- - object. In addition, a reference to the generated leaf node - (if any) will be stored and the result status of the last - parser call, which is either MATCH, FAIL (i.e. no match) - or ERROR. - """ - __slots__ = ('call_stack', 'node', 'text', 'line_col') - - MATCH = "MATCH" - ERROR = "ERROR" - FAIL = "FAIL" - Snapshot = collections.namedtuple('Snapshot', ['line', 'column', 'stack', 'status', 'text']) - - COLGROUP = '<colgroup>\n<col style="width:2%"/><col style="width:2%"/><col ' \ - 'style="width:75%"/><col style="width:6%"/><col style="width:15%"/>\n</colgroup>' - HEADINGS = ('<tr><th>L</th><th>C</th><th>parser call sequence</th>' - '<th>success</th><th>text matched or failed</th></tr>') - HTML_LEAD_IN = ('<!DOCTYPE html>\n' - '<html>\n<head>\n<meta charset="utf-8"/>\n<style>\n' - 'td,th {font-family:monospace; ' - 'border-right: thin solid grey; border-bottom: thin solid grey}\n' - 'td.line, td.column {color:darkgrey}\n'  # 'td.stack {}\n' - 'td.status {font-weight:bold}\n' - 'td.text {color:darkblue}\n' - 'table {border-spacing: 0px; border: thin solid darkgrey; width:100%}\n' - 'span {color:grey;}\nspan.match {color:darkgreen}\n' - 'span.fail {color:darkgrey}\nspan.error {color:red}\n' - 'span.matchstack {font-weight:bold;color:darkred}' - '\n</style>\n</head>\n<body>\n') - HTML_LEAD_OUT = '\n</body>\n</html>\n' - - def __init__(self, call_stack: List['Parser'], node: Node, text: StringView) -> None: - # copy call stack, dropping uninformative Forward-Parsers - self.call_stack = [p for p in call_stack if p.ptype != ":Forward"]  # type: List['Parser'] - self.node = node  # type: Node - self.text = text  # type: StringView - self.line_col = (1, 1)  # type: Tuple[int, int] - if call_stack: - grammar = call_stack[-1].grammar - document = grammar.document__ - lbreaks = grammar.document_lbreaks__ - self.line_col = line_col(lbreaks, len(document) - len(text)) - - def __str__(self): - return '%4i, %2i: %s; %s; "%s"' % self.as_tuple()
[docs] def as_tuple(self) -> Snapshot: - """ - Returns history record formatted as a snapshot tuple. - """ - return self.Snapshot(self.line_col[0], self.line_col[1], - self.stack, self.status, self.excerpt)
- -
[docs] def as_csv_line(self) -> str: - """ - Returns history record formatted as a csv table row. - """ - return '"{}", "{}", "{}", "{}", "{}"'.format(*self.as_tuple())
- -
[docs] def as_html_tr(self) -> str: - """ - Returns history record formatted as an html table row. - """ - stack = html.escape(self.stack).replace( - '-&gt;', '<span>&shy;-&gt;</span>') - status = html.escape(self.status) - excerpt = html.escape(self.excerpt) - if status == self.MATCH: - status = '<span class="match">' + status + '</span>' - i = stack.rfind('-&gt;') - chr = stack[i+12:i+13] - while not chr.isidentifier() and i >= 0: - i = stack.rfind('-&gt;', 0, i) - chr = stack[i+12:i+13] - if i >= 0: - i += 12 - k = stack.find('<', i) - if k < 0: - stack = stack[:i] + '<span class="matchstack">' + stack[i:] - else: - stack = stack[:i] + '<span class="matchstack">' + stack[i:k] \ - + '</span>' + stack[k:] - elif status == self.FAIL: - status = '<span class="fail">' + status + '</span>' - else: - stack += '<br/>\n' + status - status = '<span class="error">ERROR</span>' - tpl = self.Snapshot(str(self.line_col[0]), str(self.line_col[1]), stack, status, excerpt) - # return ''.join(['<tr>'] + [('<td>%s</td>' % item) for item in tpl] + ['</tr>']) - return ''.join(['<tr>'] + [('<td class="%s">%s</td>' % (cls, item)) - for cls, item in zip(tpl._fields, tpl)] + ['</tr>'])
- - def err_msg(self) -> str: - return self.ERROR + ": " + "; ".join(str(e) for e in (self.node.errors)) - - @property - def stack(self) -> str: - return "->".join((p.repr if p.ptype in {':RegExp', ':PlainText'} else p.name or p.ptype) - for p in self.call_stack) - - @property - def status(self) -> str: - return self.FAIL if self.node is None else \ - ('"%s"' % self.err_msg()) if self.node.errors else self.MATCH - # has_errors(self.node._errors) - - @property - def excerpt(self): - length = len(self.node) if self.node else len(self.text) - excerpt = str(self.node)[:min(length, 20)] if self.node else str(self.text[:20]) - excerpt = escape_control_characters(excerpt) - if length > 20: - excerpt += '...' - return excerpt - - # @property - # def extent(self) -> slice: - # return (slice(-self.remaining - len(self.node), -self.remaining) if self.node - # else slice(-self.remaining, None)) - - @property - def remaining(self) -> int: - return len(self.text) - (len(self.node) if self.node else 0) - -
[docs] @staticmethod - def last_match(history: List['HistoryRecord']) -> Union['HistoryRecord', None]: - """ - Returns the last match from the parsing-history. - Args: - history: the parsing-history as a list of HistoryRecord objects - - Returns: - the history record of the last match or None if either history is - empty or no parser could match - """ - for record in reversed(history): - if record.status == HistoryRecord.MATCH: - return record - return None
- -
[docs] @staticmethod - def most_advanced_match(history: List['HistoryRecord']) -> Union['HistoryRecord', None]: - """ - Returns the closest-to-the-end-match from the parsing-history. - Args: - history: the parsing-history as a list of HistoryRecord objects - - Returns: - the history record of the closest-to-the-end-match or None if either history is - empty or no parser could match - """ - remaining = -1 - result = None - for record in history: - if (record.status == HistoryRecord.MATCH and - (record.remaining < remaining or remaining < 0)): - result = record - remaining = record.remaining - return result
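A hedged sketch of how these two lookups are typically used after a failed
run; ``my_grammar`` stands for any Grammar instance, and ``history__`` is
only filled while history tracking is on::

    record = HistoryRecord.most_advanced_match(my_grammar.history__)
    if record is not None:
        print(record)                # line, column, call stack, status, excerpt
        print(record.as_csv_line())  # the same snapshot as a csv row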
- - -####################################################################### -# -# context specific log functions, i.e. logging of syntax trees, -# grammar history and the like -# -####################################################################### - - -
[docs]def log_ST(syntax_tree, log_file_name): - """ - Writes an S-expression-representation of the `syntax_tree` to a file, - if logging is turned on. - """ - if is_logging(): - path = os.path.join(log_dir(), log_file_name) - if os.path.exists(path): - print('WARNING: Log-file "%s" already exists and will be overwritten!' % path) - with open(path, "w", encoding="utf-8") as f: - f.write(syntax_tree.as_sxpr())
- - -LOG_SIZE_THRESHOLD = 100000  # maximum number of history records to log -LOG_TAIL_THRESHOLD = 500    # maximum number of history records for "tail log" - - -
[docs]def log_parsing_history(grammar, log_file_name: str = '', html: bool=True) -> None: - """ - Writes a log of the parsing history of the most recently parsed document. - - Parameters: - grammar (Grammar): The Grammar object from which the parsing history - shall be logged. - log_file_name (str): The (base-)name of the log file to be written. - If no name is given (default), then the class name of the grammar - object will be used. - html (bool): If true (default), the log will be output as an HTML table, - otherwise as plain text. (Browsers might take a few seconds or - minutes to display the table for long histories.) - """ - def write_log(history, log_name): - htm = '.html' if html else '' - path = os.path.join(log_dir(), log_name + "_parser.log" + htm) - if os.path.exists(path): - os.remove(path) - print('WARNING: Log-file "%s" already existed and was deleted.' % path) - if history: - with open(path, "w", encoding="utf-8") as f: - if html: - f.write(HistoryRecord.HTML_LEAD_IN + '\n') - f.write("\n".join(history)) - f.write('\n</table>\n' + HistoryRecord.HTML_LEAD_OUT) - else: - f.write("\n".join(history)) - - def append_line(log, line): - """Appends a line to a list of HTML table rows. Starts a new - table every 50 rows to allow the browser to speed up rendering. - Does this really work...?""" - log.append(line) - if html and len(log) % 50 == 0: - log.append('\n'.join(['</table>\n<table>', HistoryRecord.COLGROUP])) - - if not is_logging(): - raise AssertionError("Cannot log history when logging is turned off!") - # assert self.history__, \ - #     "Parser did not yet run or logging was turned off when running parser!" - if not log_file_name: - name = grammar.__class__.__name__ - log_file_name = name[:-7] if name.lower().endswith('grammar') else name - elif log_file_name.lower().endswith('.log'): - log_file_name = log_file_name[:-4] - - full_history = ['<h1>Full parsing history of "%s"</h1>' % log_file_name]  # type: List[str] - # match_history = ['<h1>Match history of parsing "%s"</h1>' % log_file_name]  # type: List[str] - # errors_only = ['<h1>Errors when parsing "%s"</h1>' % log_file_name]  # type: List[str] - - if len(grammar.history__) > LOG_SIZE_THRESHOLD: - warning = ('Sorry, man, %iK history records is just too many! ' - 'Only looking at the last %iK records.' - % (len(grammar.history__)//1000, LOG_SIZE_THRESHOLD//1000)) - html_warning = '<p><strong>' + warning + '</strong></p>' - full_history.append(html_warning) - # match_history.append(html_warning) - # errors_only.append(html_warning) - - lead_in = '\n'.join(['<table>', HistoryRecord.COLGROUP, HistoryRecord.HEADINGS]) - full_history.append(lead_in) - # match_history.append(lead_in) - # errors_only.append(lead_in) - - for record in grammar.history__[-LOG_SIZE_THRESHOLD:]: - line = record.as_html_tr() if html else str(record) - append_line(full_history, line) - # if record.node and record.node.parser.ptype != WHITESPACE_PTYPE: - #     append_line(match_history, line) - #     if record.node.errors: - #         append_line(errors_only, line) - write_log(full_history, log_file_name + '_full') - if len(full_history) > LOG_TAIL_THRESHOLD + 10: - heading = '<h1>Last 500 records of parsing history of "%s"</h1>' % log_file_name + lead_in - write_log([heading] + full_history[-LOG_TAIL_THRESHOLD:], log_file_name + '_full.tail')
- # write_log(match_history, log_file_name + '_match') - # if (len(errors_only) > 3 or (len(grammar.history__) <= LOG_SIZE_THRESHOLD - # and len(errors_only) > 2)): - # write_log(errors_only, log_file_name + '_errors') -
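A hedged sketch tying the two log functions above together; ``my_grammar``
and ``source_text`` are illustrative::

    from DHParser.log import logging, log_ST, log_parsing_history

    with logging("LOGS"):
        tree = my_grammar(source_text)   # history is only recorded while logging is on
        log_ST(tree, "example.cst")      # S-expression snapshot of the tree
        log_parsing_history(my_grammar)  # writes LOGS/<GrammarName>_full_parser.log.html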
\ No newline at end of file
diff --git a/documentation/_modules/parse.html b/documentation/_modules/parse.html
deleted file mode 100644
index 1c9eebcf0209fd451c474e9923e302c0f6467b72..0000000000000000000000000000000000000000
--- a/documentation/_modules/parse.html
+++ /dev/null
@@ -1,2174 +0,0 @@
-parse — DHParser 0.8 documentation

Source code for parse

-# parse.py - parser combinators for DHParser
-#
-# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
-#                 Bavarian Academy of Sciences and Humanities (badw.de)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-# implied.  See the License for the specific language governing
-# permissions and limitations under the License.
-
-
-"""
-Module ``parse`` contains the python classes and functions for
-DHParser's packrat-parser. It's central class is the
-``Grammar``-class, which is the base class for any concrete
-Grammar. Grammar-objects are callable and parsing is done by
-calling a Grammar-object with a source text as argument.
-
-The different parsing functions are callable descendants of class
-``Parser``. Usually, they are organized in a tree and defined
-within the namespace of a grammar-class. See ``ebnf.EBNFGrammar``
-for an example.
-"""
-
-
-from collections import defaultdict
-import copy
-
-from DHParser.error import Error, linebreaks
-from DHParser.log import is_logging, HistoryRecord
-from DHParser.preprocess import BEGIN_TOKEN, END_TOKEN, RX_TOKEN_NAME
-from DHParser.stringview import StringView, EMPTY_STRING_VIEW
-from DHParser.syntaxtree import Node, RootNode, ParserBase, WHITESPACE_PTYPE, \
-    PLAINTEXT_PTYPE, TOKEN_PTYPE, ZOMBIE_PARSER
-from DHParser.toolkit import sane_parser_name, escape_control_characters, re, typing
-from typing import Callable, cast, Dict, DefaultDict, List, Set, Tuple, Union, Optional
-
-
-__all__ = ('Parser',
-           'UnknownParserError',
-           'Grammar',
-           'PreprocessorToken',
-           'RegExp',
-           'Whitespace',
-           'RE',
-           'Token',
-           'mixin_comment',
-           # 'UnaryOperator',
-           # 'NaryOperator',
-           'Synonym',
-           'Option',
-           'ZeroOrMore',
-           'OneOrMore',
-           'Series',
-           'Alternative',
-           'AllOf',
-           'SomeOf',
-           'Unordered',
-           'Required',
-           'Lookahead',
-           'NegativeLookahead',
-           'Lookbehind',
-           'NegativeLookbehind',
-           'last_value',
-           'counterpart',
-           'accumulate',
-           'Capture',
-           'Retrieve',
-           'Pop',
-           'Forward')
-
-
-########################################################################
-#
-# Grammar and parsing infrastructure
-#
-########################################################################
-
-
-LEFT_RECURSION_DEPTH = 8  # type: int
-# because of python's recursion depth limit, this value ought not to be
-# set too high. PyPy allows higher values than CPython
-MAX_DROPOUTS = 3  # type: int
-# stop trying to recover parsing after so many errors
-
-
-def add_parser_guard(parser_func):
-    """
-    Add a wrapper function to a parser function (i.e. the Parser.__call__ method)
-    that takes care of memoizing, left recursion and, optionally, tracing
-    (aka "history tracking") of parser calls. Returns the wrapped call.
-    """
-    def guarded_call(parser: 'Parser', text: StringView) -> Tuple[Optional[Node], StringView]:
-        try:
-            grammar = parser.grammar
-            location = grammar.document_length__ - len(text)
-
-            if grammar.last_rb__loc__ >= location:
-                grammar.rollback_to__(location)
-
-            # if location has already been visited by the current parser,
-            # return saved result
-            if location in parser.visited:
-                # no history recording in case of memoized results
-                return parser.visited[location]
-
-            if grammar.history_tracking__:
-                grammar.call_stack__.append(parser)
-                grammar.moving_forward__ = True
-
-            # break left recursion at the maximum allowed depth
-            if grammar.left_recursion_handling__:
-                if parser.recursion_counter[location] > LEFT_RECURSION_DEPTH:
-                    grammar.recursion_locations__.add(location)
-                    return None, text
-                parser.recursion_counter[location] += 1
-
-            # run original __call__ method
-            node, rest = parser_func(parser, text)
-
-            if grammar.left_recursion_handling__:
-                parser.recursion_counter[location] -= 1
-                # don't clear recursion_locations__ !!!
-
-            if node is None:
-                # retrieve an earlier match result (from left recursion) if it exists
-                if location in grammar.recursion_locations__:
-                    if location in parser.visited:
-                        node, rest = parser.visited[location]
-                        # TODO: maybe add a warning about occurrence of left-recursion here?
-                    # don't overwrite any positive match (i.e. node not None) in the cache
-                    # and don't add empty entries for parsers returning from left recursive calls!
-                elif grammar.memoization__:
-                    # otherwise also cache None-results
-                    parser.visited[location] = (None, rest)
-            else:
-                assert node._pos < 0
-                node._pos = location
-                assert node._pos >= 0, str("%i < %i" % (grammar.document_length__, location))
-                if (grammar.last_rb__loc__ < location
-                        and (grammar.memoization__ or location in grammar.recursion_locations__)):
-                    # - variable manipulating parsers will not be entered into the cache,
-                    #   because caching would interfere with changes of variable state
-                    # - in case of left recursion, the first recursive step that
-                    #   matches will store its result in the cache
-                    parser.visited[location] = (node, rest)
-
-            # Mind that memoized parser calls will not appear in the history record!
-            if grammar.history_tracking__:
-                # don't track returning parsers except in case an error has occurred
-                # remaining = len(rest)
-                if (grammar.moving_forward__ or (node and node.errors)):
-                    record = HistoryRecord(grammar.call_stack__, node, text)
-                    grammar.history__.append(record)
-                    # print(record.stack, record.status, rest[:20].replace('\n', '|'))
-                grammar.moving_forward__ = False
-                grammar.call_stack__.pop()
-
-        except RecursionError:
-            node = Node(None, str(text[:min(10, max(1, text.find("\n")))]) + " ...")
-            node._pos = location
-            grammar.tree__.new_error(node, "maximum recursion depth of parser reached; "
-                                     "potentially due to too many errors!")
-            rest = EMPTY_STRING_VIEW
-
-        return node, rest
-
-    return guarded_call
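A hedged sketch of the two Grammar switches that this guard consults
(``Grammar`` and ``RE`` are defined further below in this module)::

    g = Grammar(RE(r'\d+'))               # a minimal one-parser grammar
    g.memoization__ = False               # forgo the visited-cache: less memory, often slower
    g.left_recursion_handling__ = False   # left recursion will now raise a RecursionError
    tree = g('42')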
-
-
-
[docs]class Parser(ParserBase): - """ - (Abstract) Base class for Parser combinator parsers. Any parser - object that is actually used for parsing (i.e. no mock parsers) - should be derived from this class. - - Since parsers can contain other parsers (see classes UnaryOperator - and NaryOperator) they form a cyclical directed graph. A root - parser is a parser from which all other parsers can be reached. - Usually, there is one root parser which serves as the starting - point of the parsing process. When speaking of "the root parser" - it is this root parser object that is meant. - - There are two different types of parsers: - - 1. *Named parsers* for which a name is set in field `parser.name`. - The results produced by these parsers can later be retrieved in - the AST by the parser name. - - 2. *Anonymous parsers* where the name-field just contains the empty - string. AST-transformation of Anonymous parsers can be hooked - only to their class name, and not to the individual parser. - - Parser objects are callable and parsing is done by calling a parser - object with the text to parse. - - If the parser matches it returns a tuple consisting of a node - representing the root of the concrete syntax tree resulting from the - match as well as the substring `text[i:]` where i is the length of - matched text (which can be zero in the case of parsers like - `ZeroOrMore` or `Option`). If `i > 0` then the parser has "moved - forward". - - If the parser does not match it returns `(None, text)`. **Note** that - this is not the same as an empty match `("", text)`. Any empty match - can for example be returned by the `ZeroOrMore`-parser in case the - contained parser is repeated zero times. - - Attributes and Properties: - visited:  Mapping of places this parser has already been to - during the current parsing process onto the results the - parser returned at the respective place. This dictionary - is used to implement memoizing. - - recursion_counter:  Mapping of places to how often the parser - has already been called recursively at this place. This - is needed to implement left recursion. The number of - calls becomes irrelevant once a result has been memoized. - - cycle_detection:  The apply()-method uses this variable to make - sure that one and the same function will not be applied - (recursively) a second time, if it has already been - applied to this parser. - - grammar:  A reference to the Grammar object to which the parser - is attached. - """ - - ApplyFunc = Callable[['Parser'], None] - - def __init__(self, name: str = '') -> None: - # assert isinstance(name, str), str(name) - super().__init__(name) - self._grammar = None  # type: Optional['Grammar'] - self.reset() - - # add "aspect oriented" wrapper around parser calls - # for memoizing, left recursion and tracing - if not isinstance(self, Forward):  # should Forward-parsers not be guarded? Not sure... - guarded_parser_call = add_parser_guard(self.__class__.__call__) - # The following check is necessary for classes that don't override - # the __call__() method, because in these cases the non-overridden - # __call__()-method would be substituted a second time! - if self.__class__.__call__.__code__ != guarded_parser_call.__code__: - self.__class__.__call__ = guarded_parser_call - - def __deepcopy__(self, memo): - """Deepcopy method of the parser. Upon instantiation of a Grammar- - object, parsers will be deep-copied to the Grammar object. If a - derived parser-class changes the signature of the constructor, - the `__deepcopy__`-method must be replaced (i.e. overridden without - calling the same method from the superclass) by the derived class. - """ - return self.__class__(self.name) -
[docs] def reset(self): - """Initializes or resets any parser variables. If overwritten, - the `reset()`-method of the parent class must be called from the - `reset()`-method of the derived class.""" - self.visited = dict() # type: Dict[int, Tuple[Optional[Node], StringView]] - self.recursion_counter = defaultdict(lambda :0) # type: DefaultDict[int, int] - self.cycle_detection = set() # type: Set[Callable]
- - def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]: - """Applies the parser to the given `text` and returns a node with - the results or None as well as the text at the position right behind - the matching string.""" - return None, text # default behaviour: don't match - - def __add__(self, other: 'Parser') -> 'Series': - """The + operator generates a series-parser that applies two - parsers in sequence.""" - return Series(self, other) - - def __or__(self, other: 'Parser') -> 'Alternative': - """The | operator generates an alternative parser that applies - the first parser and, if that does not match, the second parser. - """ - return Alternative(self, other) - - @property - def grammar(self) -> 'Grammar': - return self._grammar - - @grammar.setter - def grammar(self, grammar: 'Grammar'): - if self._grammar is None: - self._grammar = grammar - self._grammar_assigned_notifier() - else: - assert self._grammar == grammar, \ - "Parser has already been assigned to a different Grammar object!" - - def _grammar_assigned_notifier(self): - """A function that notifies the parser object that it has been - assigned to a grammar.""" - pass - -
[docs] def apply(self, func: ApplyFunc) -> bool: - """ - Applies function `func(parser)` recursively to this parser and all - descendant parsers if any exist. The same function can never - be applied twice between calls of the ``reset()``-method! - Returns `True`, if function has been applied, `False` if function - had been applied earlier already and thus has not been applied again. - """ - if func in self.cycle_detection: - return False - else: - assert not self.visited, "No calls to Parser.apply() during or " \ - "after ongoing parsing process. (Call Parser.reset() first.)" - self.cycle_detection.add(func) - func(self) - return True
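A hedged sketch of ``apply`` as a one-shot visitor over the parser graph;
call it before parsing or after ``reset()``, and note that ``grammar`` and
the collecting list are illustrative::

    seen = []
    grammar.root__.apply(lambda p: seen.append(p.name or p.ptype))
    # applying the same function again is a no-op until reset() is called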
- - -
[docs]def mixin_comment(whitespace: str, comment: str) -> str: - """ - Returns a regular expression that merges comment and whitespace - regexps. Thus comments can occur wherever whitespace is allowed - and will be skipped just as implicit whitespace. - - Note that because this works on the level of regular expressions, - nesting comments is not possible. It also makes it much harder to - use directives inside comments (which isn't recommended, anyway). - """ - wspc = '(?:' + whitespace + '(?:' + comment + whitespace + ')*)' - return wspc
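A quick check of the merged expression; the values follow directly from the
string concatenation above::

    import re
    wsp = mixin_comment(whitespace=r'[\t ]*', comment=r'#.*(?:\n|$)')
    m = re.match(wsp, '  # a comment\n  ')
    assert m.group(0) == '  # a comment\n  '   # whitespace and comment consumed alike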
- - -
[docs]class UnknownParserError(KeyError): - """UnknownParserError is raised if a Grammar object is called with a - parser that does not exist or if in the course of parsing a parser - is referred to that does not exist."""
- - -
[docs]class Grammar: - r""" - Class Grammar directs the parsing process and stores global state - information of the parsers, i.e. state information that is shared - across parsers. - - Grammars are basically collections of parser objects, which are - connected to an instance object of class Grammar. There exist two - ways of connecting parsers to grammar objects: Either by passing - the root parser object to the constructor of a Grammar object - ("direct instantiation"), or by assigning the root parser to the - class variable "root__" of a descendant class of class Grammar. - - Example for direct instantiation of a grammar:: - - >>> number = RE('\d+') + RE('\.') + RE('\d+') | RE('\d+') - >>> number_parser = Grammar(number) - >>> number_parser("3.1416").content - '3.1416' - - Collecting the parsers that define a grammar in a descendant class of - class Grammar and assigning the named parsers to class variables - rather than global variables has several advantages: - - 1. It keeps the namespace clean. - - 2. The parser names of named parsers do not need to be passed to the - constructor of the Parser object explicitly, but it suffices to - assign them to class variables, which results in better - readability of the Python code. - - 3. The parsers in the class do not necessarily need to be connected - to one single root parser, which is helpful for testing and - building up a parser successively of several components. - - As a consequence, though, it is highly recommended that a Grammar - class should not define any other variables or methods with names - that are legal parser names. A name ending with a double - underscore '__' is *not* a legal parser name and can safely be - used. - - Example:: - - class Arithmetic(Grammar): - # special fields for implicit whitespace and comment configuration - COMMENT__ = r'#.*(?:\n|$)'  # Python style comments - wspR__ = mixin_comment(whitespace=r'[\t ]*', comment=COMMENT__) - - # parsers - expression = Forward() - INTEGER = RE('\\d+') - factor = INTEGER | Token("(") + expression + Token(")") - term = factor + ZeroOrMore((Token("*") | Token("/")) + factor) - expression.set(term + ZeroOrMore((Token("+") | Token("-")) + term)) - root__ = expression - - Upon instantiation the parser objects are deep-copied to the - Grammar object and assigned to object variables of the same name. - Any parser that is directly assigned to a class variable is a - 'named' parser and its field `parser.name` contains the variable - name after instantiation of the Grammar class. All other parsers, - i.e. parsers that are defined within a `named` parser, remain - "anonymous parsers" where `parser.name` is the empty string, unless - a name has been passed explicitly upon instantiation. - If one and the same parser is assigned to several class variables - such as, for example, the parser `expression` in the example above, - the first name sticks. - - Grammar objects are callable. Calling a grammar object with a UTF-8 - encoded document, initiates the parsing of the document with the - root parser. The return value is the concrete syntax tree. Grammar - objects can be reused (i.e. called again) after parsing. Thus, it - is not necessary to instantiate more than one Grammar object per - thread. - - Grammar classes contain a few special class fields for implicit - whitespace and comments that should be overwritten, if the defaults - (no comments, horizontal right aligned whitespace) don't fit: - - Attributes: - COMMENT__:  regular expression string for matching comments - - WSP__:  regular expression for whitespace and comments - - wspL__:  regular expression string for left aligned whitespace, - which either equals WSP__ or is empty. - - wspR__:  regular expression string for right aligned whitespace, - which either equals WSP__ or is empty. - - root__:  The root parser of the grammar. Theoretically, all parsers of the - grammar should be reachable by the root parser. However, for testing - of yet incomplete grammars class Grammar does not assume that this - is the case. - - parser_initialization__:  Before the parser class (!) has been initialized, - which happens upon the first time it is instantiated (see - :func:`_assign_parser_names__()` for an explanation), this class - field contains a value other than "done". A value of "done" indicates - that the class has already been initialized. - - python_src__:  For the purpose of debugging and inspection, this field can - take the Python source of the concrete grammar class - (see `dsl.grammar_provider`). - - Attributes: - all_parsers__:  A set of all parsers connected to this grammar object - - history_tracking__:  A flag indicating that the parsing history shall - be tracked - - whitespace__:  A parser for the implicit optional whitespace (or the - zombie-parser if the default is empty). The default - whitespace will be used by parsers :class:`Token` and, if no - other parsers are passed to its constructor, by parser - :class:`RE`. It can also be placed explicitly in the - EBNF-Grammar via the "~"-sign. - - wsp_left_parser__:  The same as ``whitespace`` for - left-adjacent-whitespace. - - wsp_right_parser__:  The same as ``whitespace`` for - right-adjacent-whitespace. - - _dirty_flag__:  A flag indicating that the Grammar has been called at - least once so that the parsing-variables need to be reset - when it is called again. - - document__:  the text that has most recently been parsed or that is - currently being parsed. - - document_length__:  the length of the document. - - document_lbreaks__:  list of linebreaks within the document, starting - with -1 and ending with EOF. This helps generating line - and column number for history recording and will only be - initialized if :attr:`history_tracking__` is true. - - tree__: The root-node of the parsing tree. This variable is available - for error-reporting already during parsing via - ``self.grammar.tree__.add_error``, but it references the full - parsing tree only after parsing has been finished. - - _reversed__:  the same text in reverse order - needed by the `Lookbehind`- - parsers. - - variables__:  A mapping for variable names to a stack of their respective - string values - needed by the :class:`Capture`-, :class:`Retrieve`- - and :class:`Pop`-parsers. - - rollback__:  A list of tuples (location, rollback-function) that are - deposited by the :class:`Capture`- and :class:`Pop`-parsers. - If the parsing process reaches a dead end then all - rollback-functions up to the point to which it retreats will be - called and the state of the variable stack restored accordingly. - - last_rb__loc__:  The last, i.e. most advanced location in the text - where a variable changing operation occurred. If the parser - backtracks to a location at or before last_rb__loc__ (i.e. - location <= last_rb__loc__) then a rollback of all variable - changing operations is necessary that occurred after the - location to which the parser backtracks. This is done by - calling method :func:`rollback_to__(location)`. - - call_stack__:  A stack of all parsers that have been called. This - is required for recording the parser history (for debugging) - and, eventually, i.e. one day in the future, for tracing through - the parsing process. - - history__:  A list of parser-call-stacks. A parser-call-stack is - appended to the list each time a parser either matches, fails - or if a parser-error occurs. - - moving_forward__: This flag indicates that the parsing process is currently - moving forward. It is needed to reduce noise in history recording - and should not be considered as having a valid value if history - recording is turned off! (See :func:`add_parser_guard` and its local - function :func:`guarded_call`) - - recursion_locations__:  Stores the locations where left recursion was - detected. Needed to provide minimal memoization for the left - recursion detection algorithm, but, strictly speaking, superfluous - if full memoization is enabled. (See :func:`add_parser_guard` and its - local function :func:`guarded_call`) - - memoization__:  Turns full memoization on or off. Turning memoization off - results in less memory usage and sometimes reduced parsing time. - In some situations it may drastically increase parsing time, so - it is safer to leave it on. (Default: on) - - left_recursion_handling__:  Turns left recursion handling on or off. - If turned off, a recursion error will result in case of left - recursion. - """ - python_src__ = ''  # type: str - root__ = ZOMBIE_PARSER  # type: ParserBase - # root__ must be overwritten with the root-parser by grammar subclass - parser_initialization__ = "pending"  # type: str - # some default values - COMMENT__ = r''  # type: str  # r'#.*(?:\n|$)' - WSP__ = mixin_comment(whitespace=r'[\t ]*', comment=COMMENT__)  # type: str - wspL__ = ''  # type: str - wspR__ = WSP__  # type: str - - - @classmethod - def _assign_parser_names__(cls): - """ - Initializes the `parser.name` fields of those - Parser objects that are directly assigned to a class field with - the field's name, e.g.:: - - class Grammar(Grammar): - ... - symbol = RE('(?!\\d)\\w+') - - After the call of this method symbol.name == "symbol" - holds. Names assigned via the ``name``-parameter of the - constructor will not be overwritten. Parser names starting or - ending with a double underscore like ``root__`` will be - ignored. See :func:`sane_parser_name()` - - This is done only once, upon the first instantiation of the - grammar class! - - Attention: If there exists more than one reference to the same - parser, only the first one will be chosen for Python versions - greater than or equal to 3.6. For Python versions <= 3.5 an - arbitrarily selected reference will be chosen. See PEP 520 - (www.python.org/dev/peps/pep-0520/) for an explanation of why.
- """ - if cls.parser_initialization__ != "done": - cdict = cls.__dict__ - for entry, parser in cdict.items(): - if isinstance(parser, Parser) and sane_parser_name(entry): - if not parser.name: - parser._name = entry - if isinstance(parser, Forward) and (not cast(Forward, parser).parser.name): - cast(Forward, parser).parser._name = entry - cls.parser_initialization__ = "done" - - - def __init__(self, root: Parser = None) -> None: - # if not hasattr(self.__class__, 'parser_initialization__'): - # self.__class__.parser_initialization__ = "pending" - # if not hasattr(self.__class__, 'wspL__'): - # self.wspL__ = '' - # if not hasattr(self.__class__, 'wspR__'): - # self.wspR__ = '' - self.all_parsers__ = set() # type: Set[ParserBase] - self._dirty_flag__ = False # type: bool - self.history_tracking__ = False # type: bool - self.memoization__ = True # type: bool - self.left_recursion_handling__ = True # type: bool - self._reset__() - - # prepare parsers in the class, first - self._assign_parser_names__() - - # then deep-copy the parser tree from class to instance; - # parsers not connected to the root object will be copied later - # on demand (see Grammar.__getitem__()). Usually, the need to - # do so only arises during testing. - self.root__ = copy.deepcopy(root) if root else copy.deepcopy(self.__class__.root__) - - if self.WSP__: - try: - probe = self.whitespace__ # type: RegExp - assert self.whitespace__.regexp.pattern == self.WSP__ - except AttributeError: - self.whitespace__ = Whitespace(self.WSP__) # type: RegExp - self.whitespace__.grammar = self - self.all_parsers__.add(self.whitespace__) # don't you forget about me... - else: - self.whitespace__ = cast(RegExp, ZOMBIE_PARSER) - - assert not self.wspL__ or self.wspL__ == self.WSP__ - assert not self.wspR__ or self.wspR__ == self.WSP__ - self.wsp_left_parser__ = self.whitespace__ if self.wspL__ else ZOMBIE_PARSER - self.wsp_right_parser__ = self.whitespace__ if self.wspR__ else ZOMBIE_PARSER - - self.root__.apply(self._add_parser__) - - - def __getitem__(self, key): - try: - return self.__dict__[key] - except KeyError: - parser_template = getattr(self, key, None) - if parser_template: - # add parser to grammar object on the fly... - parser = copy.deepcopy(parser_template) - parser.apply(self._add_parser__) - # assert self[key] == parser - return self[key] - raise UnknownParserError('Unknown parser "%s" !' % key) - - - def _reset__(self): - self.tree__ = RootNode() # type: RootNode - self.document__ = EMPTY_STRING_VIEW # type: StringView - self._reversed__ = EMPTY_STRING_VIEW # type: StringView - self.document_length__ = 0 # type: int - self.document_lbreaks__ = [] # type: List[int] - # variables stored and recalled by Capture and Retrieve parsers - self.variables__ = defaultdict(lambda :[]) # type: DefaultDict[str, List[str]] - self.rollback__ = [] # type: List[Tuple[int, Callable]] - self.last_rb__loc__ = -1 # type: int - # support for call stack tracing - self.call_stack__ = [] # type: List[Parser] - # snapshots of call stacks - self.history__ = [] # type: List[HistoryRecord] - # also needed for call stack tracing - self.moving_forward__ = False # type: bool - self.recursion_locations__ = set() # type: Set[int] - - - @property - def reversed__(self) -> StringView: - """ - Returns a reversed version of the currently parsed document. As - about the only case where this is needed is the Lookbehind-parser, - this is done lazily. 
- """ - if not self._reversed__: - self._reversed__ = StringView(self.document__.text[::-1]) - return self._reversed__ - - - def _add_parser__(self, parser: Parser) -> None: - """ - Adds the particular copy of the parser object to this - particular instance of Grammar. - """ - if parser.name: - # prevent overwriting instance variables or parsers of a different class - assert parser.name not in self.__dict__ or \ - isinstance(self.__dict__[parser.name], parser.__class__), \ - ('Cannot add parser "%s" because a field with the same name ' - 'already exists in grammar object!' % parser.name) - setattr(self, parser.name, parser) - self.all_parsers__.add(parser) - parser.grammar = self - - - def __call__(self, document: str, start_parser="root__") -> Node: - """ - Parses a document with with parser-combinators. - - Args: - document (str): The source text to be parsed. - start_parser (str): The name of the parser with which to - start. This is useful for testing particular parsers - (i.e. particular parts of the EBNF-Grammar.) - Returns: - Node: The root node ot the parse tree. - """ - - def tail_pos(predecessors: Union[List[Node], Tuple[Node, ...]]) -> int: - """Adds the position after the last node in the list of - predecessors to the node.""" - return predecessors[-1].pos + len(predecessors[-1]) if predecessors else 0 - - # assert isinstance(document, str), type(document) - if self.root__ is None: - raise NotImplementedError() - if self._dirty_flag__: - self._reset__() - for parser in self.all_parsers__: - parser.reset() - else: - self._dirty_flag__ = True - self.history_tracking__ = is_logging() - self.document__ = StringView(document) - self.document_length__ = len(self.document__) - self.document_lbreaks__ = linebreaks(document) if self.history_tracking__ else [] - self.last_rb__loc__ = -1 # rollback location - parser = self[start_parser] if isinstance(start_parser, str) else start_parser - assert parser.grammar == self, "Cannot run parsers from a different grammar object!" \ - " %s vs. %s" % (str(self), str(parser.grammar)) - result = None # type: Optional[Node] - stitches = [] # type: List[Node] - rest = self.document__ - if not rest: - result, _ = parser(rest) - if result is None: - result = Node(None, '').init_pos(0) - self.tree__.new_error(result, - 'Parser "%s" did not match empty document.' % str(parser)) - while rest and len(stitches) < MAX_DROPOUTS: - result, rest = parser(rest) - if rest: - fwd = rest.find("\n") + 1 or len(rest) - skip, rest = rest[:fwd], rest[fwd:] - if result is None: - error_msg = 'Parser did not match! Invalid source file?' \ - '\n Most advanced: %s\n Last match: %s;' % \ - (str(HistoryRecord.most_advanced_match(self.history__)), - str(HistoryRecord.last_match(self.history__))) - else: - stitches.append(result) - error_msg = "Parser stopped before end" + \ - (("! trying to recover" + - (" but stopping history recording at this point." - if self.history_tracking__ else "...")) - if len(stitches) < MAX_DROPOUTS - else " too often! Terminating parser.") - stitches.append(Node(None, skip).init_pos(tail_pos(stitches))) - self.tree__.new_error(stitches[-1], error_msg) - if self.history_tracking__: - # # some parsers may have matched and left history records with nodes != None. - # # Because these are not connected to the stitched root node, their pos- - # # properties will not be initialized by setting the root node's pos property - # # to zero. 
Therefore, their pos properties need to be initialized here - # for record in self.history__: - # if record.node and record.node._pos < 0: - # record.node.init_pos(0) - record = HistoryRecord(self.call_stack__.copy(), stitches[-1], rest) - self.history__.append(record) - # stop history tracking when parser returned too early - self.history_tracking__ = False - if stitches: - if rest: - stitches.append(Node(None, rest)) - result = Node(None, tuple(stitches)).init_pos(0) - if any(self.variables__.values()): - error_str = "Capture-retrieve-stack not empty after end of parsing: " \ - + str(self.variables__) - if result: - if result.children: - # add another child node at the end to ensure that the position - # of the error will be the end of the text. Otherwise, the error - # message above ("...after end of parsing") would appear illogical. - error_node = Node(ZOMBIE_PARSER, '').init_pos(tail_pos(result.children)) - self.tree__.new_error(error_node, error_str) - result.result = result.children + (error_node,) - else: - self.tree__.new_error(result, error_str) - # result.pos = 0 # calculate all positions - # result.collect_errors(self.document__) - self.tree__.swallow(result) - return self.tree__ - - -
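A hedged sketch of the ``start_parser`` argument, using the ``Arithmetic``
example from the class docstring; the output assumes that the implicit
whitespace swallows the blanks::

    arithmetic = Arithmetic()
    print(arithmetic('(3 * 4)', 'factor').content)   # '(3 * 4)'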
[docs] def push_rollback__(self, location, func): - """ - Adds a rollback function that either removes or re-adds - values on the variable stack (`self.variables`) that have been - added (or removed) by Capture or Pop Parsers, the results of - which have been dismissed. - """ - self.rollback__.append((location, func)) - self.last_rb__loc__ = location
- - -
[docs] def rollback_to__(self, location): - """ - Rolls back the variable stacks (`self.variables`) to their - state at an earlier location in the parsed document. - """ - while self.rollback__ and self.rollback__[-1][0] >= location: - _, rollback_func = self.rollback__.pop() - # assert not loc > self.last_rb__loc__, \ - #     "Rollback confusion: line %i, col %i < line %i, col %i" % \ - #     (*line_col(self.document__, len(self.document__) - loc), - #      *line_col(self.document__, len(self.document__) - self.last_rb__loc__)) - rollback_func() - self.last_rb__loc__ = self.rollback__[-1][0] if self.rollback__ \ - else (len(self.document__) + 1)
- - -def dsl_error_msg(parser: Parser, error_str: str) -> str: - """ - Returns an error message for errors in the parser configuration, - e.g. errors that result in infinite loops. - - Args: - parser (Parser):  The parser where the error was noticed. Note - that this is not necessarily the parser that caused the - error but only where the error became apparent. - error_str (str):  A short string describing the error. - Returns: - str: An error message including the call stack if history - tracking has been turned on in the grammar object. - """ - msg = ["DSL parser specification error:", error_str, 'Caught by parser "%s".' % str(parser)] - if parser.grammar.history__: - msg.extend(["\nCall stack:", parser.grammar.history__[-1].stack]) - else: - msg.extend(["\nEnable history tracking in Grammar object to display call stack."]) - return " ".join(msg) - - -######################################################################## -# -# Token and Regular Expression parser classes (i.e. leaf classes) -# -######################################################################## - - -
[docs]class PreprocessorToken(Parser): - """ - Parses tokens that have been inserted by a preprocessor. - - Preprocessors can generate Tokens with the ``make_token``-function. - These tokens start and end with magic characters that can only be - matched by the PreprocessorToken Parser. Such tokens can be used to - insert BEGIN - END delimiters at the beginning or ending of a - quoted block, for example. - """ - - def __init__(self, token: str) -> None: - assert token and token.isupper() - assert RX_TOKEN_NAME.match(token) - super().__init__(token) - - def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]: - if text[0:1] == BEGIN_TOKEN: - end = text.find(END_TOKEN, 1) - if end < 0: - node = Node(self, '') - self.grammar.tree__.new_error(node, - 'END_TOKEN delimiter missing from preprocessor token. ' - '(Most likely due to a preprocessor bug!)') # type: Node - return node, text[1:] - elif end == 0: - node = Node(self, '') - self.grammar.tree__.new_error(node, - 'Preprocessor-token cannot have zero length. ' - '(Most likely due to a preprocessor bug!)') - return node, text[2:] - elif text.find(BEGIN_TOKEN, 1, end) >= 0: - node = Node(self, text[len(self.name) + 1:end]) - self.grammar.tree__.new_error(node, - 'Preprocessor-tokens must not be nested or contain ' - 'BEGIN_TOKEN delimiter as part of their argument. ' - '(Most likely due to a preprocessor bug!)') - return node, text[end:] - if text[1:len(self.name) + 1] == self.name: - return Node(self, text[len(self.name) + 2:end]), text[end + 1:] - return None, text
- - -class PlainText(Parser): - """ - Parses plain text strings. (Could be done by RegExp as well, but is faster.) - - Example:: - - >>> while_token = PlainText("while") - >>> Grammar(while_token)("while").content - 'while' - """ - assert PLAINTEXT_PTYPE == ":PlainText" - - def __init__(self, text: str, name: str = '') -> None: - super().__init__(name) - self.text = text - self.len = len(text) - - def __deepcopy__(self, memo): - return self.__class__(self.text, self.name) - - def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]: - if text.startswith(self.text): - return Node(self, self.text, True), text[self.len:] - return None, text - - def __repr__(self): - # use single quotes only if the text does not itself contain a single quote - return ("'%s'" if self.text.find("'") < 0 else '"%s"') % self.text - - -
[docs]class RegExp(Parser): - r""" - Regular expression parser. - - The RegExp-parser parses text that matches a regular expression. - RegExp can also be considered as the "atomic parser", because all - other parsers delegate part of the parsing job to other parsers, - but do not match text directly. - - Example:: - - >>> word = RegExp(r'\w+') - >>> Grammar(word)("Haus").content - 'Haus' - - EBNF-Notation: ``/ ... /`` - - EBNF-Example: ``word = /\w+/`` - """ - - def __init__(self, regexp, name: str = '') -> None: - super().__init__(name) - self.regexp = re.compile(regexp) if isinstance(regexp, str) else regexp - - def __deepcopy__(self, memo): - # `regex` supports deep copies, but not `re` - try: - regexp = copy.deepcopy(self.regexp, memo) - except TypeError: - regexp = self.regexp.pattern - return self.__class__(regexp, self.name) - - def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]: - match = text.match(self.regexp) - if match: - capture = match.group(0) - end = text.index(match.end()) - # regular expression must never match preprocessor-tokens! - # TODO: Find a better solution here? e.g. static checking/re-mangling at compile time - i = capture.find(BEGIN_TOKEN) - if i >= 0: - capture = capture[:i] - end = i - return Node(self, capture, True), text[end:] - return None, text - - def __repr__(self): - return escape_control_characters('/%s/' % self.regexp.pattern)
- - -
[docs]class Whitespace(RegExp): - """A variant of RegExp that signifies through its class name that it - is a RegExp-parser for whitespace.""" - assert WHITESPACE_PTYPE == ":Whitespace"
- - -####################################################################### -####################################################################### -# -# WARNING: The following code is hard to maintain, because it -# introduces a special case, i.e. a parser with child parsers that is -# not a descendant of the NaryOperator, and because it interacts -# with the constructor of the Grammar class (see the instantiations of -# the Whitespace-class, there). -# -# That is all the more regrettable, as class RE basically just -# introduces syntactical sugar for -# -#     Series(whitespace__, RegExp('something'), whitespace__) -# -# What to do? Throw the syntactical sugar out? :-( Or find a more -# robust solution for that kind of syntactical sugar? Or just leave -# it be? -# -###################################################################### -###################################################################### - - -

class RE(Parser):
    r"""
    Regular expressions with optional leading or trailing whitespace.

    The RE-parser parses pieces of text that match a given regular
    expression. Unlike the ``RegExp``-parser, it can also skip
    "implicit whitespace" before or after the matched text.

    The whitespace is in turn defined by a regular expression. Make
    sure that this expression also matches the empty string, e.g. use
    r'\s*' or r'[\t ]*', but not r'\s+'. If the respective parameters
    in the constructor are set to ``None``, the default whitespace
    expression from the Grammar object will be used.

    Example (allowing whitespace on the right hand side, but not on
    the left hand side of a regular expression)::

        >>> word = RE(r'\w+', wR=r'\s*')
        >>> parser = Grammar(word)
        >>> result = parser('Haus ')
        >>> result.content
        'Haus '
        >>> result.structure
        '(:RE (:RegExp "Haus") (:Whitespace " "))'
        >>> str(parser(' Haus'))
        ' <<< Error on " Haus" | Parser did not match! Invalid source file?\n Most advanced: None\n Last match: None; >>> '

    EBNF-Notation:  ``/ ... /~`` or ``~/ ... /`` or ``~/ ... /~``

    EBNF-Example:   ``word = /\w+/~``
    """

    def __init__(self, regexp, wL=None, wR=None, name: str = '') -> None:
        r"""Constructor for class RE.

        Args:
            regexp (str or regex object):  The regular expression to be
                used for parsing.
            wL (str or regexp):  Left whitespace regular expression,
                i.e. either ``None``, the empty string or a regular
                expression (e.g. "\s*") that defines whitespace. An
                empty string means that no whitespace will be skipped;
                ``None`` means that the default whitespace will be used.
            wR (str or regexp):  Right whitespace regular expression.
                See above.
            name:  The optional name of the parser.
        """
        super().__init__(name)
        self.rx_wsl = wL
        self.rx_wsr = wR
        self.wsp_left = Whitespace(wL) if wL else ZOMBIE_PARSER
        self.wsp_right = Whitespace(wR) if wR else ZOMBIE_PARSER
        self.main = self.create_main_parser(regexp)

    def __deepcopy__(self, memo):
        try:
            regexp = copy.deepcopy(self.main.regexp, memo)
        except TypeError:
            regexp = self.main.regexp.pattern
        return self.__class__(regexp, self.rx_wsl, self.rx_wsr, self.name)

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        txt = text  # type: StringView
        wsl, txt = self.wsp_left(txt)
        main, txt = self.main(txt)
        if main:
            wsr, txt = self.wsp_right(txt)
            result = tuple(nd for nd in (wsl, main, wsr) if nd)
            return Node(self, result), txt
        return None, text

    def __repr__(self):
        wsl = '~' if self.wsp_left != ZOMBIE_PARSER else ''
        wsr = '~' if self.wsp_right != ZOMBIE_PARSER else ''
        return wsl + '/%s/' % self.main.regexp.pattern + wsr

    def _grammar_assigned_notifier(self):
        if self.grammar:
            # use default whitespace parsers if not otherwise specified
            if self.rx_wsl is None:
                self.wsp_left = self.grammar.wsp_left_parser__
            if self.rx_wsr is None:
                self.wsp_right = self.grammar.wsp_right_parser__
    def apply(self, func: Parser.ApplyFunc) -> bool:
        if super().apply(func):
            if self.rx_wsl:
                self.wsp_left.apply(func)
            if self.rx_wsr:
                self.wsp_right.apply(func)
            self.main.apply(func)
            return True
        return False

    def create_main_parser(self, arg) -> Parser:
        """Creates the main parser of this compound parser. Can be overridden."""
        return RegExp(arg)
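``create_main_parser`` is the hook that subclasses override to swap out what
the "main" parser matches while keeping the implicit-whitespace handling. As
a hedged illustration (a hypothetical subclass, not part of the library), an
RE-variant that matches a literal string via ``PlainText`` instead of a
regular expression could look like this, which is in spirit what class
``Token`` below does::

    class LiteralRE(RE):
        """Hypothetical RE-variant whose main parser matches a literal
        string instead of a regular expression."""

        def create_main_parser(self, arg) -> Parser:
            # `arg` is taken as a plain string, not a regex pattern.
            # Note: a complete implementation would also override
            # __repr__ and __deepcopy__, which assume that the main
            # parser has a `regexp` attribute.
            return PlainText(arg)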

class Token(RE):
    """
    Class Token parses simple strings. Any regular expression
    commands will be interpreted as a simple sequence of characters.

    Other than that, class Token is essentially a renamed version of
    class RE. Because tokens often have a particular semantic different
    from other REs, parsing them with a separate parser class allows to