Commit 2ae8204a authored by eckhart's avatar eckhart
Browse files

- XML-parser-example: parser complete, but no AST transformation and output yet.

parent 4fab203d
......@@ -974,7 +974,7 @@ class Whitespace(RegExp):
#
# WARNING: The following code is hard to maintain, because it
# introduces a special case, i.e. a parser with child parsers that is
# not a descandent of the NaryOperator and, because it itneracts
# not a descendant of the NaryOperator and because it interacts
# with the constructor of the Grammar class (see the instantiations of
# the Whitespace-class, there).
#
......
# XML
PLACE A SHORT DESCRIPTION HERE
This is a complete XML-Parser, see: https://www.w3.org/TR/REC-xml/
Author: Eckhart Arnold <eckhart.arnold@posteo.de>
......
# XML-grammar
# XML-grammar, see https://www.w3.org/TR/REC-xml/
#######################################################################
#
......@@ -6,38 +6,203 @@
#
#######################################################################
@ whitespace = vertical # implicit whitespace, includes any number of line feeds
@ literalws = right # literals have implicit whitespace on the right hand side
@ comment = /#.*/ # comments range from a '#'-character to the end of the line
@ whitespace = /\s*/ # implicit whitespace, signified by ~
@ literalws = none # literals have no implicit whitespace
@ comment = // # no implicit comments
@ ignorecase = False # literals and regular expressions are case-sensitive
#######################################################################
#
# Structure and Components
# Document Frame and Prolog
#
#######################################################################
document = ~ { WORD } §EOF # root parser: a sequence of words preceded by whitespace
# until the end of file
document = prolog element [Misc] EOF
prolog = [ XMLDecl ] [Misc] [doctypedecl [Misc]]
XMLDecl = '<?xml' VersionInfo [EncodingDecl] [SDDecl] ~ '?>'
xml = { tag | text | comment }
tag = single_tag | tag_pair
single_tag = "<" name attributes "/>"
tag_pair = opening_tag xml closing_tag
opening_tag = "<" tag_name attributes ">"
closing_tag = "</" ::tag_name ">"
attributes = { attribute }
attribute = name "=" '"' content '"'
VersionInfo = ~ 'version' ~ '=' ~ ("'" VersionNum "'" | '"' VersionNum '"')
VersionNum = '1.' /[0-9]+/
EncodingDecl = ~ 'encoding' ~ '=' ~ ("'" EncName "'" | '"' EncName '"')
EncName = /[A-Za-z][A-Za-z0-9._\-]*/
# Standalone declaration. The alternative (Yes | No) is grouped explicitly,
# because a sequence binds more tightly than "|"; without the grouping the
# closing quote would only belong to the "No" branch.
SDDecl = ~ 'standalone' ~ '=' ~ (("'" (Yes | No) "'") | ('"' (Yes | No) '"'))
Yes = 'yes'
No = 'no'
#######################################################################
#
# Document Type Definition
#
#######################################################################
doctypedecl = '<!DOCTYPE' ~ Name [~ ExternalID] ~ ['[' intSubset ']' ~] '>'
intSubset = { markupdecl | DeclSep }
DeclSep = PEReference | S
markupdecl = elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
extSubset = [TextDecl] extSubsetDecl
extSubsetDecl = { markupdecl | conditionalSect | DeclSep }
conditionalSect = includeSect | ignoreSect
includeSect = '<![' ~ 'INCLUDE' ~ '[' extSubsetDecl ']]>'
ignoreSect = '<![' ~ 'IGNORE' ~ '[' ignoreSectContents ']]>'
ignoreSectContents = IgnoreChars {'<![' ignoreSectContents ']]>' IgnoreChars}
extParsedEnt = [TextDecl] content
TextDecl = '<?xml' [VersionInfo] EncodingDecl ~ '?>'
elementdecl = '<!ELEMENT' S Name S contentspec ~ '>'
contentspec = EMPTY | ANY | Mixed | children
EMPTY = 'EMPTY'
ANY = 'ANY'
Mixed = '(' ~ '#PCDATA' { ~ '|' ~ Name } ~ ')*'
| '(' ~ '#PCDATA' ~ ')'
children = (choice | seq) ['?' | '*' | '+']
# A choice is a first content particle followed by one or more further
# content particles, each introduced by "|".
choice = '(' ~ cp { ~ '|' ~ cp }+ ~ ')'
cp = (Name | choice | seq) ['?' | '*' | '+']
seq = '(' ~ cp { ~ ',' ~ cp } ~ ')'
AttlistDecl = '<!ATTLIST' S Name { S AttDef } ~ '>'
AttDef = Name S AttType S DefaultDecl
AttType = StringType | TokenizedType | EnumeratedType
StringType = 'CDATA'
TokenizedType = ID | IDREF | IDREFS | ENTITY | ENTITIES | NMTOKEN | NMTOKENS
ID = 'ID'
IDREF = 'IDREF'
IDREFS = 'IDREFS'
ENTITY = 'ENTITY'
ENTITIES = 'ENTITIES'
NMTOKEN = 'NMTOKEN'
NMTOKENS = 'NMTOKENS'
EnumeratedType = NotationType | Enumeration
NotationType = 'NOTATION' S '(' ~ Name { ~ '|' ~ Name } ~ ')'
Enumeration = '(' ~ Nmtoken { ~ '|' ~ Nmtoken } ~ ')'
DefaultDecl = REQUIRED | IMPLIED | FIXED
REQUIRED = '#REQUIRED'
IMPLIED = '#IMPLIED'
FIXED = ['#FIXED' S] AttValue
EntityDecl = GEDecl | PEDecl
GEDecl = '<!ENTITY' S Name S EntityDef ~ '>'
PEDecl = '<!ENTITY' S '%' S Name S PEDef ~ '>'
EntityDef = EntityValue | ExternalID [NDataDecl]
PEDef = EntityValue | ExternalID
NotationDecl = '<!NOTATION' S Name ~ (ExternalID | PublicID) ~ '>'
# An external identifier is either a system identifier or a public
# identifier that is followed by a system literal.
ExternalID = 'SYSTEM' S SystemLiteral
           | 'PUBLIC' S PubidLiteral S SystemLiteral
PublicID = 'PUBLIC' S PubidLiteral
# Notation data declaration of an unparsed entity. The keyword is spelled
# in upper case (literals are case-sensitive) and must be separated from
# the preceding external identifier by whitespace.
NDataDecl = S 'NDATA' S Name
#######################################################################
#
# Logical Structures
#
#######################################################################
element = EmptyElemTag | STag content ETag
STag = '<' TagName { ~ Attribute } ~ '>'
ETag = '</' ::TagName ~ '>'
EmptyElemTag = '<' Name { ~ Attribute } ~ '/>'
TagName = Name
Attribute = Name ~ '=' ~ AttValue
content = [ CharData ]
{ (element | Reference | CDSect | PI | Comment)
[CharData] }
name = IDENTIFIER
tag_name = IDENTIFIER
#######################################################################
#
# Regular Expressions
# Literals
#
#######################################################################
IDENTIFIER = /\w+/~ # a sequence of letters, optional trailing whitespace
EOF = !/./ # no more characters ahead, end of file reached
EntityValue = '"' { /[^%&"]/ | PEReference | Reference } '"'
| "'" { /[^%&']/ | PEReference | Reference } "'"
AttValue = '"' { /[^<&"]/ | Reference } '"'
| "'" { /[^<&']/ | Reference } "'"
SystemLiteral = '"' /[^"]*/ '"' | "'" /[^']*/ "'"
PubidLiteral = '"' [PubidChars] '"'
| "'" [PubidCharsSingleQuoted] "'"
#######################################################################
#
# References
#
#######################################################################
Reference = EntityRef | CharRef
EntityRef = '&' Name ';'
PEReference = '%' Name ';'
#######################################################################
#
# Names and Tokens
#
#######################################################################
Nmtokens = Nmtoken { / / Nmtoken }
Nmtoken = NameChars
Names = Name { / / Name }
Name = NameStartChar [NameChars]
NameStartChar = /_|:|[A-Z]|[a-z]
|[\u00C0-\u00D6]|[\u00D8-\u00F6]|[\u00F8-\u02FF]
|[\u0370-\u037D]|[\u037F-\u1FFF]|[\u200C-\u200D]
|[\u2070-\u218F]|[\u2C00-\u2FEF]|[\u3001-\uD7FF]
|[\uF900-\uFDCF]|[\uFDF0-\uFFFD]
|[\U00010000-\U000EFFFF]/
NameChars = /(?:_|:|-|\.|[A-Z]|[a-z]|[0-9]
|\u00B7|[\u0300-\u036F]|[\u203F-\u2040]
|[\u00C0-\u00D6]|[\u00D8-\u00F6]|[\u00F8-\u02FF]
|[\u0370-\u037D]|[\u037F-\u1FFF]|[\u200C-\u200D]
|[\u2070-\u218F]|[\u2C00-\u2FEF]|[\u3001-\uD7FF]
|[\uF900-\uFDCF]|[\uFDF0-\uFFFD]
|[\U00010000-\U000EFFFF])+/
#######################################################################
#
# Comments, Processing Instructions and CDATA sections
#
#######################################################################
Misc = { Comment | PI | S }+
Comment = '<!--' { CommentChars | /-(?!-)/ } '-->'
PI = '<?' PITarget [~ PIChars] '?>'
# Any Name except "xml" in any mix of upper and lower case, which is reserved.
# Names that merely begin with these letters (e.g. xml-stylesheet) stay valid:
# the inner lookahead approximates "no further name character follows".
PITarget = !/[Xx][Mm][Ll](?![\w.:\-])/ Name
CDSect = '<![CDATA[' CData ']]>'
#######################################################################
#
# Characters, Explicit Whitespace and End of File
#
#######################################################################
PubidCharsSingleQuoted = /(?:\x20|\x0D|\x0A|[a-zA-Z0-9]|[-()+,.\/:=?;!*#@$_%])+/
PubidChars = /(?:\x20|\x0D|\x0A|[a-zA-Z0-9]|[-'()+,.\/:=?;!*#@$_%])+/
CharData = /(?:(?!\]\]>)[^<&])+/
CData = /(?:(?!\]\]>)(?:\x09|\x0A|\x0D|[\u0020-\uD7FF]|[\uE000-\uFFFD]|[\U00010000-\U0010FFFF]))+/
IgnoreChars = /(?:(?!(?:<!\[)|(?:\]\]>))(?:\x09|\x0A|\x0D|[\u0020-\uD7FF]|[\uE000-\uFFFD]|[\U00010000-\U0010FFFF]))+/
PIChars = /(?:(?!\?>)(?:\x09|\x0A|\x0D|[\u0020-\uD7FF]|[\uE000-\uFFFD]|[\U00010000-\U0010FFFF]))+/
CommentChars = /(?:(?!-)(?:\x09|\x0A|\x0D|[\u0020-\uD7FF]|[\uE000-\uFFFD]|[\U00010000-\U0010FFFF]))+/
CharRef = ('&#' /[0-9]+/ ';') | ('&#x' /[0-9a-fA-F]+/ ';')
Chars = /(?:\x09|\x0A|\x0D|[\u0020-\uD7FF]|[\uE000-\uFFFD]|[\U00010000-\U0010FFFF])+/
Char = /\x09|\x0A|\x0D|[\u0020-\uD7FF]|[\uE000-\uFFFD]|[\U00010000-\U0010FFFF]/
S = /\s+/ # whitespace
EOF = !/./ # no more characters ahead, end of file reached
[match:document]
M1: """
<?xml version="1.0" encoding="UTF-8"?>
<note>
<to>Tove</to>
<from>Jani</from>
<heading>Reminder</heading>
<body>Don't forget me this weekend!</body>
</note>
"""
# XMLSnippet
A strongly simplified XML parser that can be used for inline XML within
other markup languages like markdown.
Author: Eckhart Arnold <eckhart.arnold@posteo.de>
## License
XMLSnippet is open source software under the [Apache 2.0 License](https://www.apache.org/licenses/LICENSE-2.0)
Copyright YEAR AUTHOR'S NAME <EMAIL>, AFFILIATION
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# XMLSnippet-grammar
#######################################################################
#
# EBNF-Directives
#
#######################################################################
@ whitespace = vertical # implicit whitespace, includes any number of line feeds
@ literalws = right # literals have implicit whitespace on the right hand side
@ comment = /#.*/ # comments range from a '#'-character to the end of the line
@ ignorecase = False # literals and regular expressions are case-sensitive
#######################################################################
#
# Structure and Components
#
#######################################################################
document = prolog element EOF
prolog = ""
xml = { element | text | comment }
element = single_tag | tag_pair
single_tag = "<" name attributes "/>"
tag_pair = opening_tag xml closing_tag
opening_tag = "<" tag_name attributes ">"
closing_tag = "</" ::tag_name ">"
attributes = { attribute }
attribute = name "=" '"' content '"'
name = IDENTIFIER
tag_name = IDENTIFIER
#######################################################################
#
# Regular Expressions
#
#######################################################################
WORD = /\w+/~ # a sequence of word characters, optional trailing whitespace
IDENTIFIER = /\w+/~ # referenced by "name" and "tag_name", same pattern as WORD
EOF = !/./ # no more characters ahead, end of file reached
Life is but a walking shadow
#!/usr/bin/python3
"""tst_XMLSnippet_grammar.py - runs the unit tests for the XMLSnippet-grammar
"""
import os
import sys
sys.path.append(r'/home/eckhart/Entwicklung/DHParser')
scriptpath = os.path.dirname(__file__)
try:
from DHParser import dsl
import DHParser.log
from DHParser import testing
except ModuleNotFoundError:
print('Could not import DHParser. Please adjust sys.path in file '
'"%s" manually' % __file__)
sys.exit(1)
def recompile_grammar(grammar_src, force):
    """Recompile the grammar ``grammar_src`` and abort the script on failure.

    The work is delegated to ``dsl.recompile_grammar``, which is called with
    DHParser logging switched off. If that call returns a falsy value, a
    headline and the content of the file 'XMLSnippet_ebnf_ERRORS.txt' are
    printed and the process exits with status 1.

    :param grammar_src: grammar source handed on to ``dsl.recompile_grammar``.
    :param force: handed on as the ``force`` keyword argument.
    """
    with DHParser.log.logging(False):
        # recompiles Grammar only if it has changed
        succeeded = dsl.recompile_grammar(grammar_src, force=force)
        if succeeded:
            return
        headline = '\nErrors while recompiling "%s":' % grammar_src
        print(headline + '\n--------------------------------------\n\n')
        with open('XMLSnippet_ebnf_ERRORS.txt') as error_file:
            print(error_file.read())
        sys.exit(1)
def run_grammar_tests(glob_pattern):
    """Run the grammar tests and return the resulting error report.

    ``testing.grammar_suite`` is run on the 'grammar_tests' directory next to
    this script, with DHParser logging switched off and with reporting and
    verbose output enabled.

    :param glob_pattern: the only file name pattern passed to the suite.
    :return: whatever ``testing.grammar_suite`` returns; the caller treats a
        truthy value as failure.
    """
    test_directory = os.path.join(scriptpath, 'grammar_tests')
    with DHParser.log.logging(False):
        return testing.grammar_suite(
            test_directory, get_grammar, get_transformer,
            fn_patterns=[glob_pattern], report=True, verbose=True)
if __name__ == '__main__':
    # The optional first command line argument is either the name of an
    # .ebnf-file, which is then recompiled unconditionally, or a glob
    # pattern that selects the test files to be run.
    if len(sys.argv) > 1:
        arg = sys.argv[1]
    else:
        arg = '*_test_*.ini'

    if arg.endswith('.ebnf'):
        recompile_grammar(arg, force=True)
    else:
        grammar_path = os.path.join(scriptpath, 'XMLSnippet.ebnf')
        recompile_grammar(grammar_path, force=False)
        # The compiler module is imported only after the recompilation step.
        sys.path.append('.')
        from XMLSnippetCompiler import get_grammar, get_transformer
        error_report = run_grammar_tests(glob_pattern=arg)
        if error_report:
            print('\n')
            print(error_report)
            sys.exit(1)
        print('ready.\n')
......@@ -155,6 +155,22 @@ class TestRegex:
assert node.parser.name == "regex"
assert str(node) == 'abc+def'
def test_multilineRegex_wo_Comments(self):
    """Check that a regex spread over several lines without comments works.

    The EBNF snippet is compiled to Python code, the generated grammar class
    is instantiated, and 'abc+def' is parsed with its ``regex`` parser. The
    parse must be free of errors and reproduce the input text.
    """
    mlregex = r"""
regex = /\w+
[+]
\w* /
"""
    result, messages, _ = compile_source(
        mlregex, None, get_ebnf_grammar(), get_ebnf_transformer(),
        get_ebnf_compiler('MultilineRegexTest'))
    assert result
    assert not messages, str(messages)
    # Raw string: "\w" is meant as a regex escape, not a string escape.
    parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
    node = parser('abc+def', parser.regex)
    assert not node.error_flag
    assert node.parser.name == "regex"
    assert str(node) == 'abc+def'
def text_ignore_case(self):
mlregex = r"""
@ ignorecase = True
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment