FlexibleEBNF.ebnf 6.56 KB
Newer Older
Eckhart Arnold's avatar
Eckhart Arnold committed
1
2
# EBNF-Grammar in EBNF

eckhart's avatar
eckhart committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# This grammar is tuned for flexibility, that is, it supports as many
# different flavors of EBNF as possible. However, this flexibility
# comes at the cost of some ambiguities. In particular:
#
#    1. the alternative OR-operator / could be mistaken for the start
#       of a regular expression and vice versa, and
#    2. character ranges [a-z] can be mistaken for optional blocks
#       and vice versa
#
# A strategy to avoid these ambiguities is to do all of the following:
#
#     - replace the free_char-parser by a never matching parser
#     - if this is done, it is safe to replace the char_range_heuristics-
#       parser by an always matching parser
#     - replace the regex_heuristics by an always matching parser
#
# Ambiguities can also be avoided by NOT using all the syntactic variants
# made possible by this EBNF-grammar within one and the same EBNF-document.
eckhart's avatar
eckhart committed
21
22
23
24
25

@ comment    = /(?!#x[A-Fa-f0-9])#.*(?:\n|$)|\/\*(?:.|\n)*?\*\/|\(\*(?:.|\n)*?\*\)/
    # The comment regex has three alternatives; a comment is either
    #   - python-style: # ... up to the end of the line (or of the text); the
    #     leading negative lookahead keeps character markers such as #x20 from
    #     being read as the start of a comment
    #   - C-style: /* ... */ (may span several lines, ends at the first */)
    #   - pascal/modula/oberon-style: (* ... *) (may span several lines, ends
    #     at the first *) )
26
27
# Insignificant whitespace is any (possibly empty) run of whitespace
# characters; since \s also matches line breaks it may span several lines.
@ whitespace = /\s*/                            # whitespace includes linefeed
# Whitespace on the right-hand side of a literal is skipped implicitly.
@ literalws  = right                            # trailing whitespace of literals will be ignored tacitly
28
# Symbols declared as disposable.
# NOTE(review): presumably the nodes of these symbols are not kept under their
# own name in the syntax tree -- confirm against the DHParser documentation.
@ disposable = component, pure_elem, countable, FOLLOW_UP, SYM_REGEX, ANY_SUFFIX, EOF
eckhart's avatar
eckhart committed
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# drop:             content that is left out of the tree altogether
# RNG_BRACE_filter: procedure applied to the stored RNG_BRACE value when it is
#                   retrieved (RNG_BRACE is popped at the end of the "range" rule)
@ drop       = whitespace, EOF                  # do not include these even in the concrete syntax tree
@ RNG_BRACE_filter = matching_bracket()         # filter or transform content of RNG_BRACE on retrieve

# re-entry-rules for resuming after parsing-error
#
# Both rules look for a line break (plus following whitespace) after which
# either a directive ("@") or something that looks like a definition (a word
# followed by "=") begins. The lookahead keeps that text itself unconsumed.
# The word is matched by a single \w+ : the stacked form \w+\w* accepts exactly
# the same strings, but makes the regex engine try every way of splitting the
# word between the two quantifiers before it can report a non-match.
# NOTE(review): only "=" is recognised as definition sign here, whereas DEF
# also allows ":=", "::=", "<-" and ":" -- confirm whether this is intended.

@ definition_resume = /\n\s*(?=@|\w+\s*=)/
@ directive_resume  = /\n\s*(?=@|\w+\s*=)/


# specialized error messages for certain cases
#
# definition_error pairs a regex with a message text; the message is written
# as several chained string literals.
# NOTE(review): presumably the message is reported when a mandatory part of a
# "definition" fails to match and the text at that position matches the regex
# (here: a comma) -- confirm against the DHParser documentation.

@ definition_error  = /,/, 'Delimiter "," not expected in definition!\nEither this was meant to '
                           'be a directive and the directive symbol @ is missing\nor the error is '
                           'due to inconsistent use of the comma as a delimiter\nfor the elements '
                           'of a sequence.'

45

46
47
#: top-level

eckhart's avatar
eckhart committed
48
# Root rule: optional leading whitespace, then any number of definitions and
# directives in arbitrary order, then the end of the file.
syntax     = ~ { definition | directive } EOF
49
# A rule definition: symbol, definition sign, expression and end-of-line sign.
# The "§" after the symbol marks all following parts as mandatory. DEF, OR and
# ENDL are referenced through the retrieve operator ":" (see retrieveop). The
# closing lookahead demands that a directive, a symbol or the end of the file
# comes next, without consuming it.
definition = symbol §:DEF~ [ :OR~ ] expression :ENDL~ & FOLLOW_UP  # [:OR~] to support v. Rossum's syntax
eckhart's avatar
eckhart committed
50

51
# A directive: "@", a (mandatory) symbol, "=" and one or more comma-separated
# components. As with definitions, the closing lookahead checks that a
# directive, a symbol or the end of the file follows.
directive  = "@" §symbol "=" component { "," component } & FOLLOW_UP
Eckhart Arnold's avatar
Eckhart Arnold committed
52
53
  # One value of a directive. The alternatives are tried in the order given:
  # chained literals, a procedure name, and finally a general expression.
  # Earlier, narrower variant, kept for reference:
  # component  = (regexp | literals | procedure | symbol !DEF)
  component  = literals | procedure | expression
54
55
  # literals:  one or more literals written one after the other
  # procedure: a symbol name immediately followed by "()"
  literals   = { literal }+                       # string chaining, only allowed in directives!
  procedure  = SYM_REGEX "()"                     # procedure name, only allowed in directives!
eckhart's avatar
eckhart committed
56
57
58

# What has to come after a definition or directive (used there inside a
# positive lookahead): the "@" of a directive, a symbol, or the end of the file.
FOLLOW_UP  = `@` | symbol | EOF

Eckhart Arnold's avatar
Eckhart Arnold committed
59

60
61
#: components

eckhart's avatar
eckhart committed
62
63
# An expression is one sequence or several sequences separated by the
# (retrieved) OR sign.
expression = sequence { :OR~ sequence }
# A sequence is one item or several items (interleave or lookaround) joined by
# the AND delimiter. Before every further item two negative lookaheads make
# sure that the sequence does not run on into the next directive ("@") or into
# the next definition (a symbol followed by the definition sign).
# (A stray line number from the repository web view used to sit between the
# two lines of this rule and has been removed.)
sequence   = ["§"] ( interleave | lookaround )  # "§" means all following terms mandatory
             { !`@` !(symbol :DEF) :AND~ ["§"] ( interleave | lookaround ) }
eckhart's avatar
eckhart committed
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# interleave: one difference, or several joined by "°"; every further item may
#             be preceded by the mandatory marker "§"
# lookaround: a flow marker (see flowmarker) that must be followed by a
#             oneormore or by an element without suffix
# difference: a term, optionally followed by "-" and then, mandatorily, a
#             oneormore or an element without suffix
# term:       alternatives are tried in the order given; pure_elem comes last
interleave = difference { "°" ["§"] difference }
lookaround = flowmarker § (oneormore | pure_elem)
difference = term ["-" § (oneormore | pure_elem)]
term       = oneormore | counted | repetition | option | pure_elem


#: elements

# countable:  what may be combined with a range or a multiplier (see counted)
# pure_elem:  an element that must not be followed by one of the suffixes
#             ?, * or + (the check is mandatory because of the "§")
# element:    the basic building blocks; alternatives are tried top to bottom
# ANY_SUFFIX: the three one-character suffixes ?, * and +
countable  = option | oneormore | element
pure_elem  = element § !ANY_SUFFIX              # element strictly without a suffix
element    = [retrieveop] symbol !:DEF          # negative lookahead to be sure it's not a definition
           | literal
           | plaintext
           | regexp
           | char_range
           | character ~                        # character code, followed by optional whitespace
           | any_char
           | whitespace
           | group


ANY_SUFFIX = /[?*+]/

Eckhart Arnold's avatar
Eckhart Arnold committed
88

89
90
#: flow-operators

91
# Markers that turn the following item into a lookahead or lookbehind check.
# (A stray line number from the repository web view used to sit between the
# two lines of this rule and has been removed.)
flowmarker = "!"  | "&"                         # '!' negative lookahead, '&' positive lookahead
           | "<-!" | "<-&"                      # '<-!' negative lookbehind, '<-&' positive lookbehind
93
# Operators that may precede a symbol inside an element. The two-character
# operators are listed before ":" because ":" is a prefix of both of them.
retrieveop = "::" | ":?" | ":"                  # '::' pop, ':?' optional pop, ':' retrieve
Eckhart Arnold's avatar
Eckhart Arnold committed
94

eckhart's avatar
eckhart committed
95

96
97
#: groups

eckhart's avatar
eckhart committed
98
99
100
101
102
103
104
105
106
107
# group:      a parenthesised expression
# oneormore:  an expression in "{" ... "}+", or an element with suffix "+"
# repetition: an expression in "{" ... "}", or an element with suffix "*"
# option:     an expression in "[" ... "]" that is not a character range, or an
#             element with suffix "?"
# counted:    a countable with a range, a countable followed by the TIMES sign
#             and a multiplier, or a multiplier followed by the TIMES sign and
#             a (mandatory) countable
# The no_range guard after an opening bracket (and after the "*" suffix)
# rejects text that starts with a multiplier, see no_range.
# oneormore carries no "§" before its expression, in contrast to group and
# repetition.
group      = "(" no_range §expression ")"
oneormore  = "{" no_range expression "}+" | element "+"
repetition = "{" no_range §expression "}" | element "*" no_range
option     = !char_range "[" §expression "]" | element "?"
counted    = countable range | countable :TIMES~ multiplier | multiplier :TIMES~ §countable

# range:      an opening brace sign, a multiplier, optionally the range
#             delimiter and a second multiplier, and finally the popped
#             RNG_BRACE value (transformed by RNG_BRACE_filter).
# no_range:   guard used where an opening bracket or a "*" must not be read as
#             the start of a range or multiplier. It succeeds if no multiplier
#             follows, or if a multiplier follows that is itself followed by
#             the TIMES sign (as in "3 * a"). Neither alternative consumes text.
#             The second lookahead has to span both the multiplier and :TIMES.
#             Written as "&multiplier :TIMES", the lookahead consumes nothing,
#             so :TIMES would be tried at the position of the digits and the
#             alternative could never match.
# multiplier: a positive integer without leading zero, plus optional whitespace.
range      = RNG_BRACE~ multiplier [ :RNG_DELIM~ multiplier ] ::RNG_BRACE~
no_range   = !multiplier | &(multiplier :TIMES)
multiplier = /[1-9]\d*/~

Eckhart Arnold's avatar
Eckhart Arnold committed
108

109
110
#: leaf-elements

eckhart's avatar
eckhart committed
111
# A symbol name (see SYM_REGEX) followed by optional whitespace.
symbol     = SYM_REGEX ~                        # e.g. expression, term, parameter_list
112
113
# A literal is delimited either by double or by single quotes. Inside it, the
# delimiter may appear when escaped with a backslash; the match is non-greedy,
# i.e. it ends at the first closing delimiter that is not consumed as escaped.
literal    = /"(?:(?<!\\)\\"|[^"])*?"/~         # e.g. "(", '+', 'while'
           | /'(?:(?<!\\)\\'|[^'])*?'/~         # whitespace following literals will be ignored tacitly.
eckhart's avatar
eckhart committed
114
# Plain text is delimited either by backticks or by acute accents; the
# delimiter may appear inside when escaped with a backslash.
# (Residue from the repository web view used to sit between the two
# alternatives of this rule and has been removed.)
plaintext  = /`(?:(?<!\\)\\`|[^`])*?`/~         # like literal but does not eat whitespace
           | /´(?:(?<!\\)\\´|[^´])*?´/~
# regexp:     lead-in sign, regex core and lead-out sign, then optional whitespace
regexp     = :RE_LEADIN RE_CORE :RE_LEADOUT ~   # e.g. /\w+/, ~/#.*(?:\n|$)/~
# earlier single-regex variant, kept for reference:
# regexp     = /\/(?:(?<!\\)\\(?:\/)|[^\/])*?\//~     # e.g. /\w+/, ~/#.*(?:\n|$)/~
# char_range: "[" followed by text that passes char_range_heuristics, an
#             optional "^", a first character and any number of further items,
#             then "]". In the repeated part the sequence [`-`] character binds
#             tighter than the alternative, i.e. each item is either an
#             optionally dash-prefixed character code or a free_char.
char_range = `[` &char_range_heuristics
                 [`^`] (character | free_char) { [`-`] character | free_char } "]"
# character:  a character code: lead-in (see CH_LEADIN) plus hexadecimal digits
character  = :CH_LEADIN HEXCODE
# free_char:  any single character other than line break, "[", "]" and
#             backslash, or one of the listed backslash escapes
free_char  = /[^\n\[\]\\]/ | /\\[nrt`´'"(){}\[\]\/\\]/
any_char   = "."
123
# The tilde sign itself, followed by optional whitespace.
whitespace = /~/~                               # insignificant whitespace
eckhart's avatar
eckhart committed
124

eckhart's avatar
eckhart committed
125
126
127
128
129
#: delimiters

# End of file: nothing that matches /./ follows. The optional pops after the
# lookahead remove any value still stored for the listed delimiter symbols.
EOF = !/./ [:?DEF] [:?OR] [:?AND] [:?ENDL]      # [:?DEF], [:?OR], ... clear stack by eating stored value
           [:?RNG_DELIM] [:?BRACE_SIGN] [:?CH_LEADIN] [:?TIMES] [:?RE_LEADIN] [:?RE_LEADOUT]

130
# The definition sign. A bare colon only counts as definition sign if it is
# directly followed by a line break or by a blank.
DEF        = `=` | `:=` | `::=` | `<-` | /:\n/ | `: `  # with `: `, retrieve markers mustn't be followed by a blank!
eckhart's avatar
eckhart committed
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# Alternative, sequence and end-of-definition signs. For AND and ENDL the
# second alternative is the empty string, i.e. the sign may be left out.
OR         = `|` | `/` !regex_heuristics        # "/" only if it does not look like the start of a regex
AND        = `,` | ``
ENDL       = `;` | ``

# Signs used by range and counted.
RNG_BRACE  = :BRACE_SIGN
BRACE_SIGN = `{` | `(`
RNG_DELIM  = `,`
TIMES      = `*`

# Opening and closing sign of a regular expression.
RE_LEADIN  = `/` &regex_heuristics | `^/`       # "/" only if it looks like the start of a regex
RE_LEADOUT = `/`

# Prefix of a hexadecimal character code.
CH_LEADIN  = `0x` | `#x`

#: heuristics

# char_range_heuristics is checked right after the opening "[" of a potential
# character range (see char_range). The text is NOT accepted as the beginning
# of a character range if
#   - it starts with a line break, a tab or a blank, or
#   - after optional whitespace, it matches literal_heuristics, or
#   - it consists of a symbol name, optionally preceded by a retrieve
#     operator, that is followed only by whitespace and the closing "]".
char_range_heuristics  = ! ( /[\n\t ]/
                           | ~ literal_heuristics
                           | [`::`|`:?`|`:`] SYM_REGEX /\s*\]/ )
# literal_heuristics: an optional "~" and whitespace, then text enclosed in a
# pair of identical delimiters; one alternative each for double quotes, single
# quotes, backticks, acute accents and slashes. Between the delimiters the
# regex accepts an escaped "]", any character other than "]", or a "[" (not
# preceded by a backslash) followed by characters other than the delimiter.
literal_heuristics     = /~?\s*"(?:[\\]\]|[^\]]|[^\\]\[[^"]*)*"/
                       | /~?\s*'(?:[\\]\]|[^\]]|[^\\]\[[^']*)*'/
                       | /~?\s*`(?:[\\]\]|[^\]]|[^\\]\[[^`]*)*`/
                       | /~?\s*´(?:[\\]\]|[^\]]|[^\\]\[[^´]*)*´/
                       | /~?\s*\/(?:[\\]\]|[^\]]|[^\\]\[[^\/]*)*\//
# regex_heuristics is checked right after a "/" to decide whether that slash
# opens a regular expression (see RE_LEADIN) or is an alternative sign (see
# OR). The text counts as the beginning of a regular expression if
#   1. its first character is not a blank, or
#   2. before the next "/" on the same line it contains at least one of the
#      characters * ? + or a backslash.
# In case 2 any number of characters may stand between that character and the
# closing "/". Allowing exactly one character there would accept "/ a+ /" but
# reject "/ \d+ /".
regex_heuristics       = /[^ ]/ | /[^\/\n*?+\\]*[*?+\\][^\/\n]*\//


#: basic-regexes

# RE_CORE:   any run of characters up to the next slash, where a slash escaped
#            with a backslash is treated as part of the run; may be empty
# SYM_REGEX: one or more word characters, the first of which is not a digit
# HEXCODE:   one to eight hexadecimal digits
RE_CORE    = /(?:(?<!\\)\\(?:\/)|[^\/])*/       # core of a regular expression, i.e. the dots in /.../
SYM_REGEX  = /(?!\d)\w+/                        # regular expression for symbols
HEXCODE    = /[A-Fa-f0-9]{1,8}/