FixedEBNF.ebnf 5.81 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# EBNF-Grammar in EBNF

# This is a faster version of EBNF that relies on fixed constants for delimiters,
# rather than on variables that are captured on first use as in "EBNF.ebnf".
# Different syntactical variants are not detected by the grammar itself,
# but need to be configured by adjusting the definitions of DEF, OR,
# AND, ENDL, RNG_OPEN, RNG_CLOSE, RNG_DELIM, CH_LEADIN, TIMES, RE_LEADIN,
# RE_LEADOUT, either within this grammar definition or in the Grammar-object
# by changing the `text`-field of the respective parser objects.

# Comments in grammar sources come in three flavours, one per alternative of the regex:
@ comment    = /(?!#x[A-Fa-f0-9])#.*(?:\n|$)|\/\*(?:.|\n)*?\*\/|\(\*(?:.|\n)*?\*\)/
    # comments can be either C-style: /* ... */
    # or Pascal/Modula/Oberon-style: (* ... *)
    # or Python-style: # ... \n, excluding, however, character markers: #x20
    # NOTE(review): the "#x" exclusion is hard-coded in this regex and does not follow
    # the CH_LEADIN constant (`0x` in this file) - confirm that this is intended.
@ whitespace = /\s*/                            # whitespace includes linefeed
@ literalws  = right                            # trailing whitespace of literals will be ignored tacitly
17
# Symbols declared as disposable.
# NOTE(review): presumably the nodes of these symbols may be flattened into or removed
# from the syntax tree - confirm against the parser generator's documentation.
@ disposable = component, pure_elem, countable, FOLLOW_UP, SYM_REGEX, ANY_SUFFIX, EOF
18
19
20
21
@ drop       = whitespace, EOF                  # do not include these even in the concrete syntax tree
# NOTE(review): no rule named RNG_BRACE is defined in the visible part of this grammar
# (only the constants RNG_OPEN and RNG_CLOSE are) - confirm that this filter directive
# is still needed in the fixed-delimiter variant.
@ RNG_BRACE_filter = matching_bracket()         # filter or transform content of RNG_BRACE on retrieve

# re-entry-rules for resuming after parsing-error
22
 
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# Re-entry points after a parsing error inside a definition or a directive:
# a line break plus optional whitespace, provided that the text which follows looks
# like the start of a directive ("@") or of a definition (a word followed by "=").
# The lookahead (?=...) only inspects that text, it does not consume it.
# A single \w+ is used for the word: a second adjacent quantifier (\w+\w*) would
# accept exactly the same texts, but multiply the backtracking work whenever the
# lookahead fails.
@ definition_resume = /\n\s*(?=@|\w+\s*=)/
@ directive_resume  = /\n\s*(?=@|\w+\s*=)/

# specialized error messages for certain cases

# Error directive for the rule "definition": the regular expression /,/ is paired with
# the message text that follows it. The four adjacent string literals are chained
# into one message.
# NOTE(review): presumably the message is issued when a parsing error occurs within a
# definition and the text at the error location matches /,/ - confirm.
@ definition_error  = /,/, 'Delimiter "," not expected in definition!\nEither this was meant to '
                           'be a directive and the directive symbol @ is missing\nor the error is '
                           'due to inconsistent use of the comma as a delimiter\nfor the elements '
                           'of a sequence.'

#: top-level

# A grammar is made up of optional leading whitespace, any number of definitions and
# directives in arbitrary order, and the end of the input.
syntax     = ~ { definition | directive } EOF
# A definition consists of a symbol, the definition sign (everything from here on is
# mandatory, "§"), an optional leading alternative sign, an expression and the
# end-of-definition sign. The lookahead "& FOLLOW_UP" demands that a directive, a
# symbol or the end of the input comes next, without consuming it.
definition = symbol §DEF~ [ OR~ ] expression ENDL~ & FOLLOW_UP  # [OR~] to support v. Rossum's syntax

38
# A directive starts with "@", followed by a mandatory symbol, "=" and one or more
# comma-separated components. Like a definition, it must be followed by FOLLOW_UP.
directive  = "@" §symbol "=" component { "," component } & FOLLOW_UP
di68kap's avatar
di68kap committed
39
  # component  = (regexp | literals | procedure | symbol !DEF)
40
  # a component of a directive is a chain of literals, a procedure name or an expression
  component  = literals | procedure | expression
41
42
  # one or more literals in a row are chained; a procedure is a name that is directly
  # followed by "()", e.g. matching_bracket()
  literals   = { literal }+                       # string chaining, only allowed in directives!
  procedure  = SYM_REGEX "()"                     # procedure name, only allowed in directives!
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150

# What must come after a definition or a directive: the start of a directive ("@"),
# a symbol or the end of the input. In this grammar it is used within the
# lookahead "& FOLLOW_UP" of the rules "definition" and "directive".
FOLLOW_UP  = `@` | symbol | EOF


#: components

# An expression is a list of alternatives, i.e. sequences separated by OR.
expression = sequence { OR~ sequence }
# A sequence is a series of items separated by AND; each item is either an
# interleave or a lookaround and may be preceded by the mandatory marker "§".
sequence   = ["§"] ( interleave | lookaround )  # "§" means all following terms mandatory
             { AND~ ["§"] ( interleave | lookaround ) }
# An interleave joins differences with "°"; a single difference without "°" is
# covered by this rule as well.
interleave = difference { "°" ["§"] difference }
# A lookaround is a flowmarker (lookahead or lookbehind operator) with a mandatory operand.
lookaround = flowmarker § (oneormore | pure_elem)
# A difference is a term, optionally followed by "-" and a mandatory second operand.
difference = term ["-" § (oneormore | pure_elem)]
# A term is one of the repetition forms, an option or an element without suffix.
term       = oneormore | counted | repetition | option | pure_elem


#: elements

# the items that may be combined with a range or a multiplier in the rule "counted"
countable  = option | oneormore | element
pure_elem  = element § !ANY_SUFFIX              # element strictly without a suffix
# An element is a (possibly retrieved) symbol, a literal, a plaintext, a regular
# expression, a character code, the any-character sign, the whitespace sign or a group.
# The char_range alternative is commented out, i.e. currently disabled.
element    = [retrieveop] symbol !DEF          # negative lookahead to be sure it's not a definition
           | literal
           | plaintext
           | regexp
           # | char_range
           | character ~
           | any_char
           | whitespace
           | group


# the suffixes that turn an element into an option ("?"), a repetition ("*") or a oneormore ("+")
ANY_SUFFIX = /[?*+]/


#: flow-operators

# A flowmarker introduces a lookaround (see the rule "lookaround"); a retrieve
# operator may precede a symbol within an element (see the rule "element").
flowmarker = "!"  | "&"                         # '!' negative lookahead, '&' positive lookahead
           | "<-!" | "<-&"                      # '<-!' negative lookbehind, '<-&' positive lookbehind
retrieveop = "::" | ":?" | ":"                  # '::' pop, ':?' optional pop, ':' retrieve


#: groups

# group: an expression in parentheses
group      = "(" no_range §expression ")"
# oneormore: an expression in braces that close with "}+", or an element with the suffix "+"
oneormore  = "{" no_range expression "}+" | element "+"
# repetition: an expression in braces, or an element with the suffix "*"
repetition = "{" no_range §expression "}" | element "*" no_range
# option: an expression in square brackets, or an element with the suffix "?"
option     = # !char_range
             "[" §expression "]" | element "?"
# counted: a countable followed by a range, a countable followed by TIMES and a
# multiplier, or a multiplier followed by TIMES and a mandatory countable
counted    = countable range | countable TIMES~ multiplier | multiplier TIMES~ §countable

# range: one multiplier, or two multipliers separated by RNG_DELIM, enclosed in RNG_OPEN and RNG_CLOSE
range      = RNG_OPEN~ multiplier [ RNG_DELIM~ multiplier ] RNG_CLOSE~
# no_range: a guard that is passed if no multiplier follows.
# NOTE(review): the second alternative is a sequence of the lookahead "&multiplier"
# and TIMES, so TIMES would have to match at the very position where a multiplier
# starts. With TIMES = `*` this cannot happen; possibly "&(multiplier TIMES)" was
# intended - confirm.
no_range   = !multiplier | &multiplier TIMES
# multiplier: a positive integer without leading zero, followed by optional whitespace
multiplier = /[1-9]\d*/~


#: leaf-elements

# All leaf rules except char_range, character and free_char skip trailing whitespace ("~").
symbol     = SYM_REGEX ~                        # e.g. expression, term, parameter_list
# literal: text in double or single quotes; the quote sign can be escaped with a backslash
literal    = /"(?:(?<!\\)\\"|[^"])*?"/~         # e.g. "(", '+', 'while'
           | /'(?:(?<!\\)\\'|[^'])*?'/~         # whitespace following literals will be ignored tacitly.
# plaintext: text in backticks or acute accents
plaintext  = /`(?:(?<!\\)\\`|[^`])*?`/~         # like literal but does not eat whitespace
           | /´(?:(?<!\\)\\´|[^´])*?´/~
# regexp: the core of a regular expression between RE_LEADIN and RE_LEADOUT
regexp     = RE_LEADIN RE_CORE RE_LEADOUT ~   # e.g. /\w+/, ~/#.*(?:\n|$)/~
# regexp     = /\/(?:(?<!\\)\\(?:\/)|[^\/])*?\//~     # e.g. /\w+/, ~/#.*(?:\n|$)/~
# char_range: a character set in square brackets, guarded by a heuristic lookahead.
# It is not referenced by the rule "element" at present (the alternative is commented out there).
char_range = `[` &char_range_heuristics
                 [`^`] (character | free_char) { [`-`] character | free_char } "]"
# character: a character code, i.e. CH_LEADIN followed by a hexadecimal number
character  = CH_LEADIN HEXCODE
# free_char: any character except linefeed, square brackets and backslash, or one of
# the listed characters escaped with a backslash
free_char  = /[^\n\[\]\\]/ | /\\[nrt`´'"(){}\[\]\/\\]/
any_char   = "."
whitespace = /~/~                               # insignificant whitespace

#: delimiters

# end of input: a negative lookahead that succeeds only where the regex /./ does not match
EOF = !/./

# Signs for definition (DEF) and alternative (OR). AND (separator between the items
# of a sequence) and ENDL (end of a definition) are empty, i.e. the items of a
# sequence are merely juxtaposed and definitions carry no terminating sign.
DEF        = `=`
OR         = `|`
AND        = ``
ENDL       = ``

# Signs used by the rules "range" and "counted": opening and closing sign of a
# range, delimiter between its two multipliers, and the multiplication sign.
RNG_OPEN   = `{`
RNG_CLOSE  = `}`
RNG_DELIM  = `,`
TIMES      = `*`

# signs that enclose the core of a regular expression (see the rule "regexp")
RE_LEADIN  = `/`
RE_LEADOUT = `/`

# prefix of a character code (see the rule "character"), e.g. 0x20
CH_LEADIN  = `0x`

#: heuristics

# char_range_heuristics is used as a lookahead directly after the opening "[" of a
# char_range. It is a negative lookahead that fails, if the bracket is followed by
#   - a linefeed, a tabulator or a blank,
#   - something that looks like a literal, a plaintext or a regular expression
#     (see literal_heuristics), or
#   - an optional retrieve operator and a symbol directly before the closing "]".
# NOTE(review): presumably these are the cases in which "[" opens an option rather
# than a character range - confirm.
char_range_heuristics  = ! ( /[\n\t ]/
                           | ~ literal_heuristics
                           | [`::`|`:?`|`:`] SYM_REGEX /\s*\]/ )
# one regex for each kind of delimiter: double quote, single quote, backtick, acute accent, slash
literal_heuristics     = /~?\s*"(?:[\\]\]|[^\]]|[^\\]\[[^"]*)*"/
                       | /~?\s*'(?:[\\]\]|[^\]]|[^\\]\[[^']*)*'/
                       | /~?\s*`(?:[\\]\]|[^\]]|[^\\]\[[^`]*)*`/
                       | /~?\s*´(?:[\\]\]|[^\]]|[^\\]\[[^´]*)*´/
                       | /~?\s*\/(?:[\\]\]|[^\]]|[^\\]\[[^\/]*)*\//
# NOTE(review): regex_heuristics is not referenced by any rule in the visible part of
# this grammar - confirm whether it is still needed.
regex_heuristics       = /[^ ]/ | /[^\/\n*?+\\]*[*?+\\][^\/\n]\//


#: basic-regexes

# RE_CORE accepts any run of characters other than "/"; the first alternative admits a slash that is escaped with a backslash.
RE_CORE    = /(?:(?<!\\)\\(?:\/)|[^\/])*/       # core of a regular expression, i.e. the dots in /.../
# SYM_REGEX: one or more word characters, the first of which must not be a digit
SYM_REGEX  = /(?!\d)\w+/                        # regular expression for symbols
# HEXCODE: one to eight hexadecimal digits (see the rule "character")
HEXCODE    = /[A-Fa-f0-9]{1,8}/