StepByStepGuide.html 63.2 KB
Newer Older
eckhart's avatar
eckhart committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28


<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
  <meta charset="utf-8">
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
  <title>DHParser’s Step by Step Guide &mdash; DHParser 0.8 documentation</title>
  

  
  
  
  

  

  
  
    

  

  
    <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
eckhart's avatar
eckhart committed
29
30
31
32
33
  <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
    <link rel="index" title="Index" href="genindex.html" />
    <link rel="search" title="Search" href="search.html" />
    <link rel="next" title="DHParser User’s Guide" href="UserGuide.html" />
    <link rel="prev" title="Welcome to DHParser’s documentation!" href="index.html" /> 
eckhart's avatar
eckhart committed
34
35
36
37
38
39

  
  <script src="_static/js/modernizr.min.js"></script>

</head>

eckhart's avatar
eckhart committed
40
<body class="wy-body-for-nav">
eckhart's avatar
eckhart committed
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82

   
  <div class="wy-grid-for-nav">

    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search">
          

          
            <a href="index.html" class="icon icon-home"> DHParser
          

          
          </a>

          
            
            
          

          
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>

          
        </div>

        <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
          
            
            
              
            
            
              <p class="caption"><span class="caption-text">Contents:</span></p>
eckhart's avatar
eckhart committed
83
84
85
86
87
88
89
90
91
92
93
94
95
<ul class="current">
<li class="toctree-l1 current"><a class="current reference internal" href="#">DHParser’s Step by Step Guide</a><ul>
<li class="toctree-l2"><a class="reference internal" href="#setting-up-a-new-dhparser-project">Setting up a new DHParser project</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#installing-dhparser-from-the-git-repository">Installing DHParser from the git repository</a></li>
<li class="toctree-l3"><a class="reference internal" href="#staring-a-new-dhparser-project">Starting a new DHParser project</a></li>
<li class="toctree-l3"><a class="reference internal" href="#understanding-how-compilation-of-dsl-documents-with-dhparser-works">Understanding how compilation of DSL-documents with DHParser works</a></li>
<li class="toctree-l3"><a class="reference internal" href="#the-development-workflow-for-dsls">The development workflow for DSLs</a></li>
<li class="toctree-l3"><a class="reference internal" href="#extending-the-example-dsl-further">Extending the example DSL further</a></li>
<li class="toctree-l3"><a class="reference internal" href="#controlling-abstract-syntax-tree-generation">Controlling abstract-syntax-tree generation</a></li>
</ul>
</li>
</ul>
</li>
eckhart's avatar
eckhart committed
96
97
98
99
100
101
102
103
104
105
106
107
108
109
<li class="toctree-l1"><a class="reference internal" href="UserGuide.html">DHParser User’s Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="ReferenceManual.html">DHParser Reference Manual</a></li>
<li class="toctree-l1"><a class="reference internal" href="ModuleReference.html">Module Reference</a></li>
</ul>

            
          
        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">

      
eckhart's avatar
eckhart committed
110
      <nav class="wy-nav-top" aria-label="top navigation">
eckhart's avatar
eckhart committed
111
112
113
114
115
116
117
118
        
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="index.html">DHParser</a>
        
      </nav>


      <div class="wy-nav-content">
eckhart's avatar
eckhart committed
119
        
eckhart's avatar
eckhart committed
120
        <div class="rst-content">
eckhart's avatar
eckhart committed
121
        
eckhart's avatar
eckhart committed
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
          















<div role="navigation" aria-label="breadcrumbs navigation">

  <ul class="wy-breadcrumbs">
    
      <li><a href="index.html">Docs</a> &raquo;</li>
        
      <li>DHParser’s Step by Step Guide</li>
    
    
      <li class="wy-breadcrumbs-aside">
        
            
            <a href="_sources/StepByStepGuide.rst.txt" rel="nofollow"> View page source</a>
          
        
      </li>
    
  </ul>

  
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
            
  <div class="section" id="dhparser-s-step-by-step-guide">
<h1>DHParser’s Step by Step Guide<a class="headerlink" href="#dhparser-s-step-by-step-guide" title="Permalink to this headline"></a></h1>
<p>This step by step guide goes through the whole process of designing and testing
a domain specific notation from the very start. (The terms “domain specific
notation” and “domain specific language” are used interchangeably in the
following. Both will be abbreviated by “DSL”, however.) We will design a simple
domain specific notation for poems as a teaching example. On the way we will
learn:</p>
<ol class="arabic simple">
<li>how to setup a new DHParser project</li>
<li>how to use the test driven development approach to designing a DSL</li>
<li>how to describe the syntax of a DSL with the EBNF-notation</li>
<li>how to specify the transformations for converting the concrete syntax tree
that results from parsing a text denoted in our DSL into an abstract syntax
tree that represents or comes close to representing our data model.</li>
<li>how to write a compiler that transforms the abstract syntax tree into a
target representation which might be a html page, epub or printable pdf in
the case of typical Digital-Humanities-projects.</li>
</ol>
<div class="section" id="setting-up-a-new-dhparser-project">
<h2>Setting up a new DHParser project<a class="headerlink" href="#setting-up-a-new-dhparser-project" title="Permalink to this headline"></a></h2>
<p>Since DHParser, while quite mature in terms of implemented features, is still
in a pre-first-release state, it is for the time being more recommendable to
clone the most current version of DHParser from the git-repository rather than
installing the packages from the Python Package Index (PyPI).</p>
<p>This section takes you from cloning the DHParser git repository to setting up
a new DHParser-project in the <code class="docutils literal notranslate"><span class="pre">experimental</span></code>-subdirectory and testing
whether the setup works. Similarly to current web development practices, most
eckhart's avatar
eckhart committed
191
192
193
194
of the work with DHParser is done from the shell. In the following, we assume
a Unix (Linux) environment. The same can most likely be done on other
operating systems in a very similar way, but there might be subtle
differences.</p>
eckhart's avatar
eckhart committed
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
<div class="section" id="installing-dhparser-from-the-git-repository">
<h3>Installing DHParser from the git repository<a class="headerlink" href="#installing-dhparser-from-the-git-repository" title="Permalink to this headline"></a></h3>
<p>In order to install DHParser from the git repository, open up a shell window
and type:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>$ git clone git@gitlab.lrz.de:badw-it/DHParser.git
$ cd DHParser
</pre></div>
</div>
<p>The second command changes to the DHParser directory. Within this directory
you should recognise the following subdirectories and files. There are more
files and directories for sure, but those will not concern us for now:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>DHParser/            - the DHParser python packages
documentation/       - DHParser&#39;s documentation in html-form
documentation_source - DHParser&#39;s documentation in reStructuredText-Format
examples/            - some examples for DHParser (mostly incomplete)
experimental/        - an empty directory for experimenting
test/                - DHParser&#39;s unit-tests
dhparser.py          - DHParser&#39;s command line tool for setting up projects
README.md            - General information about DHParser
LICENSE.txt          - DHParser&#39;s license. It&#39;s open source (hooray!)
Introduction.md      - An introduction and appetizer for DHParser
</pre></div>
</div>
eckhart's avatar
eckhart committed
218
219
220
221
222
223
<p>In order to verify that the installation works, you can simply run the
“dhparser.py” script and, when asked, choose “3” for the self-test. If the
self-test runs through without error, the installation has succeeded.</p>
</div>
<div class="section" id="staring-a-new-dhparser-project">
<h3>Starting a new DHParser project<a class="headerlink" href="#staring-a-new-dhparser-project" title="Permalink to this headline"></a></h3>
eckhart's avatar
eckhart committed
224
225
<p>In order to setup a new DHParser project, you run the <code class="docutils literal notranslate"><span class="pre">dhparser.py</span></code>-script
with the name of the new project. For the sake of the example, let’s type:</p>
eckhart's avatar
eckhart committed
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>$ python dhparser.py experimental/poetry
$ cd experimental/poetry
</pre></div>
</div>
<p>This creates a new DHParser-project with the name “poetry” in directory with
the same name within the subdirectory “experimental”. This new directory
contains the following files:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">README</span><span class="o">.</span><span class="n">md</span>           <span class="o">-</span> <span class="n">a</span> <span class="n">stub</span> <span class="k">for</span> <span class="n">a</span> <span class="n">readme</span><span class="o">-</span><span class="n">file</span> <span class="n">explaining</span> <span class="n">the</span> <span class="n">project</span>
<span class="n">poetry</span><span class="o">.</span><span class="n">ebnf</span>         <span class="o">-</span> <span class="n">a</span> <span class="n">trivial</span> <span class="n">demo</span> <span class="n">grammar</span> <span class="k">for</span> <span class="n">the</span> <span class="n">new</span> <span class="n">project</span>
<span class="n">example</span><span class="o">.</span><span class="n">dsl</span>         <span class="o">-</span> <span class="n">an</span> <span class="n">example</span> <span class="n">file</span> <span class="n">written</span> <span class="ow">in</span> <span class="n">this</span> <span class="n">grammar</span>
<span class="n">tst_poetry_grammar</span><span class="o">.</span><span class="n">py</span> <span class="o">-</span> <span class="n">a</span> <span class="n">python</span> <span class="n">script</span> <span class="p">(</span><span class="s2">&quot;test-script&quot;</span><span class="p">)</span> <span class="n">that</span> <span class="n">re</span><span class="o">-</span><span class="n">compiles</span>
                        <span class="n">the</span> <span class="n">grammar</span> <span class="p">(</span><span class="k">if</span> <span class="n">necessary</span><span class="p">)</span> <span class="ow">and</span> <span class="n">runs</span> <span class="n">the</span> <span class="n">unit</span> <span class="n">tests</span>
<span class="n">grammar_tests</span><span class="o">/</span><span class="mi">01</span><span class="n">_test_word</span><span class="o">.</span><span class="n">ini</span>     <span class="o">-</span> <span class="n">a</span> <span class="n">demo</span> <span class="n">unit</span> <span class="n">test</span>
<span class="n">grammar_tests</span><span class="o">/</span><span class="mi">02</span><span class="n">_test_document</span><span class="o">.</span><span class="n">ini</span> <span class="o">-</span> <span class="n">another</span> <span class="n">unit</span> <span class="n">test</span>
</pre></div>
</div>
<p>Now, if you look into the file “example.dsl” you will find that it contains a
simple sequence of words, namely “Life is but a walking shadow”. In fact, the
demo grammar that comes with a newly created project is nothing but a simple
grammar for sequences of words separated by whitespace. Now, since we already
have unit tests, our first exercise will be to run the unit tests by starting
the script “tst_poetry_grammar.py”:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>$ python tst_poetry_grammar.py
</pre></div>
</div>
<p>This will run through the unit-tests in the grammar_tests directory and print
their success or failure on the screen. If you check the contents of your
project directory after running the script, you might notice that there now
exists a new file “poetryCompiler.py” in the project directory. This is an
auto-generated compiler-script for our DSL. You can use this script to compile
any source file of your DSL, like “example.dsl”. Let’s try:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>$ python poetryCompiler.py example.dsl
</pre></div>
</div>
<p>The output is a block of pseudo-XML, looking like this:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="o">&lt;</span><span class="n">document</span><span class="o">&gt;</span>
  <span class="o">&lt;</span><span class="p">:</span><span class="n">ZeroOrMore</span><span class="o">&gt;</span>
    <span class="o">&lt;</span><span class="n">WORD</span><span class="o">&gt;</span>
      <span class="o">&lt;</span><span class="p">:</span><span class="n">RegExp</span><span class="o">&gt;</span><span class="n">Life</span><span class="o">&lt;/</span><span class="p">:</span><span class="n">RegExp</span><span class="o">&gt;</span>
      <span class="o">&lt;</span><span class="p">:</span><span class="n">Whitespace</span><span class="o">&gt;</span> <span class="o">&lt;/</span><span class="p">:</span><span class="n">Whitespace</span><span class="o">&gt;</span>
    <span class="o">&lt;/</span><span class="n">WORD</span><span class="o">&gt;</span>
    <span class="o">&lt;</span><span class="n">WORD</span><span class="o">&gt;</span>
      <span class="o">&lt;</span><span class="p">:</span><span class="n">RegExp</span><span class="o">&gt;</span><span class="ow">is</span><span class="o">&lt;/</span><span class="p">:</span><span class="n">RegExp</span><span class="o">&gt;</span>
      <span class="o">&lt;</span><span class="p">:</span><span class="n">Whitespace</span><span class="o">&gt;</span> <span class="o">&lt;/</span><span class="p">:</span><span class="n">Whitespace</span><span class="o">&gt;</span>
    <span class="o">&lt;/</span><span class="n">WORD</span><span class="o">&gt;</span>
 <span class="o">...</span>
</pre></div>
</div>
<p>Now, this does not look too helpful yet, partly, because it is cluttered with
all sorts of seemingly superfluous pseudo-XML-tags like “&lt;:ZeroOrMore&gt;”.
However, you might notice that it contains the original sequence of words
“Life is but a walking shadow” in a structured form, where each word is
(among other things) surrounded by &lt;WORD&gt;-tags. In fact, the output of the
compiler script is a pseudo-XML-representation of the <em>concrete syntax tree</em>
of our “example.dsl”-document according to the grammar specified in “poetry.ebnf”
(which we haven’t looked into yet, but we will do so soon).</p>
<p>If you see the pseudo-XML on screen, the setup of the new DHParser-project
has been successful.</p>
</div>
<div class="section" id="understanding-how-compilation-of-dsl-documents-with-dhparser-works">
<h3>Understanding how compilation of DSL-documents with DHParser works<a class="headerlink" href="#understanding-how-compilation-of-dsl-documents-with-dhparser-works" title="Permalink to this headline"></a></h3>
<p>Generally speaking, the compilation process consists of three stages:</p>
<ol class="arabic simple">
<li>Parsing a document. This yields a <em>concrete syntax tree</em> (CST) of the
document.</li>
<li>Transforming. This transforms the CST into the much more concise <em>abstract
syntax tree</em> (AST) of the document.</li>
<li>Compiling. This turns the AST into anything you’d like, for example, an
XML-representation or a relational database record.</li>
</ol>
<p>Now, DHParser can fully automize the generation of a parser from a
syntax-description in EBNF-form, like our “poetry.ebnf”, but it cannot
automize the transformation from the concrete into the abstract syntax tree
(which for the sake of brevity we will simply call “AST-Transformation” in the
following), and neither can it automize the compilation of the abstract syntax
tree into something more useful. Therefore, the AST-Transformation in the
autogenerated compile-script is simply left empty, while the compiling stage
simply converts the syntax tree into a pseudo-XML-format.</p>
<p>The latter two stages have to be coded into the compile-script by hand, with
the support of templates within this script. If the grammar of the DSL is
changed - as it will be frequently during the development of a DSL - the
parser-part of this script will be regenerated by the testing-script before
the unit tests are run. The script will notice if the grammar has changed.
This also means that the parser part of this script will be overwritten and
should never be edited by hand. The other two stages can and should be edited
by hand. Stubs for theses parts of the compile-script will only be generated
if the compile-script does not yet exist, that is, on the very first calling
of the test-srcipt.</p>
<p>Usually, if you have adjusted the grammar, you will want to run the unit tests
anyway. Therefore, the regeneration of the parser-part of the compile-script
is triggered by the test-script.</p>
</div>
<div class="section" id="the-development-workflow-for-dsls">
<h3>The development workflow for DSLs<a class="headerlink" href="#the-development-workflow-for-dsls" title="Permalink to this headline"></a></h3>
<p>When developing a domain specific notation it is recommendable to first
develop the grammar and the parser for that notation, then to the abstract
syntax tree transformations and finally to implement the compiler. Of course
one can always come back and change the grammar later. But in order to avoid
revising the AST-transformations and the compiler time and again it helps if
the grammar has been worked out before. A bit of interlocking between these
steps does not hurt, though.</p>
<p>A reasonable workflow for developing the grammar proceeds like this:</p>
<ol class="arabic">
<li><p class="first">Set out by writing down a few example documents for your DSL. It is
advisable to start with a few simple examples that use only a subset of the
intended features of your DSL.</p>
</li>
<li><p class="first">Next you sketch a grammar for your DSL that is just rich enough to capture
those examples.</p>
</li>
<li><p class="first">Right after sketching the grammar you should write test cases for your
grammar. The test cases can be small parts or snippets of your example
documents. You could also use your example documents as test cases, but
usually the test cases should have a smaller granularity to make locating
errors easier.</p>
</li>
<li><p class="first">Next, you should run the test script. Usually, some test will fail at
the first attempt. So you’ll keep revising the EBNF-grammar, adjusting and
adding test cases until all tests pass.</p>
</li>
<li><p class="first">Now it is time to try and compile the example documents. By this time the
test-script should have generated the compile-script, which can be
called with the example documents. Don’t worry too much about the output,
yet. What is important at this stage is merely whether the parser can
handle the examples or not. If not, further test cases and adjustments to the
EBNF grammar will be needed - or revision of the examples in case you
decide to use different syntactic constructs.</p>
<p>If all examples can be parsed, you go back to step one and add further more
complex examples, and continue to do so until you have the feeling that your
DSL’s grammar is rich enough for all intended application cases.</p>
</li>
</ol>
<p>Let’s try this with the trivial demo example that comes with creating a new
project with the “dhparser.py”-script. Now, you have already seen that the
“example.dsl”-document merely contains a simple sequence of words: “Life is
but a walking shadow”. Now, wouldn’t it be nice if we could end this sequence
with a full stop to turn it into a proper sentence. So, open “examples.dsl”
with a text editor and add a full stop:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">Life</span> <span class="ow">is</span> <span class="n">but</span> <span class="n">a</span> <span class="n">walking</span> <span class="n">shadow</span><span class="o">.</span>
</pre></div>
</div>
<p>Now, try to compile “examples.dsl” with the compile-script:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>$ python poetryCompiler.py example.dsl
example.dsl:1:29: Error: EOF expected; &quot;.\n &quot; found!
</pre></div>
</div>
<p>Since the grammar, obviously, did not allow full stops so far, the parser
returns an error message. The error message is pretty self-explanatory in this
case. (Often, you will unfortunately find that the error message are somewhat
difficult to decipher. In particular, because it so happens that an error the
parser complains about is just the consequence of an error made at an earlier
location that the parser may not have been able to recognize as such. We will
learn more about how to avoid such situations, later.) EOF is actually the
name of a parser that captures the end of the file, thus “EOF”! But instead of
the expected end of file an, as of now, unparsable construct, namely a full
stop followed by a line feed, signified by “\n”, was found.
<p>Let’s have look into the grammar description “poetry.ebnf”. We ignore the
beginning of the file, in particular all lines starting with “&#64;” as these
lines do not represent any grammar rules, but meta rules or so-called
“directives” that determine some general characteristics of the grammar, such
as whitespace-handling or whether the parser is going to be case-sensitive.
Now, there are exactly three rules that make up this grammar:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>document = ~ { WORD } §EOF
WORD     =  /\w+/~
EOF      =  !/./
</pre></div>
</div>
<p>EBNF-Grammars describe the structure of a domain specific notation in top-down
fashion. Thus, the first rule in the grammar describes the components out of
which a text or document in the domain specific notation is composed as a
whole. The following rules then break down the components into even smaller
components until, finally, there are only atomic components left which are
described by matching rules. Matching rules are rules that do not refer to
other rules any more. They consist of string literals or regular expressions
that “capture” the sequences of characters which form the atomic components of
our DSL. Rules in general always consist of a symbol on the left hand side of
a “=”-sign (which in this context can be understood as a definition signifier)
and the definition of the rule on the right hand side.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p>Traditional parser technology for context-free grammars often
distinguishes two phases, <em>scanning</em> and <em>parsing</em>, where a lexical scanner
would take a stream of characters and yield a sequence of tokens and the
actual parser would then operate on the stream of tokens. DHParser,
however, is an instance of a <em>scannerless parser</em> where the functionality
of the lexical scanner is seamlessly integrated into the
parser. This is done by allowing regular expressions in the definiendum of
grammar symbols. The regular expressions do the work of the lexical
scanner.</p>
<p class="last">Theoretically, one could do without scanners or regular expressions.
Because regular languages are a subset of context-free languages, parsers
for context-free languages can do all the work that regular expressions can
do. But it makes things easier - and, in the case of DHParser, also faster
- to have them.</p>
</div>
<p>In our case the text as a whole, conveniently named “document” (any other name
would be allowed, too), consists of a leading whitespace, a possibly empty
sequence of an arbitrary number of words ending only if the end of file
has been reached. Whitespace in DHParser-grammars is always denoted by a tilde
“~”. Thus the definiens of the rule “document” starts with a “~” on the right
hand side of the definition sign (“=”). Next, you find the symbol “WORD”
enclosed in braces. “WORD”, like any symbol composed of letters in DHParser,
refers to another rule further below that defines what words are. The meaning
of the braces is that whatever is enclosed by braces may be repeated zero or
more times. Thus the expression “{ WORD }” describes a sequence of arbitrarily
many repetitions of WORD, whatever WORD may be. Finally, EOF refers to yet
another rule defined further below. We do not yet know what EOF is, but we
know that when the sequence of words ends, it must be followed by an EOF. The
paragraph sign “§” in front of EOF means that it is absolutely mandatory that
the sequence of WORDs is followed by an EOF. If it doesn’t, the program issues
an error message. Without the “§”-sign the parser simply would not match,
which in itself is not considered an error.</p>
<p>Now, let’s look at our two matching rules. Both of these rules contain regular
expressions. If you do not know about regular expressions yet, you should head
over to an explanation or tutorial on regular expressions, like
<a class="reference external" href="https://docs.python.org/3/library/re.html">https://docs.python.org/3/library/re.html</a>, before continuing, because we are
not going to discuss them here. In DHParser-Grammars regular expressions are
enclosed by simple forward slashes “/”. Everything between two forward slashes
is a regular expression as it would be understood by Python’s “re”-module.
Thus the rule <code class="docutils literal notranslate"><span class="pre">WORD</span> <span class="pre">=</span> <span class="pre">/\w+/~</span></code> means that a word consists of a sequence of
letters, numbers or underscores ‘_’ that must be at least one sign long. This
is what the regular expression “\w+” inside the slashes means. In regular
expressions, “\w” stands for word-characters and “+” means that the previous
character can be repeated one or more times. The tilde “~” following the
regular expression, we already know. It means that a word can be followed by
whitespace. Strictly speaking that whitespace is part of “WORD” as it is
defined here.</p>
<p>Similarly, the EOF (for “end of file”) symbol is defined by a rule that
consists of a simple regular expression, namely “.”. The dot in regular
expressions means any character. However, the regular expression itself is
preceded by an exclamation mark “!”. In DHParser-Grammars, the exclamation
mark means “not”. Therefore the whole rule means, that <em>no</em> character must
follow. Since this is true only for the end of file, the parser looking for
EOF will only match if the very end of the file has been reached.</p>
<p>Now, what would be the easiest way to allow our sequence of words to be ended
like a real sentence with a dot “.”?  As always when defining grammars one can
think of different choices to implement this requirement in our grammar. One
possible solution is to add a dot-literal before the “§EOF”-component at the
end of the definition of the “document”-rule. So let’s do that. Change the
line where the “document”-rule is defined to:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>document = ~ { WORD } &quot;.&quot; §EOF
</pre></div>
</div>
<p>As you can see, string-literals are simply denoted as strings between inverted
commas in DHParser’s variant of the EBNF-Grammar. Now, before we can compile
the file “example.dsl”, we will have to regenerate our parser, because we
have changed the grammar. In order to recompile, we simply run the test-script
again:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>$ python tst_poetry_grammar.py
</pre></div>
</div>
<p>But what is that? A whole lot of error messages? Well, this is not surprising:
because we changed the grammar, some of our old test-cases fail with the new
grammar. So we will have to update our test-cases as well. (Actually, the grammar
gets compiled nevertheless and we could just ignore the test failures and
carry on with compiling our “example.dsl”-file again. But, for this time,
we’ll follow good practice and adjust the test cases. So open the test that
failed, “grammar_tests/02_test_document.ini”, in the editor and add full stops
at the end of the “match”-cases and remove the full stop at the end of the
“fail”-case:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">[</span><span class="n">match</span><span class="p">:</span><span class="n">document</span><span class="p">]</span>
<span class="n">M1</span><span class="p">:</span> <span class="s2">&quot;&quot;&quot;This is a sequence of words</span>
<span class="s2">    extending over several lines.&quot;&quot;&quot;</span>
<span class="n">M2</span><span class="p">:</span> <span class="s2">&quot;&quot;&quot;  This sequence contains leading whitespace.&quot;&quot;&quot;</span>

<span class="p">[</span><span class="n">fail</span><span class="p">:</span><span class="n">document</span><span class="p">]</span>
<span class="n">F1</span><span class="p">:</span> <span class="s2">&quot;&quot;&quot;This test should fail, because neither</span>
<span class="s2">    comma nor full have been defined anywhere&quot;&quot;&quot;</span>
</pre></div>
</div>
<p>The format of the test-files should be pretty self-explanatory. It is a simple
ini-file, where the section markers hold the name of the grammar-rule to be
tested which is either preceded by “match” or “fail”. “match” means that the
following examples should be matched by the grammar-rule. “fail” means they
should <em>not</em> match. It is just as important that a parser or grammar-rules
does not match those strings it should not match as it is that it matches
those strings that it should match. The individual test-cases all get a name,
in this case M1, M2, F1, but if you prefer more meaningful names this is also
possible. (Beware, however, that the names for the match-tests must be different
from the names of the fail-tests for the same rule!) Now, run the test-script again
and you’ll see that no errors get reported any more.</p>
<p>Finally, we can recompile our “example.dsl”-file, and by its XML output we can
tell that it worked:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>$ python poetryCompiler.py example.dsl
</pre></div>
</div>
<p>So far, we have seen <em>in nuce</em> how the development workflow for building up a
DSL-grammar goes. Let’s take this a step further by adding more capabilities
to our grammar.</p>
</div>
<div class="section" id="extending-the-example-dsl-further">
<h3>Extending the example DSL further<a class="headerlink" href="#extending-the-example-dsl-further" title="Permalink to this headline"></a></h3>
<p>A grammar that can only digest single sentences is certainly rather boring.
So we’ll extend our grammar a little further so that it can capture paragraphs
of sentences. To see, where we are heading, let’s first start a new example
file, let’s call it “macbeth.dsl” and enter the following lines:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Life’s but a walking shadow, a poor player that struts and frets his hour
upon the stage and then is heard no more. It is a tale told by an idiot,
full of sound and fury, signifying nothing.
</pre></div>
</div>
<p>What have we got, there? We’ve got a paragraph that consists of several
sentences each of which ends with a full stop. The sentences themselves can
consist of different parts which are separated by a comma. If, so far, we have
got a clear idea (in verbal terms) of the structure of texts in our DSL, we
can now try to formulate this in the grammar.</p>
<blockquote>
<div>document = ~ { sentence } §EOF
sentence = part { &quot;,&quot; part } &quot;.&quot;
part     = { WORD }              # a subtle mistake, right here!
WORD     =  /w+/~               # something forgotten, here!
EOF      =  !/./</div></blockquote>
<p>The most important new part is the grammar rule “sentence”. It reads as this:
A sentence is a part of a sentence potentially followed by a repeated sequence
of a comma and another part of a sentence and ultimately ending with a full
stop. (Understandable? If you have ever read Russell’s “Introduction to
Mathematical Philosophy” you will be used to this kind of prose. Other than
that I find the formal definition easier to understand. However, for learning
EBNF or any other formalism, it helps in the beginning to translate the
meaning of its statements into plain old English.)</p>
<p>There are two subtle mistakes in this grammar. If you can figure them out
just by thinking about it, feel free to correct the grammar right now. (Would
you really have noticed the mistakes if they hadn’t already been marked in the
code above?) For all less intelligent people, like me: Let’s be prudent and -
since the grammar has become more complex - add a few test cases. This should
make it easier to locate any errors. So open up an editor with a new file in
the tests subdirectory, say <code class="docutils literal notranslate"><span class="pre">grammar_tests/03_test_sentence.ini</span></code> (Test files
should always contain the component “<a href="#id1"><span class="problematic" id="id2">test_</span></a>” in the filename, otherwise they
will be overlooked by DHParser’s unit testing subsystem) and enter a few
test-cases like these:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">[</span><span class="n">match</span><span class="p">:</span><span class="n">sentence</span><span class="p">]</span>
<span class="n">M1</span><span class="p">:</span> <span class="s2">&quot;&quot;&quot;It is a tale told by an idiot,</span>
<span class="s2">   full of sound and fury, signifying nothing.&quot;&quot;&quot;</span>
<span class="n">M2</span><span class="p">:</span> <span class="s2">&quot;&quot;&quot;Plain old sentence.&quot;&quot;&quot;</span>

<span class="p">[</span><span class="n">fail</span><span class="p">:</span><span class="n">sentence</span><span class="p">]</span>
<span class="n">F1</span><span class="p">:</span> <span class="s2">&quot;&quot;&quot;Ups, a full stop is missing&quot;&quot;&quot;</span>
<span class="n">F2</span><span class="p">:</span> <span class="s2">&quot;&quot;&quot;No commas at the end,.&quot;&quot;&quot;</span>
</pre></div>
</div>
<p>Again, we recompile the grammar and run the test at the same time by running
the testing-script:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>$ python tst_poetry_grammar.py
Errors found by unit test &quot;03_test_sentence.ini&quot;:
Fail test &quot;F2&quot; for parser &quot;sentence&quot; yields match instead of expected failure!
</pre></div>
</div>
<p>Too bad, something went wrong here. But what? Didn’t the definition of the
rule “sentence” make sure that parts of sentences are, if at all, only
followed by a sequence of a comma <em>and</em> another part of a sentence? So, how
come that between the last comma and the full stop there is nothing but empty
space? Ah, there’s the rub! If we look into our grammar, how parts of
sentences have been defined, we find that the rule:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">part</span> <span class="o">=</span> <span class="p">{</span> <span class="n">WORD</span> <span class="p">}</span>
</pre></div>
</div>
<p>defines a part of a sentence as a sequence of <em>zero</em> or more WORDs. This
means that a string of length zero also counts as a valid part of a sentence.
Now in order to avoid this, we could write:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">part</span> <span class="o">=</span> <span class="n">WORD</span> <span class="p">{</span> <span class="n">WORD</span> <span class="p">}</span>
</pre></div>
</div>
<p>This definition makes sure that there is at least one WORD in a part. Since the
case that at least one item is needed occurs rather frequently in grammars,
DHParser offers a special syntax for this case:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">part</span> <span class="o">=</span> <span class="p">{</span> <span class="n">WORD</span> <span class="p">}</span><span class="o">+</span>
</pre></div>
</div>
<p>(The plus sign “+” must always follow directly after the curly brace “}”
without any whitespace in between, otherwise DHParser won’t understand it.)
At this point the worry may arise that the same problem could reoccur at
another level, if the rule for WORD would match empty strings as well. Let’s
quickly add a test case for this to the file
<code class="docutils literal notranslate"><span class="pre">grammar_tests/01_test_word.ini</span></code>:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">[</span><span class="n">fail</span><span class="p">:</span><span class="n">WORD</span><span class="p">]</span>
<span class="n">F1</span><span class="p">:</span> <span class="n">two</span> <span class="n">words</span>
<span class="n">F2</span><span class="p">:</span> <span class="s2">&quot;&quot;</span>
</pre></div>
</div>
<p>Thus, we are sure to be warned in case the definition of rule “WORD” matches
the empty string. Luckily, it does not do so now. But it might happen that we
change this definition later again for some reason; we might have forgotten
about this subtlety and introduce the same error again. With a test case we
can reduce the risk of such a regression error. This time the tests run
through, nicely. So let’s try the parser on our new example:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>$ python poetryCompiler.py macbeth.dsl
macbeth.dsl:1:1: Error: EOF expected; &quot;Life’s but&quot; found!
</pre></div>
</div>
<p>That is strange. Obviously, there is an error right at the beginning (line 1
column 1). But what could possibly be wrong with the word “Life”? Now you might
already have guessed what the error is and that the error is not exactly
located in the first column of the first line.</p>
<p>Unfortunately, DHParser - like almost any other parser out there - is not
always very good at spotting the exact location of an error. Because rules
refer to other rules, a rule may fail to parse - or, what is just as bad,
succeed to parse when it should indeed fail - as a consequence of an error in
the definition of one of the rules it refers to. But this means if the rule
for the whole document fails to match, the actual error can be located
anywhere in the document! There are different approaches to dealing with this
problem. A tool that DHParser offers is to write log-files that document the
parsing history. The log-files allow to spot the location, where the parsing
error occurred. However, you will have to look for the error manually. A good
starting point is usually either the end of the parsing process or the point
where the parser reached the farthest into the text. In order to receive the
parsing history, you need to run the compiler-script again with the debugging
option:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>$ python poetryCompiler.py macbeth.dsl
</pre></div>
</div>
<p>You will receive the same error messages as before. But this time various
kinds of debugging information have been written into a newly created
subdirectory “LOGS”. (Beware that any files in the “LOGS” directory may be
overwritten or deleted by any of the DHParser scripts upon the next run! So
don’t store any important data there.) The most interesting file in the
“LOGS”-directory is the full parser log. We’ll ignore the other files and just
open the file “macbeth_full_parser.log.html” in an internet-browser. As the
parsing history tends to become quite long, this usually takes a while, but
luckily not in the case of our short demo example:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>$ firefox LOGS/macbeth_full_parser.log.html &amp;
</pre></div>
</div>
<img alt="_images/parsing_history.png" src="_images/parsing_history.png" />
<p>What you see is a representation of the parsing history. It might look a bit
tedious in the beginning, especially the column that contains the parser
call sequence. But it is all very straight forward: For every application of a
match rule, there is a row in the table. Typically, match rules are applied at
the end of a long sequence of parser calls that is displayed in the third
column. You will recognise the parsers that represent rules by their names,
e.g. “document”, “sentence” etc. Those parsers that merely represent
constructs of the EBNF grammar within a rule do not have a name and are
represented by their type, which always begins with a colon, like
“:ZeroOrMore”. Finally, the regular expression or literal parsers are
represented by the regular expression pattern or the string literal
themselves. (Arguably, it can be confusing that parsers are represented in
three different ways in the parser call sequence. I am still figuring out a
better way to display the parser call sequence. Any suggestions welcome!) The
first two columns display the position in the text in terms of lines and
columns. The second to last column, labeled “success”, shows whether the last
parser in the sequence matched or failed or produced an error. In case of an
error, the error message is displayed in the third column as well. In case the
parser matched, the last column displays exactly that section of the text that
the parser did match. If the parser did not match, the last column displays
the text that still lies ahead and has not yet been parsed.</p>
<p>In our concrete example, we can see that the parser “WORD” matches “Life”, but
not “Life’s” or “’s”. And this ultimately leads to the failure of the parsing
process as a whole. The simplest solution would be to add the apostrophe to
the list of allowed characters in a word by changing the respective line in
the grammar definition to <code class="docutils literal notranslate"><span class="pre">WORD</span> <span class="pre">=</span> <span class="pre">/[\w’]+/</span></code>. Now, before we even change the
grammar we first add another test case to capture this kind of error. Since we
have decided that “Life’s” should be parsed as a single word, let’s open the
file “grammar_tests/01_test_word.ini” and add the following test:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>[match:WORD]
M3: Life’s
</pre></div>
</div>
<p>To be sure that the new test captures the error we have found you might want
to run the script “tst_poetry_grammar.py” and verify that it reports the
failure of test “M3” in the suite “01_test_word.ini”. After that, change the
regular expression for the symbol WORD in the grammar file “poetry.ebnf” as
just described. Now both the tests and the compilation of the file
“macbeth.dsl” should run through smoothly.</p>
<div class="admonition caution">
<p class="first admonition-title">Caution</p>
<p>Depending on the purpose of your DSL, the simple solution of
allowing apostrophes within words, might not be what you want. After all
“Life’s” is but a shorthand for the two word phrase “Life is”. Now,
whatever alternative solution now comes to your mind, be aware that there
are also cases like Irish names, say “O’Dolan” where the apostrophe is
actually a part of a word and cases like “don’t” which, if expanded, would
be two words <em>not</em> separated at the position of the apostrophe.</p>
<p class="last">We leave that as an exercise, first to figure out, what different cases for
the use of apostrophes in the middle of a word exist. Secondly, to make a
reasonable decision which of these should be treated as a single and which
as separate words and, finally, if possible, to write a grammar that
provides for these cases. These steps are quite typical for the kind of
challenges that occur during the design of a DSL for a
Digital-Humanities-Project.</p>
</div>
</div>
<div class="section" id="controlling-abstract-syntax-tree-generation">
<h3>Controlling abstract-syntax-tree generation<a class="headerlink" href="#controlling-abstract-syntax-tree-generation" title="Permalink to this headline"></a></h3>
<p>Compiling the example “macbeth.dsl” with the command <code class="docutils literal notranslate"><span class="pre">python</span> <span class="pre">poetryCompiler.py</span>
<span class="pre">macbeth.dsl</span></code>, you might find yourself not being able to avoid the impression
that the output is rather verbose. Just looking at the beginning of the
output, we find:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>&lt;document&gt;
    &lt;:ZeroOrMore&gt;
        &lt;sentence&gt;
            &lt;part&gt;
                &lt;WORD&gt;
                    &lt;:RegExp&gt;Life’s&lt;/:RegExp&gt;
                    &lt;:Whitespace&gt; &lt;/:Whitespace&gt;
                &lt;/WORD&gt;
                &lt;WORD&gt;
                    &lt;:RegExp&gt;but&lt;/:RegExp&gt;
                    &lt;:Whitespace&gt; &lt;/:Whitespace&gt;
                &lt;/WORD&gt;
...
</pre></div>
</div>
<p>But why do we need to know all those details? Why would we need a
“:ZeroOrMore” element inside the “&lt;document&gt;” element, if the
“&lt;sentence&gt;”-elements could just as well be direct descendants of the
“&lt;document&gt;”-element? Why do we need the information that “Life’s” has been
captured by a regular expression parser? Wouldn’t it suffice to know that the
word captured is “Life’s”? And is the whitespace really needed at all? If the
words in a sequence are separated by definition by whitespace, then it would
suffice to have the word without whitespace in our tree, and to add whitespace
only later when transforming the tree into some kind of output format. (On the
other hand, it might be convenient to have it in the tree never the less…)</p>
<p>Well, the answer to most of these questions is that what our compilation
script yields is more or less the output that the parser yields which in turn
is the <em>concrete syntax tree</em> of the parsed text. Being a concrete syntax tree
it is by its very nature very verbose, because it captures every minute
syntactic detail described in the grammar and found in the text, no matter how
irrelevant it is, if we are primarily interested in the structure of our text.
In order for our tree to become more handy we have to transform it into an
<em>abstract syntax tree</em> first, which is called thus because it abstracts from
all details that deem us irrelevant. Now, which details we consider as
irrelevant is almost entirely up to ourselves. And we should think carefully
about what features must be included in the abstract syntax tree, because the
abstract syntax tree more or less reflects the data model (or is at most one
step away from it) with which want to capture our material.</p>
<p>For the sake of our example, let’s assume that we are not interested in
whitespace and that we want to get rid of all uninformative nodes, i.e. nodes
that merely demarcate syntactic structures but not semantic entities.</p>
<p>DHParser supports the transformation of the concrete syntax tree (CST) into the
abstract syntax tree (AST) with a simple technology that (in theory) allows to
specify the necessary transformations in an almost declarative fashion: You
simply fill in a Python-dictionary of tag-names with transformation <em>operators</em>.
Technically, these operators are simply Python-functions. DHParser comes with a
rich set of predefined operators. Should these not suffice, you
can easily write your own. How does this look like?</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">poetry_AST_transformation_table</span> <span class="o">=</span> <span class="p">{</span>
    <span class="s2">&quot;+&quot;</span><span class="p">:</span> <span class="n">remove_empty</span><span class="p">,</span>
    <span class="s2">&quot;document&quot;</span><span class="p">:</span> <span class="p">[],</span>
    <span class="s2">&quot;sentence&quot;</span><span class="p">:</span> <span class="p">[],</span>
    <span class="s2">&quot;part&quot;</span><span class="p">:</span> <span class="p">[],</span>
    <span class="s2">&quot;WORD&quot;</span><span class="p">:</span> <span class="p">[],</span>
    <span class="s2">&quot;EOF&quot;</span><span class="p">:</span> <span class="p">[],</span>
    <span class="s2">&quot;:Token, :RE&quot;</span><span class="p">:</span> <span class="n">reduce_single_child</span><span class="p">,</span>
    <span class="s2">&quot;*&quot;</span><span class="p">:</span> <span class="n">replace_by_single_child</span>
<span class="p">}</span>
</pre></div>
</div>
<p>You’ll find this table in the script <code class="docutils literal notranslate"><span class="pre">poetryCompiler.py</span></code>, which is also the
place where you edit the table, because then it is automatically used when
compiling your DSL-sources. Now, AST-Transformation works as follows: The whole
tree is scanned, starting at the deepest level and applying the specified
operators and then working its way upward. This means that the operators
specified for “WORD”-nodes will be applied before the operators of “part”-nodes
and “sentence”-nodes. This has the advantage that when a particular node is
reached the transformations for its descendant nodes have already been applied.</p>
<p>As you can see, the transformation-table contains an entry for every known
parser, i.e. “document”, “sentence”, “part”, “WORD”, “EOF”. (If any of these are
missing in the table of your <code class="docutils literal notranslate"><span class="pre">poetryCompiler.py</span></code>, add them now!) In the
template you’ll also find transformations for two anonymous parsers, i.e.
“:Token” and “:RE” as well as some curious entries such as “*” and “+”. The
latter are considered to be “jokers”. The transformations related to the
“+”-sign will be applied on any node, before any other transformation is
applied. In this case, all empty nodes will be removed first (transformation:
<code class="docutils literal notranslate"><span class="pre">remove_empty</span></code>). The “*”-joker contains a list of transformations that will be
applied to all those tags that have not been entered explicitly into the
transformation table. For example, if the transformation reaches a node with the
tag-name “:ZeroOrMore” (i.e. an anonymous node that has been generated by the
parser “:ZeroOrmore”), the “*”-joker-operators will be applied. In this
case it is just one transformation, namely, <code class="docutils literal notranslate"><span class="pre">replace_by_single_child</span></code> which
replaces a node that has but one child by its child. In contrast, the
transformation <code class="docutils literal notranslate"><span class="pre">reduce_single_child</span></code> eliminates a single child node by
attaching the child’s children or content directly to the parent node. We’ll see
what this means and how this works, briefly.</p>
<div class="admonition caution">
<p class="first admonition-title">Caution</p>
<p>Once the compiler-script “xxxxCompiler.py” has been generated, the
<em>only</em> part that is changed after editing and extending the grammar is the
parser-part of this script (i.e. the class derived from class Grammar),
because this part is completely auto-generated and can therefore be
overwritten safely. The other parts of that script, including the
AST-transformation-dictionary, is never changed once it has been generated,
because it needs to be filled in by hand by the designer of the DSL and the
hand-made changes should not be overwritten. There it is left as it is when
regenerating the parser. However, this means, if you add symbols to your
grammar later, you will not find them as keys in the
AST-transformation-table, but you’ll have to add them yourself.</p>
<p class="last">The comments in the compiler-script clearly indicate which parts can be
edited by hand safely, i.e. without running the risk of being overwritten, and
which cannot.</p>
</div>
<p>We can either specify no operator (empty list), a single operator or a list of
operators for transforming a node. There is a difference between specifying an
empty list for a particular tag-name or leaving out a tag-name completely. In the
latter case the “*”-joker is applied, in place of the missing list of operators.
In the former case only the “+”-joker is applied. If a list of operators is
specified, these operators will be applied in sequence one after the other. We
also call the list of operators or the single operator if there is only one the
<em>transformation</em> for a particular tag (or parser name or parser type for that
matter).</p>
<p>Because the AST-transformation works through the tree from the inside to the
outside, it is reasonable to do the same when designing the AST-transformations,
to proceed in the same order. The innermost nodes that concern us are the nodes
captured by the &lt;WORD&gt;-parser, or simply, &lt;WORD&gt;-nodes. As we can see, these
nodes usually contain a &lt;:RegExp&gt;-node and a &lt;:Whitespace&gt;-node. As the “WORD”
parser is defined as a simple regular expression followed by optional
whitespace in our grammar, we know that this must always be the case, although
the whitespace may occasionally be empty. Thus, we can eliminate the
uninformative child nodes by removing whitespace first and then reducing the
single left over child node. The respective line in the AST-transformation-table
in the compiler-script should be changed as follows:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="s2">&quot;WORD&quot;</span><span class="p">:</span> <span class="p">[</span><span class="n">remove_whitespace</span><span class="p">,</span> <span class="n">reduce_single_child</span><span class="p">],</span>
</pre></div>
</div>
<p>Running the “poetryCompiler.py”-script on “macbeth.dsl” again, yields:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>&lt;document&gt;
  &lt;:ZeroOrMore&gt;
    &lt;sentence&gt;
      &lt;part&gt;
        &lt;WORD&gt;Life’s&lt;/WORD&gt;
        &lt;WORD&gt;but&lt;/WORD&gt;
        &lt;WORD&gt;a&lt;/WORD&gt;
        &lt;WORD&gt;walking&lt;/WORD&gt;
        &lt;WORD&gt;shadow&lt;/WORD&gt;
      &lt;/part&gt;
      &lt;:Series&gt;
        &lt;:Token&gt;
          &lt;:PlainText&gt;,&lt;/:PlainText&gt;
          &lt;:Whitespace&gt; &lt;/:Whitespace&gt;
        &lt;/:Token&gt;
        &lt;part&gt;
          &lt;WORD&gt;a&lt;/WORD&gt;
...
</pre></div>
</div>
<p>It starts to become more readable and concise, but there are still some oddities.
Firstly, the Tokens that delimit parts of sentences still contain whitespace.
Secondly, if several &lt;part&gt;-nodes follow each other in a &lt;sentence&gt;-node, the
&lt;part&gt;-nodes after the first one are enclosed by a &lt;:Series&gt;-node or even a
cascade of &lt;:ZeroOrMore&gt; and &lt;:Series&gt;-nodes. As for the &lt;:Token&gt;-nodes, we
can do the same trick as with the WORD-nodes:
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="s2">&quot;:Token&quot;</span><span class="p">:</span> <span class="p">[</span><span class="n">remove_whitespace</span><span class="p">,</span> <span class="n">reduce_single_child</span><span class="p">],</span>
<span class="s2">&quot;:RE&quot;</span><span class="p">:</span> <span class="n">reduce_single_child</span><span class="p">,</span>
</pre></div>
</div>
<p>As to the nested structure of the &lt;part&gt;-nodes within the &lt;sentence&gt;-node, this
is a rather typical case of syntactic artefacts that can be found in concrete
syntax trees. It is obviously a consequence of the grammar definition:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">sentence</span> <span class="o">=</span> <span class="n">part</span> <span class="p">{</span><span class="s2">&quot;,&quot;</span> <span class="n">part</span> <span class="p">}</span> <span class="s2">&quot;.&quot;</span>
</pre></div>
</div>
<p>We’d of course prefer to have flat structure of parts and punctuation marks
following each other within the sentence. Since this is a standard case,
DHParser includes a special operator to “flatten” nested structures of this
kind:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="s2">&quot;sentence&quot;</span> <span class="o">=</span> <span class="p">[</span><span class="n">flatten</span><span class="p">],</span>
</pre></div>
</div>
<p>The <code class="docutils literal notranslate"><span class="pre">flatten</span></code> operator recursively eliminates all intermediary anonymous child
nodes. We do not need to do anything in particular for transforming the
&lt;part&gt;-node, except that we should explicitly assign an empty operator-list to
it, because we do not want the “*” to step in. The reason is that a &lt;part&gt; with
a single &lt;WORD&gt; should still be visible as a part and not replaced by the
&lt;WORD&gt;-node, because we would like our data model to have as regular a form as
possible. (This does of course imply a decision that we have taken on the form
of our data model, which would lead too far to discuss here. Suffice it to say
that depending on the occasion and purpose, such decisions can also be taken
otherwise.)</p>
<p>The only kind of nodes left are the &lt;document&gt;-nodes. In the output of the
compiler-script (see above), the &lt;document&gt;-node had a single child node
“:ZeroOrMore”. Since this child node does not have any particular semantic
meaning it would reasonable to eliminate it and attach its children directly to
“document”. We could do so by entering <code class="docutils literal notranslate"><span class="pre">reduce_single_child</span></code> in the list of
transformations for “document”-nodes. However, when designing the
AST-transformations, it is important not only to consider the concrete output
that a particular text yields, but all possible outputs. Therefore, before
specifying a transformation, we should also take a careful look at the grammar
again, where “document” is defined as follows:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>document = ~ { sentence } §EOF
</pre></div>
</div>
<p>As we can see a “document”-node may also contain whitespace and an EOF-marker.
The reason why we don’t find these in the output is that empty nodes have been
eliminated by the <code class="docutils literal notranslate"><span class="pre">remove_empty</span></code>-transformation specified in the “+”-joker,
before. While EOF is always empty (little exercise: explain why!), there
could be “:Whitespace”-nodes next to the zero or more sentences in the document
node, in which case the “reduce_single_child”-operator would do nothing, because
there is more than a single child. (We could of course also use the
“flatten”-operator, instead. Try this as an exercise.) Test cases help to
capture those different scenarios, so adding test cases and examining the output
in the test report help to get a grip on this, if just looking at the grammar
strains your imagination too much.</p>
<p>Since we have decided, that we do not want to include whitespace in our data
model, we can simply eliminate any whitespace before we apply the
<code class="docutils literal notranslate"><span class="pre">reduce_single_child</span></code>-operator, so we change the “document”-entry in the
AST-transformation-table as follows:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="s2">&quot;document&quot;</span><span class="p">:</span> <span class="p">[</span><span class="n">remove_whitespace</span><span class="p">,</span> <span class="n">reduce_single_child</span><span class="p">],</span>
</pre></div>
</div>
<p>Now that everything is set, let’s have a look at the result:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>&lt;document&gt;
  &lt;sentence&gt;
    &lt;part&gt;
      &lt;WORD&gt;Life’s&lt;/WORD&gt;
      &lt;WORD&gt;but&lt;/WORD&gt;
      &lt;WORD&gt;a&lt;/WORD&gt;
      &lt;WORD&gt;walking&lt;/WORD&gt;
      &lt;WORD&gt;shadow&lt;/WORD&gt;
    &lt;/part&gt;
    &lt;:Token&gt;,&lt;/:Token&gt;
    &lt;part&gt;
      &lt;WORD&gt;a&lt;/WORD&gt;
      &lt;WORD&gt;poor&lt;/WORD&gt;
      &lt;WORD&gt;player&lt;/WORD&gt;
...
</pre></div>
</div>
<p>That is much better. There is but one slight blemish in the output: While all
nodes left are named nodes, i.e. nodes associated with a named parser, there are a
few anonymous &lt;:Token&gt; nodes. Here is a little exercise: Do away with those
&lt;:Token&gt;-nodes by replacing them by something semantically more meaningful.
Hint: Add a new symbol “delimiter” in the grammar definition “poetry.ebnf”. An
alternative strategy to extending the grammar would be to use the
<code class="docutils literal notranslate"><span class="pre">replace_parser</span></code> operator. Which of the strategy is the better one? Explain
why.</p>
eckhart's avatar
eckhart committed
949
950
951
952
953
954
</div>
</div>
</div>


           </div>
eckhart's avatar
eckhart committed
955
           
eckhart's avatar
eckhart committed
956
957
958
          </div>
          <footer>
  
eckhart's avatar
eckhart committed
959
960
961
962
963
964
965
966
967
    <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
      
        <a href="UserGuide.html" class="btn btn-neutral float-right" title="DHParser User’s Guide" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
      
      
        <a href="index.html" class="btn btn-neutral" title="Welcome to DHParser’s documentation!" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
      
    </div>
  
eckhart's avatar
eckhart committed
968
969
970
971
972
973
974
975
976

  <hr/>

  <div role="contentinfo">
    <p>
        &copy; Copyright 2018, Eckhart Arnold.

    </p>
  </div>
eckhart's avatar
eckhart committed
977
  Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>. 
eckhart's avatar
eckhart committed
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995

</footer>

        </div>
      </div>

    </section>

  </div>
  


  

    <script type="text/javascript">
        var DOCUMENTATION_OPTIONS = {
            URL_ROOT:'./',
            VERSION:'0.8',
eckhart's avatar
eckhart committed
996
            LANGUAGE:'None',
eckhart's avatar
eckhart committed
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
            COLLAPSE_INDEX:false,
            FILE_SUFFIX:'.html',
            HAS_SOURCE:  true,
            SOURCELINK_SUFFIX: '.txt'
        };
    </script>
      <script type="text/javascript" src="_static/jquery.js"></script>
      <script type="text/javascript" src="_static/underscore.js"></script>
      <script type="text/javascript" src="_static/doctools.js"></script>
      <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>

  

  
  
    <script type="text/javascript" src="_static/js/theme.js"></script>
  

  <script type="text/javascript">
      jQuery(function () {
eckhart's avatar
eckhart committed
1017
1018
1019
          
          SphinxRtdTheme.Navigation.enableSticky();
          
eckhart's avatar
eckhart committed
1020
      });
eckhart's avatar
eckhart committed
1021
  </script> 
eckhart's avatar
eckhart committed
1022
1023
1024

</body>
</html>