toolkit.py 10.5 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
"""toolkit.py - utility functions for DHParser

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences and Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.


Module ``toolkit`` contains utility functions and cross-sectional
functionality like logging support that is needed across several 
of the other DHParser-Modules.

For logging functionality, the global variable LOGGING is defined which
contains the name of a directory where log files shall be placed. By
setting its value to the empty string "" logging can be turned off.

To read the directory name function ``LOGS_DIR()`` should be called
rather than reading the variable LOGGING. ``LOGS_DIR()`` makes sure
the directory exists and raises an error if a file with the same name
already exists.
"""

33
import codecs
34
import hashlib
35
36
import io
import parser
37

di68kap's avatar
di68kap committed
38
39
40
41
try:
    import regex as re
except ImportError:
    import re
42
import sys
43

44
try:
45
    import typing
46
except ImportError:
47
    import DHParser.foreign_typing as typing
di68kap's avatar
di68kap committed
48
    sys.modules['typing'] = typing  # make it possible to import from typing
49

50
from typing import Any, Iterable, Sequence, Set, Union, Dict, cast
51

52
# Public names of this module, i.e. what ``from ... import *`` exports.
# Every public helper defined below is listed, including `has_fenced_code`.
__all__ = ('escape_re',
           'escape_control_characters',
           'is_filename',
           'lstrip_docstring',
           'load_if_file',
           'is_python_code',
           'has_fenced_code',
           'md5',
           'expand_table',
           'compile_python_object',
           'smart_list',
           'sane_parser_name')
63
64
65
66


#######################################################################
#
67
# miscellaneous (generic)
68
69
#
#######################################################################
70
71


72
def escape_re(strg: str) -> str:
    """
    Returns a copy of ``strg`` in which every character that has a special
    meaning in regular expressions is preceded by a backslash.
    """
    # Each special character (the backslash included) maps to its escaped
    # form; all other characters pass through unchanged.
    escape_table = str.maketrans(
        {special: '\\' + special for special in r"\.^$*+?{}[]()#<>=|!"})
    return strg.translate(escape_table)
81
82


83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
def escape_control_characters(strg: str) -> str:
    """Returns ``strg`` with all control characters (e.g. newline or tab)
    replaced by their backslashed representation.
    """
    # repr() yields the quoted, escaped form of the string; drop the
    # surrounding quotes first ...
    unquoted = repr(strg)[1:-1]
    # ... then undo the doubling of backslashes that repr() applied.
    return unquoted.replace('\\\\', '\\')


def lstrip_docstring(docstring: str) -> str:
    """
    Removes the common leading indentation from all lines of a docstring
    except the first one. Tabs are expanded to four spaces beforehand.
    """
    first, *rest = docstring.replace('\t', '    ').split('\n')
    # Indentation widths of all non-blank continuation lines.
    widths = [len(ln) - len(ln.lstrip()) for ln in rest if ln.lstrip()]
    common = min(widths, default=255)
    if common >= 255:
        # Either there was no non-blank continuation line or the
        # indentation reaches the cap of 255: strip nothing.
        common = 0
    return '\n'.join([first] + [ln[common:] for ln in rest])


104
def is_filename(strg: str) -> bool:
    """Tries to guess whether string ``strg`` is a file name.

    A string is taken for a file name if it consists of a single line,
    neither starts nor ends with a blank, and contains none of the
    characters ``* ? " < > |``.
    """
    if '\n' in strg:
        return False
    if strg.startswith(' ') or strg.endswith(' '):
        return False
    return not any(forbidden in strg for forbidden in '*?"<>|')
Eckhart Arnold's avatar
Eckhart Arnold committed
109
110


111
112
113
114
115
116
117
#######################################################################
#
# loading and compiling
#
#######################################################################


118
119
120
def load_if_file(text_or_file) -> str:
    """Returns the content of the text file named by `text_or_file` if
    that parameter looks like a file name (i.e. a single-line string);
    otherwise (i.e. for a multi-line string) `text_or_file` itself is
    returned.

    Raises a FileNotFoundError if `text_or_file` looks like a file path
    (only word characters, slashes, backslashes, colons, dots and
    blanks), but no such file exists. Any other single-line string
    that does not name an existing file is returned unchanged.
    """
    if not is_filename(text_or_file):
        return text_or_file
    try:
        with open(text_or_file, encoding="utf-8") as source_file:
            file_content = source_file.read()
        return file_content
    except FileNotFoundError:
        looks_like_path = re.fullmatch(r'[\w/:. \\]+', text_or_file)
        if not looks_like_path:
            # Most likely single-line source text rather than a file name.
            return text_or_file
        raise FileNotFoundError('Not a valid file: ' + text_or_file + '!\n(Add "\\n" '
                                'to distinguish source data from a file name.)')


139
def is_python_code(text_or_file: str) -> bool:
    """Checks whether 'text_or_file' is python code or the name of a file that
    contains python code.

    If `text_or_file` looks like a file name, only its extension is
    examined (``.py``, case-insensitive); the file is not opened.
    Otherwise the text counts as python code if it can be parsed as a
    python module without errors.
    """
    if is_filename(text_or_file):
        return text_or_file[-3:].lower() == '.py'
    # `ast.parse` is used for the syntax check because the `parser` module
    # was deprecated in Python 3.9 and removed in Python 3.10.
    import ast
    try:
        ast.parse(text_or_file)
        return True
    except (SyntaxError, ValueError, OverflowError):
        return False


154
def has_fenced_code(text_or_file: str, info_strings=('ebnf', 'test')) -> bool:
    """Checks whether `text_or_file` contains fenced code blocks, which are
    marked by one of the given info strings.
    See http://spec.commonmark.org/0.28/#fenced-code-blocks for more
    information on fenced code blocks in common mark documents.

    `text_or_file` is either markdown text or the name of a UTF-8
    encoded markdown file. `info_strings` is a single info string or a
    sequence of info strings; they are inserted into a regular
    expression as they are and matched case-insensitively.
    """
    if is_filename(text_or_file):
        with open(text_or_file, 'r', encoding='utf-8') as f:
            markdown = f.read()
    else:
        markdown = text_or_file

    # Cheap pre-check: without any fence at the beginning of a line there
    # cannot be a fenced code block.
    if markdown.find('\n~~~') < 0 and markdown.find('\n```') < 0:
        return False

    if isinstance(info_strings, str):
        info_strings = (info_strings,)
    # Raw strings avoid the invalid string escape "\-". In a backtick fence
    # the rest of the info line must not contain backticks; in a tilde
    # fence any characters up to the end of the line are allowed.
    fence_tmpl = r'\n(?:(?:``[`]*[ ]*(?:%s)(?=[ .\-:\n])[^`\n]*\n)' + \
                 r'|(?:~~[~]*[ ]*(?:%s)(?=[ .\-:\n])[^\n]*\n))'
    label_re = '|'.join('(?:%s)' % label for label in info_strings)
    rx_fence = re.compile(fence_tmpl % (label_re, label_re), flags=re.IGNORECASE)

    # Only the first matching opening fence is examined: it counts as a
    # fenced code block if the same fence string occurs again further down.
    opening = rx_fence.search(markdown)
    if opening is None:
        return False
    fence = re.match('(?:\n`+)|(?:\n~+)', opening.group(0)).group(0)
    return markdown.find(fence, opening.end()) >= 0
183
184


185
186
187
188
189
190
191
192
193
194
def md5(*txt):
    """Returns the md5-checksum of the concatenation of all strings
    passed in `txt`. This can be used to test whether some piece of
    text, for example a grammar source file, has changed.
    """
    payload = b''.join(part.encode('utf8') for part in txt)
    return hashlib.md5(payload).hexdigest()


195
196
197
198
199
200
201
202
203
204
205
def compile_python_object(python_src, catch_obj_regex=""):
    """Compiles the python source code and returns the (first) object
    the name of which is matched by ``catch_obj_regex``. If
    ``catch_obj_regex`` is the empty string (or an otherwise empty
    pattern), the namespace dictionary will be returned.

    ``catch_obj_regex`` can be a string or a compiled regular
    expression. It is matched against the beginning of each name in
    the namespace.

    Raises a ValueError if no name or more than one name matches.

    NOTE: The source code is executed with ``exec``. Never pass source
    code from untrusted origins to this function!
    """
    if isinstance(catch_obj_regex, str):
        catch_obj_regex = re.compile(catch_obj_regex)
    code = compile(python_src, '<string>', 'exec')
    namespace = {}
    exec(code, namespace)  # safety risk: runs arbitrary code, see docstring
    # A compiled pattern object is always truthy, so the emptiness check
    # must look at the pattern's source text.
    if not catch_obj_regex or not catch_obj_regex.pattern:
        return namespace
    matches = [key for key in namespace if catch_obj_regex.match(key)]
    if len(matches) < 1:
        raise ValueError("No object matching /%s/ defined in source code." %
                         catch_obj_regex.pattern)
    elif len(matches) > 1:
        raise ValueError("Ambiguous matches for %s : %s" %
                         (str(catch_obj_regex), str(matches)))
    return namespace[matches[0]]


#######################################################################
#
# smart lists and multi-keyword tables
#
#######################################################################


Eckhart Arnold's avatar
Eckhart Arnold committed
225
# def smart_list(arg: Union[str, Iterable[T]]) -> Union[Sequence[str], Sequence[T]]:
226
def smart_list(arg: Union[str, Iterable, Any]) -> Union[Sequence, Set]:
    """Returns the argument as list, depending on its type and content.

    A string is split into a list of stripped items. The first of the
    delimiters ';' and ',' (in this order) that occurs in the string is
    used; if neither occurs, the string is split at single blanks, e.g.

    >>> smart_list('1; 2, 3; 4')
    ['1', '2, 3', '4']
    >>> smart_list('2, 3')
    ['2', '3']
    >>> smart_list('a b cd')
    ['a', 'b', 'cd']

    A sequence or set other than a string is returned as is, e.g.

    >>> smart_list((1, 2, 3))
    (1, 2, 3)
    >>> smart_list({1, 2, 3})
    {1, 2, 3}

    Any other iterable is converted into a list, e.g.

    >>> smart_list(i for i in {1,2,3})
    [1, 2, 3]

    Everything else is wrapped in a one-element list, e.g.

    >>> smart_list(125)
    [125]
    """
    if not isinstance(arg, str):
        if isinstance(arg, Sequence) or isinstance(arg, Set):
            return arg
        return list(arg) if isinstance(arg, Iterable) else [arg]

    if ';' in arg:
        items = arg.split(';')
    elif ',' in arg:
        items = arg.split(',')
    else:
        items = arg.strip().split(' ')
    return [item.strip() for item in items]


270
def expand_table(compact_table: Dict) -> Dict:
    """Expands a table by separating keywords that are tuples or strings
    containing comma separated words into single keyword entries with
    the same values. Returns the expanded table.
    Example:

    >>> expand_table({"a, b": 1, ('d','e','f'):5, "c":3})
    {'a': 1, 'b': 1, 'd': 5, 'e': 5, 'f': 5, 'c': 3}

    Raises a KeyError if the same single keyword results from more
    than one entry of the compact table.
    """
    expanded_table = {}
    for key, value in compact_table.items():
        for k in smart_list(key):
            if k in expanded_table:
                # Format the offending single keyword via str(): applying
                # "%" directly to a tuple key would raise a TypeError
                # instead of the intended KeyError.
                raise KeyError('Key "%s" used more than once in compact table!' % str(k))
            expanded_table[k] = value
    return expanded_table
287
288


289
290
#######################################################################
#
291
# miscellaneous (DHParser-specific)
292
293
294
295
#
#######################################################################


296
def sane_parser_name(name) -> bool:
    """Checks whether given name is an acceptable parser name. Parser names
    must not be preceded or succeeded by a double underscore '__'!

    Empty names (including None) are not acceptable either. The result
    is always a proper bool.
    """
    # bool() ensures that False is returned for an empty name rather
    # than the empty (falsy) name itself.
    return bool(name) and name[:2] != '__' and name[-2:] != '__'


303
304
305
306
307
308
309
#######################################################################
#
# initialization
#
#######################################################################


310
311
# Executed once at import time: switch `sys.stdout` to UTF-8 output if it
# currently uses a different encoding.
try:
    if sys.stdout.encoding.upper() != "UTF-8":
        # make sure that `print()` does not raise an error on
        # non-ASCII characters:
        # The stream underlying sys.stdout is detached and wrapped in a
        # UTF-8 stream writer. The `cast` calls have no effect at runtime;
        # they only serve static type checking.
        sys.stdout = cast(io.TextIOWrapper, codecs.getwriter("utf-8")(cast(
            io.BytesIO, cast(io.TextIOWrapper, sys.stdout).detach())))
except AttributeError:
    # somebody has already taken care of this !?
    # (An AttributeError is raised if sys.stdout lacks one of the
    # attributes used above, e.g. because it has been replaced by a
    # different kind of object.)
    pass