toolkit.py 11 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# toolkit.py - utility functions for DHParser
#
# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
#                 Bavarian Academy of Sciences and Humanities (badw.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.  See the License for the specific language governing
# permissions and limitations under the License.
17
18


19
20
21
22
23
24
"""
Module ``toolkit`` contains utility functions that are needed across
several of the other DHParser modules or that are just very generic
so that they are best defined in a toolkit-module.
"""

25
import codecs
26
import hashlib
27
import io
28
import multiprocessing
29
import parser
30
import threading
31

di68kap's avatar
di68kap committed
32
33
34
35
try:
    import regex as re
except ImportError:
    import re
36
import sys
37

38
try:
39
    import typing
40
except ImportError:
41
    import DHParser.foreign_typing as typing
di68kap's avatar
di68kap committed
42
    sys.modules['typing'] = typing  # make it possible to import from typing
43

44
from typing import Any, Iterable, Sequence, Set, Union, Dict, cast
45

46

47
# Names exported by ``from toolkit import *``. This stays a tuple so that
# code which concatenates ``__all__`` tuples keeps working.
__all__ = (
    'escape_re',
    'escape_control_characters',
    'is_filename',
    'concurrent_ident',
    'lstrip_docstring',
    'issubtype',
    'isgenerictype',
    'load_if_file',
    'is_python_code',
    'md5',
    'expand_table',
    'compile_python_object',
    'smart_list',
    'sane_parser_name',
)
61
62
63
64


#######################################################################
#
65
# miscellaneous (generic)
66
67
#
#######################################################################
68
69


70
def escape_re(strg: str) -> str:
    """
    Returns a copy of ``strg`` in which every character that has a special
    meaning in regular expressions is preceded by a backslash.
    """
    special_chars = r"\.^$*+?{}[]()#<>=|!"
    # One pass over the input is enough: every character is inspected exactly
    # once, so the backslashes inserted here are never escaped a second time.
    return ''.join('\\' + ch if ch in special_chars else ch for ch in strg)
80
81


82
def escape_control_characters(strg: str) -> str:
    """
    Returns ``strg`` with control characters (e.g. newline or tab) written
    in their backslashed form (``\\n``, ``\\t``), as produced by ``repr()``.
    """
    quoted = repr(strg)
    # drop the quotation marks that repr() puts around the string
    unquoted = quoted[1:-1]
    # repr() doubles every literal backslash; turn each pair back into one
    return unquoted.replace('\\\\', '\\')


def lstrip_docstring(docstring: str) -> str:
    """
    Removes the common leading indentation from all lines of a docstring
    except the first one. Tabs are expanded to four spaces beforehand.
    Lines that are empty or contain only whitespace are ignored when the
    common indentation is determined.
    """
    first, *rest = docstring.replace('\t', '    ').split('\n')
    indents = [len(line) - len(line.lstrip()) for line in rest if line.lstrip()]
    indent = min(indents, default=0)
    if indent >= 255:
        # an indentation of 255 or more columns is treated as "no indentation"
        indent = 0
    return '\n'.join([first] + [line[indent:] for line in rest])


106
def is_filename(strg: str) -> bool:
    """
    Guesses whether the string ``strg`` could be a file name: it must
    consist of a single line, must neither start nor end with a blank and
    must not contain any of the characters ``* ? " < > |``.
    """
    if '\n' in strg:
        return False
    if strg[:1] == " " or strg[-1:] == " ":
        return False
    return not any(ch in strg for ch in '*?"<>|')
Eckhart Arnold's avatar
Eckhart Arnold committed
114
115


116
117
118
119
120
121
122
def concurrent_ident() -> str:
    """
    Returns an identifier for the current process and thread, composed of
    the name of the current process, an underscore and the identifier of
    the current thread.
    """
    process_name = multiprocessing.current_process().name
    thread_id = threading.get_ident()
    return '{}_{}'.format(process_name, thread_id)


123
124
125
126
127
128
129
130
#######################################################################
#
# type system support
#
#######################################################################


def issubtype(sub_type, base_type):
    """
    Returns True if ``sub_type`` is a subclass of ``base_type``. Types that
    carry an ``__origin__`` attribute which is not None (e.g. parameterized
    generic types) are represented by that attribute in the comparison.
    """
    def origin(t):
        # fall back to the type itself if there is no (or an empty) origin
        ot = getattr(t, '__origin__', None)
        return t if ot is None else ot

    return issubclass(origin(sub_type), origin(base_type))
138
139
140
141
142
143


def isgenerictype(t):
    """
    Returns True if the string representation of ``t`` ends with a closing
    square bracket, as it does for parameterized generic types such as
    ``List[int]``.
    """
    return str(t)[-1:] == ']'


144
145
146
147
148
149
150
#######################################################################
#
# loading and compiling
#
#######################################################################


151
def load_if_file(text_or_file) -> str:
    """
    Returns the content of the UTF-8 encoded text file ``text_or_file`` if
    that parameter looks like a file name (see ``is_filename()``). Otherwise
    ``text_or_file`` itself is returned.

    If the file cannot be found, ``text_or_file`` is returned as well, unless
    it consists only of word characters, blanks and the characters
    ``/ : . \\``. In that case it is taken for a mistyped file name and a
    FileNotFoundError is raised.
    """
    if not is_filename(text_or_file):
        return text_or_file
    try:
        with open(text_or_file, encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        if not re.fullmatch(r'[\w/:. \\]+', text_or_file):
            # does not look like a path, so treat it as single-line source text
            return text_or_file
        raise FileNotFoundError('Not a valid file: ' + text_or_file + '!\n(Add "\\n" '
                                'to distinguish source data from a file name.)')


174
def is_python_code(text_or_file: str) -> bool:
    """
    Checks whether ``text_or_file`` is Python code or the name of a file that
    contains Python code.

    If ``text_or_file`` looks like a file name (see ``is_filename()``), only
    its extension is examined: the result is True if and only if the name
    ends with '.py' (case-insensitive). The file itself is not opened.
    Otherwise the result is True if ``text_or_file`` can be parsed as Python
    source code.
    """
    # Local import, so that this function does not rely on the `parser`
    # module, which was deprecated in Python 3.9 and removed in Python 3.10.
    # NOTE(review): the module-level `import parser` is no longer needed by
    # this function and breaks importing this module on Python >= 3.10.
    import ast

    if is_filename(text_or_file):
        return text_or_file[-3:].lower() == '.py'
    try:
        ast.parse(text_or_file)
    except (SyntaxError, ValueError, OverflowError):
        return False
    return True


191
def has_fenced_code(text_or_file: str, info_strings=('ebnf', 'test')) -> bool:
    """
    Checks whether ``text_or_file`` contains fenced code blocks which are
    marked by one of the given info strings.
    See http://spec.commonmark.org/0.28/#fenced-code-blocks for more
    information on fenced code blocks in common mark documents.

    If ``text_or_file`` looks like a file name (see ``is_filename()``), the
    file is read as UTF-8 text and its content is examined. ``info_strings``
    may be a single string or a sequence of strings. They are inserted into
    a regular expression without escaping and are matched case-insensitively.

    Only the first opening fence carrying one of the info strings is
    examined: the result is True if and only if the same fence marker occurs
    again at the beginning of a later line.
    """
    if is_filename(text_or_file):
        with open(text_or_file, 'r', encoding='utf-8') as f:
            markdown = f.read()
    else:
        markdown = text_or_file

    # cheap pre-check that avoids compiling a regular expression
    if markdown.find('\n~~~') < 0 and markdown.find('\n```') < 0:
        return False

    if isinstance(info_strings, str):
        info_strings = (info_strings,)
    # Raw strings avoid the invalid escape sequence "\-" in a plain literal.
    # The rest of a tilde fence's opening line may contain any character
    # except a newline; for backtick fences backticks are excluded as well.
    fence_tmpl = (r'\n(?:(?:``[`]*[ ]*(?:%s)(?=[ .\-:\n])[^`\n]*\n)'
                  r'|(?:~~[~]*[ ]*(?:%s)(?=[ .\-:\n])[^\n]*\n))')
    label_re = '|'.join('(?:%s)' % info_string for info_string in info_strings)
    rx_fence = re.compile(fence_tmpl % (label_re, label_re), flags=re.IGNORECASE)

    match = rx_fence.search(markdown)
    if match is None:
        return False
    # the newline plus the run of fence characters that opened the block
    fence = re.match(r'(?:\n`+)|(?:\n~+)', match.group(0)).group(0)
    return markdown.find(fence, match.end()) >= 0
222
223


224
def md5(*txt):
    """
    Returns the md5-checksum (as a string of hexadecimal digits) of the
    concatenation of all strings passed in ``txt``. The checksum can be used
    to find out whether some piece of text, for example a grammar source
    file, has changed.
    """
    data = b''.join(t.encode('utf8') for t in txt)
    return hashlib.md5(data).hexdigest()


236
def compile_python_object(python_src, catch_obj_regex=""):
    """
    Compiles and executes the python source code ``python_src`` and returns
    the object the name of which is matched by ``catch_obj_regex``.

    :param python_src: the Python source code to be compiled and executed
    :param catch_obj_regex: a regular expression, either as string or as
        compiled pattern, that is matched against the beginning of the names
        in the namespace in which the code has been executed. If it is
        empty, the complete namespace dictionary (which also contains the
        ``__builtins__`` entry added by ``exec()``) is returned.
    :returns: the single matching object or the namespace dictionary
    :raises ValueError: if no name or more than one name is matched
    """
    if isinstance(catch_obj_regex, str):
        catch_obj_regex = re.compile(catch_obj_regex)
    code = compile(python_src, '<string>', 'exec')
    namespace = {}  # type: Dict
    # NOTE: this executes arbitrary code; pass only trusted sources
    exec(code, namespace)
    # A compiled pattern object is always truthy, so the pattern string
    # itself must be tested in order to recognize an empty regular expression.
    if not catch_obj_regex or not getattr(catch_obj_regex, 'pattern', True):
        return namespace
    matches = [key for key in namespace if catch_obj_regex.match(key)]
    if len(matches) < 1:
        raise ValueError("No object matching /%s/ defined in source code." %
                         catch_obj_regex.pattern)
    elif len(matches) > 1:
        raise ValueError("Ambiguous matches for %s : %s" %
                         (str(catch_obj_regex), str(matches)))
    return namespace[matches[0]]


#######################################################################
#
# smart lists and multi-keyword tables
#
#######################################################################


Eckhart Arnold's avatar
Eckhart Arnold committed
268
# def smart_list(arg: Union[str, Iterable[T]]) -> Union[Sequence[str], Sequence[T]]:
269
def smart_list(arg: Union[str, Iterable, Any]) -> Union[Sequence, Set]:
    """
    Returns the argument as list, depending on its type and content.

    A string is split into a list of stripped substrings. The first of the
    delimiters ';' and ',' (tried in this order) that occurs in the string
    is used for splitting. If neither occurs, the string is split at
    blanks, e.g.

    >>> smart_list('1; 2, 3; 4')
    ['1', '2, 3', '4']
    >>> smart_list('2, 3')
    ['2', '3']
    >>> smart_list('a b cd')
    ['a', 'b', 'cd']

    A sequence (other than a string) or a set is returned unchanged, e.g.

    >>> smart_list((1, 2, 3))
    (1, 2, 3)
    >>> smart_list({1, 2, 3})
    {1, 2, 3}

    Any other iterable is converted into a list, e.g.

    >>> smart_list(i for i in {1,2,3})
    [1, 2, 3]

    Everything else is wrapped in a list with a single element, e.g.

    >>> smart_list(125)
    [125]
    """
    if isinstance(arg, str):
        for delimiter in ';,':
            if delimiter in arg:
                return [item.strip() for item in arg.split(delimiter)]
        return [item.strip() for item in arg.strip().split(' ')]
    if isinstance(arg, Sequence) or isinstance(arg, Set):
        return arg
    if isinstance(arg, Iterable):
        return list(arg)
    return [arg]


315
def expand_table(compact_table: Dict) -> Dict:
    """
    Expands a table by separating keywords that are tuples or strings
    containing comma separated words into single keyword entries with
    the same values. Returns the expanded table. The keys are split by
    ``smart_list()``.
    Example:

    >>> expand_table({"a, b": 1, ('d','e','f'):5, "c":3})
    {'a': 1, 'b': 1, 'd': 5, 'e': 5, 'f': 5, 'c': 3}

    :raises KeyError: if the same single keyword results from more than one
        entry (or more than once from the same entry) of the compact table
    """
    expanded_table = {}  # type: Dict
    for key, value in list(compact_table.items()):
        for k in smart_list(key):
            if k in expanded_table:
                # `key` may be a tuple; wrapping it in a one-element tuple
                # keeps the %-operator from unpacking it as several arguments
                raise KeyError('Key "%s" used more than once in compact table!' % (key,))
            expanded_table[k] = value
    return expanded_table
334
335


eckhart's avatar
eckhart committed
336

337
338
#######################################################################
#
339
# miscellaneous (DHParser-specific)
340
341
342
343
#
#######################################################################


344
def sane_parser_name(name) -> bool:
    """
    Checks whether given name is an acceptable parser name. Parser names
    must not be empty and must not be preceded or succeeded by a double
    underscore '__'!

    :param name: the name to be checked
    :returns: True if the name is acceptable, False otherwise (also for an
        empty name or None)
    """
    # bool() makes sure that a real boolean is returned for falsy names,
    # rather than the falsy value itself ('' or None)
    return bool(name) and name[:2] != '__' and name[-2:] != '__'


353
354
355
356
357
358
359
#######################################################################
#
# initialization
#
#######################################################################


360
361
try:
    # If the standard output is not UTF-8 encoded, replace it by a UTF-8
    # writer on top of the detached underlying stream, so that `print()`
    # does not raise an error on non-ASCII characters. The casts only serve
    # static type checking.
    if sys.stdout.encoding.upper() != "UTF-8":
        sys.stdout = cast(
            io.TextIOWrapper,
            codecs.getwriter("utf-8")(
                cast(io.BytesIO, cast(io.TextIOWrapper, sys.stdout).detach())))
except AttributeError:
    # sys.stdout lacks one of the attributes used above; presumably it has
    # already been replaced by somebody else, so leave it untouched
    pass