Currently job artifacts in CI/CD pipelines on LRZ GitLab never expire. Starting from Wed 26.1.2022 the default expiration time will be 30 days (GitLab default). Currently existing artifacts in already completed jobs will not be affected by the change. The latest artifacts for all jobs in the latest successful pipelines will be kept. More information: https://gitlab.lrz.de/help/user/admin_area/settings/continuous_integration.html#default-artifacts-expiration

toolkit.py 10.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# toolkit.py - utility functions for DHParser
#
# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
#                 Bavarian Academy of Sciences and Humanities (badw.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.  See the License for the specific language governing
# permissions and limitations under the License.
17
18


19
20
21
22
23
24
"""
Module ``toolkit`` contains utility functions that are needed across
several of the other DHParser-Modules or that are just very generic
so that they are best defined in a toolkit-module.
"""

25
import codecs
26
import hashlib
27
import inspect
28
29
import io
import parser
30

di68kap's avatar
di68kap committed
31
32
33
34
try:
    import regex as re
except ImportError:
    import re
35
import sys
36

37
try:
38
    import typing
39
except ImportError:
40
    import DHParser.foreign_typing as typing
di68kap's avatar
di68kap committed
41
    sys.modules['typing'] = typing  # make it possible to import from typing
42

43
from typing import Any, Iterable, Sequence, Set, Union, Dict, cast
44

45
# Public API of this module, i.e. the names that are exported by a
# wildcard import of this module.
__all__ = ('escape_re',
           'escape_control_characters',
           'is_filename',
           'lstrip_docstring',
           'issubtype',
           'isgenerictype',
           'load_if_file',
           'is_python_code',
           'has_fenced_code',
           'md5',
           'expand_table',
           'compile_python_object',
           'smart_list',
           'sane_parser_name')
58
59
60
61


#######################################################################
#
62
# miscellaneous (generic)
63
64
#
#######################################################################
65
66


67
def escape_re(strg: str) -> str:
    """
    Returns the string with all regular expression special characters escaped.

    Each character of ``strg`` that occurs in ``re_chars`` (see below) is
    prefixed with a backslash; all other characters are copied unchanged.
    """
    re_chars = r"\.^$*+?{}[]()#<>=|!"
    # Escaping in a single pass guarantees that a backslash which has been
    # inserted as an escape is never itself escaped a second time.
    return ''.join('\\' + ch if ch in re_chars else ch for ch in strg)
76
77


78
def escape_control_characters(strg: str) -> str:
    r"""
    Replace all control characters (e.g. \n \t) in a string by their
    backslashed representation.

    Printable characters, including quotation marks and backslashes, are
    copied unchanged. Only characters that ``repr()`` would render as an
    escape sequence are replaced.
    """
    # ``str.isprintable()`` is False for exactly those characters that
    # ``repr()`` escapes. Working character by character avoids the quote
    # escaping that ``repr()`` applies to a whole string containing both
    # kinds of quotation marks. ``[1:-1]`` drops the enclosing quotes.
    return ''.join(ch if ch.isprintable() else repr(ch)[1:-1] for ch in strg)


def lstrip_docstring(docstring: str) -> str:
    """
    Removes the common indentation of all lines but the first from a
    docstring.

    Tabs are expanded to four blanks beforehand. Lines that consist of
    whitespace only are ignored when the common indentation is determined.
    If no indentation below 255 characters is found, nothing is removed.
    """
    head, *tail = docstring.replace('\t', '    ').split('\n')
    margins = [len(text) - len(text.lstrip()) for text in tail if text.lstrip()]
    # 255 acts as an upper bound and as the "nothing found" marker
    margin = min(margins + [255])
    if margin >= 255:
        margin = 0
    return '\n'.join([head] + [text[margin:] for text in tail])


100
def is_filename(strg: str) -> bool:
    """Tries to guess whether string ``strg`` is a file name.

    A string is taken for a file name if it contains no line break,
    neither starts nor ends with a blank and contains none of the
    characters ``* ? " < > |``. Mind that the empty string passes all
    of these checks and is therefore reported as a file name, too.
    """
    return ('\n' not in strg
            and strg[:1] != " " and strg[-1:] != " "
            and not any(ch in strg for ch in '*?"<>|'))
Eckhart Arnold's avatar
Eckhart Arnold committed
105
106


107
108
109
110
111
112
113
114
#######################################################################
#
# type system support
#
#######################################################################


def issubtype(sub_type, base_type):
    """Returns True, if ``sub_type`` is a subclass of ``base_type``.

    Both arguments may be plain classes or objects that carry an
    ``__origin__`` attribute, as the generic types of the ``typing``
    module do. If ``__origin__`` exists and is not None, it is used in
    place of the type itself; the resulting pair is then passed to
    ``issubclass()``.
    """
    def origin(t):
        # types without ``__origin__`` (or with ``__origin__ == None``)
        # stand for themselves
        ot = getattr(t, '__origin__', None)
        return ot if ot is not None else t
    return issubclass(origin(sub_type), origin(base_type))
132
133
134
135
136
137


def isgenerictype(t):
    """Returns True, if the string representation of ``t`` ends with a
    closing square bracket, as is the case for ``typing.List[int]``.
    """
    return str(t)[-1:] == ']'


138
139
140
141
142
143
144
#######################################################################
#
# loading and compiling
#
#######################################################################


145
146
147
def load_if_file(text_or_file) -> str:
    """Reads and returns content of a text-file if parameter
    `text_or_file` is a file name (i.e. a single line string),
    otherwise (i.e. if `text_or_file` is a multi-line string)
    `text_or_file` is returned.

    The file is read with UTF-8 encoding. If `text_or_file` passes
    `is_filename()` but no such file exists, the string is returned
    unchanged unless it consists solely of word characters, slashes,
    backslashes, colons, dots and blanks.

    Raises:
        FileNotFoundError: if no such file exists and `text_or_file`
            consists solely of the characters listed above, i.e. if it
            very much looks like a file name.
    """
    if not is_filename(text_or_file):
        return text_or_file
    try:
        with open(text_or_file, encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        # Only strings that are made up entirely of typical path characters
        # are reported as missing files; anything else is treated as data.
        if re.fullmatch(r'[\w/:. \\]+', text_or_file):
            raise FileNotFoundError('Not a valid file: ' + text_or_file + '!\n(Add "\\n" '
                                    'to distinguish source data from a file name.)')
        return text_or_file


166
def is_python_code(text_or_file: str) -> bool:
    """Checks whether 'text_or_file' is python code or the name of a file that
    contains python code.

    If `text_or_file` passes `is_filename()`, only its extension is
    examined: the result is True if the name ends with '.py' (in any
    letter case). Otherwise the result is True if the text can be parsed
    as a sequence of Python statements.
    """
    # Imported locally: ``ast.parse`` takes the place of ``parser.suite``,
    # because the ``parser`` module was removed from the standard library
    # in Python 3.10.
    import ast

    if is_filename(text_or_file):
        return text_or_file[-3:].lower() == '.py'
    try:
        ast.parse(text_or_file)
    except (SyntaxError, ValueError, OverflowError):
        return False
    return True


181
def has_fenced_code(text_or_file: str, info_strings=('ebnf', 'test')) -> bool:
    """Checks whether `text_or_file` contains fenced code blocks, which are
    marked by one of the given info strings.
    See http://spec.commonmark.org/0.28/#fenced-code-blocks for more
    information on fenced code blocks in common mark documents.

    `text_or_file` is read from disk (UTF-8) if it passes `is_filename()`,
    otherwise it is taken as the markdown text itself. `info_strings` may
    be a single string or a collection of strings; each entry is inserted
    into a regular expression without escaping. Only the first opening
    fence with a matching info string is examined: the result is True if
    the same run of fence characters occurs again at the beginning of a
    later line.
    """
    if is_filename(text_or_file):
        with open(text_or_file, 'r', encoding='utf-8') as f:
            markdown = f.read()
    else:
        markdown = text_or_file

    # cheap pre-check that spares compiling the regular expression
    if markdown.find('\n~~~') < 0 and markdown.find('\n```') < 0:
        return False

    if isinstance(info_strings, str):
        info_strings = (info_strings,)
    # Raw strings keep ``\-`` from being an invalid string escape. The rest
    # of the info line may contain anything but a line break (and, for
    # backtick fences, anything but a backtick).
    fence_tmpl = (r'\n(?:(?:``[`]*[ ]*(?:%s)(?=[ .\-:\n])[^`\n]*\n)'
                  r'|(?:~~[~]*[ ]*(?:%s)(?=[ .\-:\n])[^\n]*\n))')
    label_re = '|'.join('(?:%s)' % info_string for info_string in info_strings)
    rx_fence = re.compile(fence_tmpl % (label_re, label_re), flags=re.IGNORECASE)

    match = rx_fence.search(markdown)
    if match is None:
        return False
    # line break plus the run of fence characters that opened the block
    opening_fence = re.match(r'\n`+|\n~+', match.group(0)).group(0)
    # The match ends with the line break that terminates the opening line.
    # Starting the search at that line break also finds a closing fence
    # on the very next line, i.e. an empty code block.
    return markdown.find(opening_fence, match.end() - 1) >= 0
210
211


212
213
214
215
216
217
218
219
220
221
def md5(*txt):
    """Computes one md5-checksum over all strings passed as arguments
    and returns it as a string of hexadecimal digits. The strings are
    UTF-8 encoded and fed to the hash in the given order. Useful for
    finding out whether a text, e.g. a grammar source, has changed.
    """
    digest = hashlib.md5()
    for chunk in (piece.encode('utf8') for piece in txt):
        digest.update(chunk)
    return digest.hexdigest()


222
223
224
225
226
227
228
229
230
231
232
def compile_python_object(python_src, catch_obj_regex=""):
    """Compiles the python source code and returns the (first) object
    the name of which is matched by ``catch_obj_regex``. If catch_obj
    is the empty string, the namespace dictionary will be returned.

    ``catch_obj_regex`` may be a string or a compiled regular expression.
    It is matched against the beginning of every name in the namespace
    the code was executed in. Mind that this namespace also contains the
    ``__builtins__`` entry that ``exec()`` adds.

    Raises:
        ValueError: if no name or more than one name is matched by
            ``catch_obj_regex``.
    """
    if isinstance(catch_obj_regex, str):
        catch_obj_regex = re.compile(catch_obj_regex)
    code = compile(python_src, '<string>', 'exec')
    namespace = {}
    # NOTE: this runs arbitrary code; never pass source from untrusted origin
    exec(code, namespace)
    # A compiled pattern object is always truthy. Thus, the pattern text
    # must be examined to find out whether an empty regex has been passed.
    if not catch_obj_regex or not catch_obj_regex.pattern:
        return namespace
    matches = [key for key in namespace if catch_obj_regex.match(key)]
    if not matches:
        raise ValueError("No object matching /%s/ defined in source code." %
                         catch_obj_regex.pattern)
    if len(matches) > 1:
        raise ValueError("Ambiguous matches for %s : %s" %
                         (str(catch_obj_regex), str(matches)))
    return namespace[matches[0]]


#######################################################################
#
# smart lists and multi-keyword tables
#
#######################################################################


Eckhart Arnold's avatar
Eckhart Arnold committed
252
# def smart_list(arg: Union[str, Iterable[T]]) -> Union[Sequence[str], Sequence[T]]:
253
def smart_list(arg: Union[str, Iterable, Any]) -> Union[Sequence, Set]:
    """Returns the argument as list, depending on its type and content.

    If the argument is a string, it will be interpreted as a list of
    comma separated values, trying ';', ',', ' ' as possible delimiters
    in this order, e.g.

    >>> smart_list('1; 2, 3; 4')
    ['1', '2, 3', '4']
    >>> smart_list('2, 3')
    ['2', '3']
    >>> smart_list('a b cd')
    ['a', 'b', 'cd']

    If the argument is a collection other than a string, it will be
    returned as is, e.g.

    >>> smart_list((1, 2, 3))
    (1, 2, 3)
    >>> smart_list({1, 2, 3})
    {1, 2, 3}

    If the argument is another iterable than a collection, it will
    be converted into a list, e.g.

    >>> smart_list(i for i in {1,2,3})
    [1, 2, 3]

    Finally, if none of the above is true, the argument will be
    wrapped in a list and returned, e.g.

    >>> smart_list(125)
    [125]
    """
    if isinstance(arg, str):
        # the first delimiter that occurs in the string at all wins
        for delimiter in (';', ','):
            if delimiter in arg:
                return [s.strip() for s in arg.split(delimiter)]
        return [s.strip() for s in arg.strip().split(' ')]
    if isinstance(arg, (Sequence, Set)):
        return arg
    if isinstance(arg, Iterable):
        return list(arg)
    return [arg]


297
def expand_table(compact_table: Dict) -> Dict:
    """Expands a table by separating keywords that are tuples or strings
    containing comma separated words into single keyword entries with
    the same values. Returns the expanded table.
    Example:

    >>> expand_table({"a, b": 1, ('d','e','f'):5, "c":3})
    {'a': 1, 'b': 1, 'd': 5, 'e': 5, 'f': 5, 'c': 3}

    Raises:
        KeyError: if the same single keyword results from more than one
            place in the compact table.
    """
    expanded_table = {}  # type: Dict
    for key, value in compact_table.items():
        for k in smart_list(key):
            if k in expanded_table:
                # ``str(k)`` is required here: a bare tuple on the right
                # hand side of ``%`` would be unpacked as an argument list
                # and raise a TypeError instead of the intended KeyError.
                raise KeyError('Key "%s" used more than once in compact table!' % str(k))
            expanded_table[k] = value
    return expanded_table
314
315


eckhart's avatar
eckhart committed
316

317
318
#######################################################################
#
319
# miscellaneous (DHParser-specific)
320
321
322
323
#
#######################################################################


324
def sane_parser_name(name) -> bool:
    """Checks whether given name is an acceptable parser name. Parser names
    must not be preceded or succeeded by a double underscore '__'!

    Empty names (empty string or None) are not acceptable either.
    """
    # ``bool(name)`` makes sure that a proper boolean value is returned for
    # empty names, rather than the empty name itself.
    return bool(name) and name[:2] != '__' and name[-2:] != '__'


331
332
333
334
335
336
337
#######################################################################
#
# initialization
#
#######################################################################


338
339
# Executed once when the module is imported: if the standard output does
# not report UTF-8 as its encoding, it is replaced by a UTF-8 stream writer
# that writes to the underlying binary stream.
try:
    if sys.stdout.encoding.upper() != "UTF-8":
        # make sure that `print()` does not raise an error on
        # non-ASCII characters:
        sys.stdout = cast(io.TextIOWrapper, codecs.getwriter("utf-8")(cast(
            io.BytesIO, cast(io.TextIOWrapper, sys.stdout).detach())))
except AttributeError:
    # somebody has already taken care of this !?
    # (raised e.g. if `sys.stdout` is an object without a usable
    # `encoding` attribute or without a `detach()` method)
    pass