toolkit.py 16.3 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
"""toolkit.py - utility functions for DHParser

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences an Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.


Module ``toolkit`` contains utility functions and cross-sectional
functionality like logging support that is needed across several
of the other DHParser-Modules.

For logging functionality, the global variable LOGGING is defined which
contains the name of a directory where log files shall be placed. By
setting its value to the empty string "" logging can be turned off.

To read the directory name, the function ``log_dir()`` should be called
rather than reading the variable LOGGING. ``log_dir()`` makes sure
the directory exists and raises an error if a file with the same name
already exists.
"""

33
import codecs
34
import collections
Eckhart Arnold's avatar
Eckhart Arnold committed
35
import contextlib
36
37
import hashlib
import os
38

di68kap's avatar
di68kap committed
39
40
41
42
try:
    import regex as re
except ImportError:
    import re
43
import sys
44

45
try:
Eckhart Arnold's avatar
Eckhart Arnold committed
46
    from typing import Any, List, Tuple, Iterable, Sequence, Union, Optional, TypeVar
47
except ImportError:
Eckhart Arnold's avatar
Eckhart Arnold committed
48
    from .typing34 import Any, List, Tuple, Iterable, Sequence, Union, Optional, TypeVar
49

50
__all__ = ('logging',
Eckhart Arnold's avatar
Eckhart Arnold committed
51
52
           'is_logging',
           'log_dir',
53
           'logfile_basename',
54
55
56
57
           'StringView',
           'sv_match',
           'sv_index',
           'sv_search',
58
59
           # 'supress_warnings',
           # 'warnings',
60
           # 'repr_call',
61
           'line_col',
62
           'error_messages',
63
           'escape_re',
Eckhart Arnold's avatar
Eckhart Arnold committed
64
           'is_filename',
65
66
           'load_if_file',
           'is_python_code',
67
68
           'md5',
           'expand_table',
69
           'smart_list',
70
           'sane_parser_name')
71
72


Eckhart Arnold's avatar
Eckhart Arnold committed
73
74
75
def log_dir() -> str:
    """Returns the path of the directory for log files and creates that
    directory first, if it does not exist yet.

    The directory name is read from the global variable ``LOGGING``. If
    the directory does not contain a file 'info.txt' yet, this file is
    written with a notice that the content of the directory may be
    overwritten.

    WARNING: Any files in the log dir will eventually be overwritten.
    Don't use a directory name that could be the name of a directory
    for other purposes than logging.

    Returns:
        name of the logging directory
    Raises:
        NameError: if ``LOGGING`` is not defined or evaluates to False
        IOError: if the path exists, but is not a directory
    """
    # the try-except clauses in the following are precautions for multiprocessing
    global LOGGING
    try:
        log_path = LOGGING  # raises a name error if LOGGING is not defined
        if not log_path:
            raise NameError  # an empty name means: logging is turned off
    except NameError:
        raise NameError("No access to log directory before logging has been turned "
                        "on within the same thread/process.")

    # refuse to use anything that exists, but is not a directory
    if os.path.exists(log_path) and not os.path.isdir(log_path):
        raise IOError('"' + log_path + '" cannot be used as log directory, '
                                       'because it is not a directory!')
    try:
        os.mkdir(log_path)
    except FileExistsError:
        pass  # the directory is already there (possibly created concurrently)

    notice_path = os.path.join(log_path, 'info.txt')
    if not os.path.exists(notice_path):
        notice = ("This directory has been created by DHParser to store log files from\n"
                  "parsing. ANY FILE IN THIS DIRECTORY CAN BE OVERWRITTEN! Therefore,\n"
                  "do not place any files here and do not bother editing files in this\n"
                  "directory as any changes will get lost.\n")
        with open(notice_path, 'w') as notice_file:
            notice_file.write(notice)
    return log_path


Eckhart Arnold's avatar
Eckhart Arnold committed
111
@contextlib.contextmanager
def logging(dirname="LOGS"):
    """Context manager. Log files within this context will be stored in
    directory ``dirname``. Logging is turned off if name is empty.

    The value that the global variable ``LOGGING`` had before entering
    the context (or the empty string, if it was undefined) is restored
    when the context is left, no matter whether it is left regularly or
    by an exception.

    Args:
        dirname: the name for the log directory or the empty string to
            turn logging off
    """
    global LOGGING
    if dirname and not isinstance(dirname, str):
        dirname = "LOGS"  # be fail tolerant here...
    try:
        save = LOGGING
    except NameError:
        save = ""  # LOGGING has never been set in this thread/process
    LOGGING = dirname
    try:
        yield
    finally:
        # without try/finally an exception raised inside the with-block
        # would leave LOGGING switched to ``dirname`` for good
        LOGGING = save


131
def is_logging() -> bool:
    """Tells whether logging is turned on.

    Returns:
        True, if the global variable ``LOGGING`` is defined and has a
        non-empty value; False otherwise.
    """
    global LOGGING
    try:
        current = LOGGING
    except NameError:
        # LOGGING has never been set in this thread/process
        return False
    return bool(current)


Eckhart Arnold's avatar
Eckhart Arnold committed
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
def clear_logs(logfile_types={'.cst', '.ast', '.log'}):
    """Removes all logs from the log-directory and removes the
    log-directory if it is empty.

    A directory entry counts as a log if its extension is contained in
    ``logfile_types`` or if it is the file 'info.txt'. The directory
    itself is only removed if no other entries were found in it.

    Args:
        logfile_types: the file name extensions (including the leading
            dot) of the files that shall be deleted
    """
    log_dirname = log_dir()
    foreign_entries_found = False
    for entry in os.listdir(log_dirname):
        extension = os.path.splitext(entry)[1]
        if extension in logfile_types or entry == 'info.txt':
            os.remove(os.path.join(log_dirname, entry))
        else:
            foreign_entries_found = True
    if not foreign_entries_found:
        os.rmdir(log_dirname)


Eckhart Arnold's avatar
Eckhart Arnold committed
157
class StringView(collections.abc.Sized):
    """A rudimentary StringView class, just enough for the use cases
    of DHParser's parsers.

    Slicing Python-strings always yields copies of a segment of the original
    string. See: https://mail.python.org/pipermail/python-dev/2008-May/079699.html
    However, this becomes costly (in terms of space and as a consequence also
    time) when parsing longer documents. Unfortunately, Python's `memoryview`
    does not work for unicode strings. Hence, the StringView class.

    Attributes:
        text:   the complete underlying string
        begin:  absolute index into ``text`` where the view starts
        end:    absolute index into ``text`` where the view ends (exclusive)
        len:    the length of the view, i.e. ``max(end - begin, 0)``
    """

    __slots__ = ['text', 'begin', 'end', 'len']

    def __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None:
        """Creates a view on ``text[begin:end]``. ``begin`` and ``end`` are
        interpreted like slice indices, i.e. they may be negative or None
        and are clipped to the bounds of ``text``."""
        self.text = text  # type: str
        self.begin, self.end = StringView.real_indices(begin, end, len(text))
        self.len = max(self.end - self.begin, 0)

    @staticmethod
    def real_indices(begin, end, len):
        """Returns the tuple ``(begin, end)`` with both indices converted to
        non-negative indices inside the range ``0 .. len``: None is replaced
        by 0 (``begin``) or ``len`` (``end``), negative indices are counted
        from the end, and out-of-range indices are clipped."""
        def pack(index, length):
            index = index if index >= 0 else index + length
            return 0 if index < 0 else length if index > length else index
        if begin is None:
            begin = 0
        if end is None:
            end = len
        return pack(begin, len), pack(end, len)

    def __bool__(self):
        return bool(self.text) and self.end > self.begin

    def __len__(self):
        return self.len

    def __str__(self):
        # this is the only place where the viewed segment is actually copied
        return self.text[self.begin:self.end]

    def __getitem__(self, index):
        """Returns a new StringView for the slice ``index``, sharing the
        same underlying string. Only slice objects are supported; only
        ``index.start`` and ``index.stop`` are evaluated, ``index.step``
        is ignored."""
        start, stop = StringView.real_indices(index.start, index.stop, self.len)
        return StringView(self.text, self.begin + start, self.begin + stop)

    def __eq__(self, other):
        return str(self) == str(other)  # PERFORMANCE WARNING: This creates copies of the strings

    def find(self, sub, start=None, end=None) -> int:
        """Returns the lowest index, relative to the beginning of the view,
        where ``sub`` is found within ``self[start:end]``, or -1 if ``sub``
        is not found (same contract as ``str.find``)."""
        if start is None and end is None:
            pos = self.text.find(sub, self.begin, self.end)
        else:
            start, end = StringView.real_indices(start, end, self.len)
            pos = self.text.find(sub, self.begin + start, self.begin + end)
        # "not found" must stay -1; subtracting ``begin`` from it would
        # yield a bogus negative index for views that do not start at 0
        return pos - self.begin if pos >= 0 else -1

    def startswith(self, prefix: str, start: int = 0, end: Optional[int] = None) -> bool:
        """Returns True, if ``self[start:end]`` starts with ``prefix``.
        ``start`` and ``end`` are relative to the view and are clipped to
        its bounds, so that text outside the view is never examined."""
        start, end = StringView.real_indices(start, end, self.len)
        return self.text.startswith(prefix, self.begin + start, self.begin + end)



def sv_match(regex, sv: StringView):
    """Calls ``regex.match()`` on the string underlying the string view
    ``sv``, restricted to the range from ``sv.begin`` to ``sv.end``, and
    returns its result.
    """
    first, last = sv.begin, sv.end
    return regex.match(sv.text, pos=first, endpos=last)


Eckhart Arnold's avatar
Eckhart Arnold committed
221
def sv_index(absolute_index: Union[int, Iterable], sv: StringView) -> int:
    """
    Converts an index into the string watched by a StringView object
    to an index relative to the string view object, e.g.:
    >>> sv = StringView('xxIxx')[2:3]
    >>> match = sv_match(re.compile('I'), sv)
    >>> match.end()
    3
    >>> sv_index(match.end(), sv)
    1
    """
    view_offset = sv.begin
    return absolute_index - view_offset


def sv_indices(absolute_indices: Iterable[int], sv: StringView) -> Tuple[int]:
    """Converts several indices into the string watched by a StringView
    object to indices relative to the string view object and returns
    them as a tuple. See also: `sv_index()`
    """
    relative = []
    for absolute in absolute_indices:
        relative.append(absolute - sv.begin)
    return tuple(relative)
240
241
242
243
244
245
246
247


def sv_search(regex, sv: StringView):
    """Calls ``regex.search()`` on the string underlying the string view
    ``sv``, restricted to the range from ``sv.begin`` to ``sv.end``, and
    returns its result.
    """
    first, last = sv.begin, sv.end
    return regex.search(sv.text, pos=first, endpos=last)



# Module-level StringView instance that wraps the empty string.
EMPTY_STRING_VIEW = StringView('')
248
249


250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
# def repr_call(f, parameter_list) -> str:
#     """Turns a list of items into a string resembling the parameter
#     list of a function call by omitting default values at the end:
#     >>> def f(a, b=1):    print(a, b)
#     >>> repr_call(f, (5,1))
#     'f(5)'
#     >>> repr_call(f, (5,2))
#     'f(5, 2)'
#     """
#     i = 0
#     defaults = f.__defaults__ if f.__defaults__ is not None else []
#     for parameter, default in zip(reversed(parameter_list), reversed(defaults)):
#         if parameter != default:
#             break
#         i -= 1
#     if i < 0:
#         parameter_list = parameter_list[:i]
#     name = f.__self__.__class__.__name__ if f.__name__ == '__init__' else f.__name__
268
#     return "%s(%s)" % (name, ", ".merge_children(repr(item) for item in parameter_list))
269
270


271
def line_col(text: str, pos: int) -> Tuple[int, int]:
    """Returns the position within a text as (line, column)-tuple.

    Both the line and the column number are counted from 1.

    Args:
        text: the text the position refers to
        pos:  the position as index into ``text``; may be equal to
            ``len(text)``, i.e. point one character after EOF
    """
    assert pos <= len(text), str(pos) + " > " + str(len(text))  # can point one character after EOF
    # index of the last line break before pos; -1 if pos is on the first line
    preceding_break = text.rfind("\n", 0, pos)
    line_number = 1 + text.count("\n", 0, pos)
    return line_number, pos - preceding_break


280
def error_messages(source_text, errors) -> List[str]:
    """Converts error objects into a sorted list of error messages with
    line and column numbers at the beginning.

    Args:
        source_text (str):  The source text on which the errors occurred.
            (Needed in order to determine the line and column numbers.)
        errors:  The error objects. They are sorted before being
            converted; their attributes ``pos`` and ``msg`` are read.
    Returns:
        a list that contains all error messages in string form. Each
        string starts with "line: [Line-No], column: [Column-No]"
    """
    messages = []
    for err in sorted(errors):
        location = "line: %3i, column: %2i" % line_col(source_text, err.pos)
        messages.append(location + ", error: %s" % err.msg)
    return messages
295
296


297
def escape_re(s) -> str:
    """Returns `s` with all regular expression special characters escaped,
    i.e. preceded by a backslash.
    """
    assert isinstance(s, str)
    special_chars = r"\.^$*+?{}[]()#<>=|!"
    # Escaping character by character yields the same result as replacing
    # the special characters one after another with the backslash first.
    return ''.join('\\' + ch if ch in special_chars else ch for ch in s)


307
def is_filename(s) -> bool:
    """Tries to guess whether string ``s`` is a file name.

    ``s`` is considered a file name if it contains no line break, neither
    starts nor ends with a blank, and contains neither '*' nor '?'.
    """
    if s.find('\n') >= 0:
        return False
    if s[:1] == " " or s[-1:] == " ":
        return False
    return s.find('*') < 0 and s.find('?') < 0


313
def logfile_basename(filename_or_text, function_or_class_or_instance) -> str:
    """Generates a reasonable logfile-name (without extension) based on
    the given information.

    Args:
        filename_or_text: a file name or a text. If it is a file name
            (see ``is_filename()``), the log file is named after it.
        function_or_class_or_instance: the object the name of which is
            used as a fallback if ``filename_or_text`` is not a file name
    Returns:
        the base name of the file without its extension, if
        ``filename_or_text`` is a file name; otherwise the qualified name
        of the function or class (or the class name of the instance). If
        that name contains a dot, only the part before the first dot with
        '_out' appended is returned.
    """
    if is_filename(filename_or_text):
        return os.path.basename(os.path.splitext(filename_or_text)[0])
    try:
        name = function_or_class_or_instance.__qualname__
    except AttributeError:
        # instances do not have a qualified name: use the name of their class
        name = function_or_class_or_instance.__class__.__name__
    dot = name.find('.')
    # NOTE: '_out' is only appended if a dotted name has been cut off
    return (name[:dot] + '_out') if dot >= 0 else name


328
329
330
def load_if_file(text_or_file) -> str:
    """Reads and returns content of a text-file if parameter
    `text_or_file` is a file name (i.e. a single line string),
    otherwise (i.e. if `text_or_file` is a multi-line string)
    `text_or_file` is returned.

    If `text_or_file` passes for a file name, but no such file exists, a
    FileNotFoundError is raised in case the string consists only of word
    characters, slashes, backslashes, colons, dots and blanks. Otherwise,
    the string is taken for source data and returned as is.
    """
    if not is_filename(text_or_file):
        return text_or_file
    try:
        with open(text_or_file, encoding="utf-8") as source_file:
            return source_file.read()
    except FileNotFoundError:
        looks_like_a_path = re.fullmatch(r'[\w/:. \\]+', text_or_file)
        if not looks_like_a_path:
            return text_or_file
        raise FileNotFoundError('Not a valid file: ' + text_or_file + '!\n(Add "\\n" '
                                'to distinguish source data from a file name.)')


349
def is_python_code(text_or_file) -> bool:
    """Checks whether 'text_or_file' is python code or the name of a file that
    contains python code.

    A file name is judged by its extension ('.py', case-insensitive)
    only; the file is not opened. Any other string counts as python code
    if it can be compiled without raising a SyntaxError, ValueError or
    OverflowError.
    """
    if is_filename(text_or_file):
        extension = text_or_file[-3:]
        return extension.lower() == '.py'
    try:
        compile(text_or_file, '<string>', 'exec')
    except (SyntaxError, ValueError, OverflowError):
        return False
    return True


def md5(*txt):
    """Returns the md5-checksum for `txt`. This can be used to test if
    some piece of text, for example a grammar source file, has changed.

    All strings passed are utf-8 encoded and hashed one after another,
    so the result is the checksum of their concatenation.
    """
    encoded = b''.join(t.encode('utf8') for t in txt)
    return hashlib.md5(encoded).hexdigest()


Eckhart Arnold's avatar
Eckhart Arnold committed
373
374
# def smart_list(arg: Union[str, Iterable[T]]) -> Union[Sequence[str], Sequence[T]]:
def smart_list(arg: Union[str, Iterable, Any]) -> Sequence:
    """Returns the argument as list, depending on its type and content.

    If the argument is a string, it will be interpreted as a list of
    separated values, trying ';', ',', ' ' as possible delimiters
    in this order, e.g.
    >>> smart_list('1; 2, 3; 4')
    ['1', '2, 3', '4']
    >>> smart_list('2, 3')
    ['2', '3']
    >>> smart_list('a b cd')
    ['a', 'b', 'cd']

    If the argument is a sequence other than a string, it will be
    returned as is, e.g.
    >>> smart_list((1, 2, 3))
    (1, 2, 3)
    >>> smart_list([1, 2, 3])
    [1, 2, 3]

    If the argument is another iterable than a sequence (e.g. a set or
    a generator), it will be converted into a list, e.g.
    >>> smart_list({7})
    [7]
    >>> smart_list(i for i in (1, 2, 3))
    [1, 2, 3]

    Finally, if none of the above is true, the argument will be
    wrapped in a list and returned, e.g.
    >>> smart_list(125)
    [125]
    """
    if isinstance(arg, str):
        for delimiter in (';', ','):
            if delimiter in arg:
                return [part.strip() for part in arg.split(delimiter)]
        return [part.strip() for part in arg.strip().split(' ')]
    if isinstance(arg, Sequence):
        return arg
    if isinstance(arg, Iterable):
        return list(arg)
    return [arg]


418
419
420
421
422
def expand_table(compact_table):
    """Expands a table by separating keywords that are tuples or strings
    containing comma separated words into single keyword entries with
    the same values. Returns the expanded table.
    Example:
    >>> expand_table({"a, b": 1, ('d','e','f'):5, "c":3})
    {'a': 1, 'b': 1, 'd': 5, 'e': 5, 'f': 5, 'c': 3}

    Args:
        compact_table: a dictionary, the keys of which are split with
            ``smart_list()``
    Returns:
        a new dictionary with one entry for every single keyword
    Raises:
        KeyError: if the same single keyword results from more than
            one entry or occurs twice within the same entry
    """
    expanded_table = {}
    for key, value in compact_table.items():
        for k in smart_list(key):
            if k in expanded_table:
                # wrap the key in a 1-tuple: a tuple key would otherwise be
                # unpacked by the %-operator and raise a TypeError
                raise KeyError('Key "%s" used more than once in compact table!' % (key,))
            expanded_table[k] = value
    return expanded_table
435
436


437
def sane_parser_name(name) -> bool:
    """Checks whether given name is an acceptable parser name. Parser names
    must not be preceded or succeeded by a double underscore '__'!

    Returns:
        True, if ``name`` is not empty and neither starts nor ends with
        a double underscore; False otherwise.
    """
    # bool() makes sure that a real boolean value is returned and not
    # the (falsy) name itself, if the name is empty or None
    return bool(name) and name[:2] != '__' and name[-2:] != '__'


444
445
446
447
def compile_python_object(python_src, catch_obj_regex=""):
    """Compiles the python source code and returns the (first) object
    the name of which is matched by ``catch_obj_regex``. If catch_obj
    is the empty string, the namespace dictionary will be returned.

    Args:
        python_src: the python source code that will be compiled and
            executed
        catch_obj_regex: a regular expression, either as string or as
            compiled pattern object, that is matched against the
            beginning of the names defined by the source code
    Returns:
        the object with the matching name or, if ``catch_obj_regex`` is
        empty, the namespace dictionary in which the code was executed
    Raises:
        ValueError: if no name or more than one name is matched
    """
    if isinstance(catch_obj_regex, str):
        pattern = re.compile(catch_obj_regex)
    else:
        pattern = catch_obj_regex
    code = compile(python_src, '<string>', 'exec')
    namespace = {}
    # SECURITY: exec() runs arbitrary code. Never pass source code from
    # untrusted origins to this function!
    exec(code, namespace)
    # A compiled pattern object is always truthy. Therefore, the pattern
    # string must be examined to find out whether the regex is empty.
    if not pattern or not pattern.pattern:
        return namespace
    matches = [key for key in namespace if pattern.match(key)]
    if not matches:
        raise ValueError("No object matching /%s/ defined in source code." %
                         pattern.pattern)
    if len(matches) > 1:
        raise ValueError("Ambiguous matches for %s : %s" %
                         (str(pattern), str(matches)))
    return namespace[matches[0]]
465
466


467
468
469
470
def identity(anything: Any) -> Any:
    """Returns the argument unchanged."""
    unchanged = anything
    return unchanged


471
472
473
474
475
476
477
478
# Module-level side effect, executed when this module is imported: if the
# encoding reported by `sys.stdout` is not UTF-8, `sys.stdout` is replaced
# by a UTF-8 writer that wraps the detached underlying stream.
try:
    if sys.stdout.encoding.upper() != "UTF-8":
        # make sure that `print()` does not raise an error on
        # non-ASCII characters:
        sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
except AttributeError:
    # somebody has already taken care of this !?
    # (an AttributeError is raised, for example, if `sys.stdout.encoding`
    # is None or if `sys.stdout` has been replaced by an object that does
    # not offer `encoding` or `detach()`)
    pass