Source code for preprocess

# preprocess.py - preprocessing of source files for DHParser
#
# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
#                 Bavarian Academy of Sciences and Humanities (badw.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.  See the License for the specific language governing
# permissions and limitations under the License.


"""
Module ``preprocess`` contains functions for preprocessing source
code before the parsing stage as well as source mapping facilities
to map the locations of parser and compiler errors to the
non-preprocessed source text.

Preprocessing (and source mapping of errors) will only be needed
for some domain specific languages, most notably those that
cannot be described entirely with context-free grammars.
"""

from __future__ import annotations

import functools
import os
from typing import Union, Optional, Callable, Tuple, List, Any, NamedTuple

from DHParser.error import Error, SourceMap, SourceLocation, SourceMapFunc, \
    add_source_locations, gen_neutral_srcmap_func, source_map, apply_src_mappings
from DHParser.stringview import StringView
from DHParser.toolkit import re, TypeAlias, LazyRE


__all__ = ('RX_TOKEN_NAME',
           'BEGIN_TOKEN',
           'TOKEN_DELIMITER',
           'END_TOKEN',
           'IncludeInfo',
           'FindIncludeFunc',
           'IncludeReaderFunc',
           'PreprocessorFunc',
           'PreprocessorFactory',
           'PreprocessorResult',
           'Tokenizer',
           'make_token',
           'strip_tokens',
           'nil_preprocessor',
           'chain_preprocessors',
           'prettyprint_tokenized',
           'tokenized_to_original_mapping',
           'make_preprocessor',
           'gen_find_include_func',
           'ReadIncludeClass',
           'ReadIncludeOnce',
           'preprocess_includes')


#######################################################################
#
# Types and constants
#
#######################################################################

# Control characters that frame a preprocessor token inside the source text.
# A token is laid out as: BEGIN_TOKEN name TOKEN_DELIMITER argument END_TOKEN
# (see make_token()).
BEGIN_TOKEN = '\x1b'        # ASCII ESC: opens a token
TOKEN_DELIMITER = '\x1c'    # ASCII FS: separates the token name from its argument
END_TOKEN = '\x1d'          # ASCII GS: closes a token
# All three marker characters in one string.
RESERVED_TOKEN_CHARS = BEGIN_TOKEN + TOKEN_DELIMITER + END_TOKEN

# One or more word characters: the shape of a token name.
RX_TOKEN_NAME = re.compile(r'\w+')
# Any (possibly empty) run of characters that are not token markers.
RX_TOKEN_ARGUMENT = re.compile(r'[^\x1b\x1c\x1d]*')
# A complete token with the named groups "name" and "argument".
RX_TOKEN = LazyRE(r'\x1b(?P<name>\w+)\x1c(?P<argument>[^\x1b\x1c\x1d]*)\x1d')



[docs]
class IncludeInfo(NamedTuple):
    """Location and target of a single include-statement in a source text.

    The find-functions generated by :py:func:`gen_find_include_func`
    return ``IncludeInfo(-1, 0, '')`` if no include-statement was found.
    """
    begin: int       # position where the include-statement starts; -1 if none was found
    length: int      # length of the include-statement in characters
    file_name: str   # file name derived from the name given in the include-statement
    __module__ = __name__  # required for cython/pickle compatibility



def has_includes(sm: SourceMap) -> bool:
    """Returns True, if the source map ``sm`` lists at least one file name
    that differs from the name of the original source, False otherwise.
    """
    for fname in sm.file_names:
        if fname != sm.original_name:
            return True
    return False



[docs]
class PreprocessorResult(NamedTuple):
    """The result of a preprocessing stage: (original document, processed
    document, source mapping, (possibly empty) list of errors).
    """
    original_text: Union[str, StringView]       # the text as it was passed to the preprocessor
    preprocessed_text: Union[str, StringView]   # the text after preprocessing
    back_mapping: SourceMapFunc                 # maps a position in the preprocessed text to a source location
    errors: List[Error]                         # errors that occurred during preprocessing
    __module__ = __name__  # required for cython/pickle compatibility



# Finds the next include-statement in a document, searching from a start position.
FindIncludeFunc: TypeAlias = Union[Callable[[str, int], IncludeInfo],   # (document: str,  start: int)
                                   functools.partial]
# Takes the (file) name of an include and returns the text to be included.
IncludeReaderFunc = Callable[[str], str]
# Argument-less factory for include-readers, e.g. the class ReadIncludeClass.
IncludeReaderFactory = Callable[[], IncludeReaderFunc]
# Turns the name captured from an include-statement into a file name.
DeriveFileNameFunc: TypeAlias = Union[Callable[[str], str], functools.partial]  # include name -> file name
# A preprocessor takes the source text and the source name and returns a PreprocessorResult.
PreprocessorFunc: TypeAlias = Union[Callable[[str, str], PreprocessorResult],  # text: str, filename: str
                                    functools.partial]
# Argument-less factory for preprocessor functions.
PreprocessorFactory: TypeAlias = Callable[[], PreprocessorFunc]
# A tokenizer takes a source text and returns the tokenized text plus a list of errors.
Tokenizer: TypeAlias = Union[Callable[[str], Tuple[str, List[Error]]],
                             functools.partial]


#######################################################################
#
# Chaining of preprocessors
#
#######################################################################



[docs]
def nil_preprocessor(original_text: str, original_name: str) -> PreprocessorResult:
    """
    The identity preprocessor: The "preprocessed" text is the original
    text itself, every position is mapped onto the same position in the
    original source and the list of errors is empty.
    """
    def same_position(i):
        return SourceLocation(original_name, original_text, i)

    return PreprocessorResult(original_text, original_text, same_position, [])



def _apply_preprocessors(original_text: str, original_name: str,
                         preprocessors: Tuple[PreprocessorFunc, ...]) \
        -> PreprocessorResult:
    """
    Runs the given preprocessors one after the other, each one receiving
    the text produced by its predecessor, and returns the final text
    together with a function that maps positions in the final text back
    onto the corresponding positions in the original source text.

    The errors reported by the individual preprocessors are collected
    in the order in which they occurred.
    """
    current_text = original_text
    mappings = []      # the source mappings of the stages, in order of application
    all_errors = []
    for preprocessor in preprocessors:
        stage_result = preprocessor(current_text, original_name)
        _, current_text, stage_mapping, stage_errors = stage_result
        if stage_errors:
            # The errors of this stage are located via the mappings of all
            # stages that have been applied before it (last stage first),
            # or via a neutral mapping if this is the very first stage.
            if mappings:
                back_chain = mappings[::-1]
            else:
                back_chain = [gen_neutral_srcmap_func(original_text, original_name)]
            locate = functools.partial(apply_src_mappings, mappings=back_chain)
            add_source_locations(stage_errors, locate)
        mappings.append(stage_mapping)
        all_errors.extend(stage_errors)
    full_back_mapping = functools.partial(apply_src_mappings, mappings=mappings[::-1])
    return PreprocessorResult(original_text, current_text, full_back_mapping, all_errors)



[docs]
def chain_preprocessors(*preprocessors) -> PreprocessorFunc:
    """
    Combines several preprocessor functions into one preprocessor function
    that applies them in the given order.

    :raises ValueError: if ``preprocess_includes`` itself is passed at
        any place other than the first.
    """
    for later_stage in preprocessors[1:]:
        if later_stage is preprocess_includes:
            raise ValueError("The preprocessor for include files must be applied first, "
                             "and there can be no more than one preprocessor for includes.")
    return functools.partial(_apply_preprocessors, preprocessors=preprocessors)



#######################################################################
#
# Tokenization support
#
# In DHParser the source text is usually not tokenized, but,
# optionally, it can be enriched by tokens (or parts of it replaced
# by tokens) to, say, indicate beginnings and endings of indented
# or quoted blocks that are difficult to capture with an EBNF-parser.
#
######################################################################



[docs]
def make_token(token: str, argument: str = '') -> str:
    """
    Turns the ``token`` and ``argument`` into a special token that
    will be caught by the ``PreprocessorToken``-parser.

    This function is a support function that should be used by
    preprocessors to inject preprocessor tokens into the source text.

    :param token: the name of the token; must consist of one or more
        word characters only
    :param argument: the (possibly empty) argument of the token; must not
        contain any of the token marker characters
    :returns: the string BEGIN_TOKEN + token + TOKEN_DELIMITER + argument
        + END_TOKEN
    :raises AssertionError: if ``token`` or ``argument`` is malformed
    """
    # fullmatch() rather than match(): match() only anchors at the start,
    # so it accepts names with trailing garbage, and - because the
    # argument pattern also matches the empty string - any argument at all.
    assert RX_TOKEN_NAME.fullmatch(token), \
        'Token name %r must consist of word characters only!' % token
    assert RX_TOKEN_ARGUMENT.fullmatch(argument), \
        'Token argument %r must not contain token marker characters!' % argument

    return BEGIN_TOKEN + token + TOKEN_DELIMITER + argument + END_TOKEN




[docs]
def prettyprint_tokenized(tokenized: str) -> str:
    """Returns a human-readable rendering of a tokenized document, in which
    the token marker characters are replaced by ``<``, ``|`` and ``>``."""
    for marker, symbol in (('\x1b', '<'), ('\x1c', '|'), ('\x1d', '>')):
        tokenized = tokenized.replace(marker, symbol)
    return tokenized




[docs]
def strip_tokens(tokenized: str) -> str:
    """Returns the text with every token replaced by the token's argument.
    Text outside of tokens is kept as it is."""
    pieces = []
    cursor = 0
    while True:
        token = RX_TOKEN.search(tokenized, cursor)
        if not token:
            break
        pieces.append(tokenized[cursor:token.start()])   # text in front of the token
        pieces.append(token.group('argument'))
        cursor = token.end()
    pieces.append(tokenized[cursor:])   # text after the last token
    return ''.join(pieces)



#######################################################################
#
# Source Maps - mapping source code positions between different
#               transformations of the source text
#
#######################################################################



[docs]
def tokenized_to_original_mapping(tokenized_text: str,
                                  original_text: str,
                                  original_name: str = 'UNKNOWN_FILE') -> SourceMap:
    """
    Generates a source map for mapping positions in a text that has
    been enriched with token markers to their original positions.

    For every token two entries are added to the source map: one at the
    first character of the token's argument and one right after the end
    of the token. The offsets decrease by the number of marker and
    token-name characters passed so far.

    :param tokenized_text:  the source text enriched with token markers
    :param original_text: the original source text
    :param original_name:  the name or path or uri of the original source file
    :returns:  a source map, i.e. a list of positions and a list of corresponding
        offsets. The list of positions is ordered from smallest to highest.
        An offset is valid for its associated position and all following
        positions until (and excluding) the next position in the list of
        positions.
    :raises AssertionError: if a token is not properly delimited or if
        one of the post-conditions is violated
    """
    positions, offsets = [0], [0]
    o = 0   # running offset; presumably added to a tokenized position to yield the original position
    i = tokenized_text.find(BEGIN_TOKEN)   # start of the next token or -1
    e = -2  # with this initial value the check after the loop holds if there are no tokens at all
    while i >= 0:
        d = tokenized_text.find(TOKEN_DELIMITER, i)
        e = tokenized_text.find(END_TOKEN, i)
        assert 0 <= d < e
        # d - i + 1 characters for BEGIN_TOKEN, token name and TOKEN_DELIMITER,
        # plus one character for the END_TOKEN
        o -= (d - i + 2)
        # inside the argument (from d + 1 on) the END_TOKEN has not yet been
        # passed, therefore o + 1; after the token (from e + 1 on) o applies
        positions.extend([d + 1, e + 1])
        offsets.extend([o + 1, o])
        i = tokenized_text.find(BEGIN_TOKEN, e + 1)
    if e + 1 < len(tokenized_text):
        # text follows the last token (or there is no token at all): add a
        # closing entry one past the length of the text, repeating the last offset
        positions.append(len(tokenized_text) + 1)
        offsets.append(offsets[-1])

    # post conditions
    assert len(positions) == len(offsets), '\n' + str(positions) + '\n' + str(offsets)
    assert positions[0] == 0
    assert all(positions[i] < positions[i + 1] for i in range(len(positions) - 1))

    # specific condition for preprocessor tokens
    # (the last pair of offsets is exempt from the check of strict decrease)
    assert all(offsets[i] > offsets[i + 1] for i in range(len(offsets) - 2))

    L = len(positions)
    # all entries refer to one and the same source file
    return SourceMap(
        original_name, positions, offsets, [original_name] * L, {original_name: original_text})




[docs]
def make_preprocessor(tokenizer: Tokenizer) -> PreprocessorFunc:
    """Wraps a "naive" tokenizer, i.e. a function that merely adds
    preprocessor tokens to a source text and returns the modified source
    (plus a list of errors), into a full preprocessor function that also
    provides the mapping from the tokenized text back to the original text.
    """
    def preprocessor(original_text: str, original_name: str, *args) \
            -> PreprocessorResult:
        tokenized_text, errors = tokenizer(original_text)
        back_mapping = functools.partial(
            source_map,
            srcmap=tokenized_to_original_mapping(tokenized_text, original_text, original_name))
        return PreprocessorResult(original_text, tokenized_text, back_mapping, errors)

    return preprocessor



#######################################################################
#
# Includes - support for chaining source texts via an include command
#
#######################################################################



[docs]
def gen_find_include_func(rx: Union[str, Any],
                          comment_rx: Optional[Union[str, Any]] = None,
                          derive_file_name: DeriveFileNameFunc = lambda name: name) \
                          -> FindIncludeFunc:
    """Generates a function to find include-statements in a file.

    :param rx: A regular expression (either as string or compiled
        regular expression) that matches an include-statement. It must
        contain a group called "name" which captures the name of the
        include.
    :param comment_rx: An optional regular expression (string or compiled)
        that matches comments. If given, include-statements that begin
        or end inside a comment are skipped.
    :param derive_file_name: A function that turns the captured include
        name into a file name. By default, the name is used unchanged.
    :returns: A function that takes a text and a start position and
        returns an ``IncludeInfo`` for the next include-statement, or
        ``IncludeInfo(-1, 0, '')`` if there is none.
    """
    if isinstance(rx, str):
        rx = re.compile(rx)
    if isinstance(comment_rx, str):
        comment_rx = re.compile(comment_rx)

    def find_include(text: str, begin: int) -> IncludeInfo:
        found = rx.search(text, begin)
        if not found:
            return IncludeInfo(-1, 0, '')
        start = found.start()
        file_name = derive_file_name(found.group('name'))
        return IncludeInfo(start, found.end() - start, file_name)

    def find_comment(text: str, begin: int) -> Tuple[int, int]:
        found = comment_rx.search(text, begin)
        if found:
            return found.span()
        return (-1, -2)   # "no comment": fails every a < b comparison

    def meta_find_include(text: str, begin: int) -> IncludeInfo:
        def skip_comments_before(pos, c_start, c_end):
            # move on to the next comment as long as the current comment
            # ends at or before the given position
            while c_start < c_end <= pos:
                c_start, c_end = find_comment(text, c_end)
            return c_start, c_end

        c_start, c_end = find_comment(text, begin)
        info = find_include(text, begin)
        pos, length, _ = info
        c_start, c_end = skip_comments_before(pos, c_start, c_end)
        # as long as the include-statement begins or ends within a comment,
        # continue the search for an include after that comment
        while (c_start < pos < c_end) or (c_start < pos + length < c_end):
            info = find_include(text, c_end)
            pos, length, _ = info
            c_start, c_end = skip_comments_before(pos, c_start, c_end)
        return info

    if comment_rx is None:
        return find_include
    return meta_find_include



class ReadIncludeClass:
    """Callable that reads the complete text of an include-file (utf-8)."""

    def read_include(self, include_name: str) -> str:
        """Returns the content of the file ``include_name`` as string."""
        with open(include_name, mode='r', encoding='utf-8') as include_file:
            content = include_file.read()
        return content

    def __call__(self, include_name: str) -> str:
        return self.read_include(include_name)


class ReadIncludeOnce(ReadIncludeClass):
    """Include-reader that yields the content of a file only the first
    time it is called with that file's name. Every further call with the
    same name yields the empty string.
    """

    def __init__(self):
        self.already_included = set()

    def __call__(self, include_name: str) -> str:
        seen = self.already_included
        if include_name not in seen:
            seen.add(include_name)
            return self.read_include(include_name)
        return ""     # don't include the same file twice


def generate_include_map(original_name: str,
                         original_text: str,
                         find_next_include: FindIncludeFunc,
                         include_reader: IncludeReaderFactory = ReadIncludeClass) \
                         -> Tuple[SourceMap, str]:
    """Recursively replaces the include-statements of a source text by
    the texts of the included files and builds the source map for the
    resulting text.

    :param original_name: the name of the main source file. The names of
        included files are joined with the directory part of the name of
        the file that includes them.
    :param original_text: the text of the main source file
    :param find_next_include: function that is called with a text and a
        start position and returns the next include-statement as
        ``IncludeInfo``; a negative ``begin`` signals that there are no
        further include-statements
    :param include_reader: factory that is called once for every text
        that is processed; the function it returns is used to read the
        files included by that text
    :returns: a tuple (source map, text with all includes expanded)
    :raises ValueError: if a file includes itself, directly or indirectly
    """
    # names of the files that are being expanded at the moment, i.e. the
    # current chain of nested includes; used to detect circular includes
    file_names: set = set()

    def generate_map(source_name, source_text, find_next) -> Tuple[SourceMap, str]:
        nonlocal file_names
        map = SourceMap(source_name, [0], [0], [source_name], {source_name: source_text})
        result = []   # pieces of the expanded text

        if source_name in file_names:
            raise ValueError(f'Circular include of {source_name} detected!')
        file_names.add(source_name)

        dirname = os.path.dirname(source_name)
        original_pointer = 0   # read position in source_text
        original_offset = 0    # sum of (length of include-statement - length of included text) so far
        result_pointer = 0     # write position in the expanded text
        last_begin = -1
        begin, length, include_name = find_next(source_text, 0)
        include_name = os.path.join(dirname, include_name)
        read_include = include_reader()
        while begin >= 0:
            assert begin > last_begin
            # advance both pointers over the text in front of the include-statement
            source_delta = begin - original_pointer
            original_pointer += source_delta
            result_pointer += source_delta
            included_text = read_include(include_name)
            inner_map, inner_text = generate_map(include_name, included_text, find_next)
            assert len(inner_map.positions) == len(inner_map.offsets) == len(inner_map.file_names)
            # shift the map of the included text to the place where that
            # text is inserted into the expanded text
            for i in range(len(inner_map.positions)):
                inner_map.positions[i] += result_pointer
                inner_map.offsets[i] -= result_pointer
            if source_delta == 0:
                # no text in front of the include-statement: the last entry of
                # the outer map is dropped in favor of the entries added below
                map.file_names.pop()
                map.positions.pop()
                map.offsets.pop()
            else:
                result.append(source_text[original_pointer - source_delta: original_pointer])
            # take over the entries of the inner map except for its last one
            map.file_names.extend(inner_map.file_names[:-1])
            map.positions.extend(inner_map.positions[:-1])
            map.offsets.extend(inner_map.offsets[:-1])
            map.originals_dict.update(inner_map.originals_dict)
            result.append(inner_text)
            inner_length = len(inner_text)
            result_pointer += inner_length
            # entry for the text of the including file that follows the included text
            map.file_names.append(source_name)
            map.positions.append(result_pointer)
            original_pointer += length   # skip the include-statement itself
            original_offset += length - inner_length
            map.offsets.append(original_offset)
            begin, length, include_name = find_next(source_text, original_pointer)
            include_name = os.path.join(dirname, include_name)
        rest = source_text[original_pointer:]
        if rest:
            # text after the last include-statement, plus a closing map entry
            result.append(rest)
            map.positions.append(map.positions[-1] + len(rest))
            map.offsets.append(original_offset)
            map.file_names.append(source_name)
        # this file is finished; it may be included again from elsewhere
        file_names.remove(source_name)
        # map.file_offsets = [-offset for offset in map.offsets]  # only for debugging!
        return map, ''.join(result)

    return generate_map(original_name, original_text, find_next_include)


def srcmap_includes(position: int, inclmap: SourceMap) -> SourceLocation:
    """Maps a position in a text with expanded includes onto the
    corresponding source location (file name, original text of that file,
    position in that text).

    :raises ValueError: if ``position`` lies before the first position
        of the include map (or if the map is empty).
    """
    import bisect
    entry = bisect.bisect_right(inclmap.positions, position) - 1
    if entry < 0:
        raise ValueError
    source_name = inclmap.file_names[entry]
    source_text = inclmap.originals_dict[source_name]
    return SourceLocation(source_name, source_text, position + inclmap.offsets[entry])



[docs]
def preprocess_includes(original_text: Optional[str],
                        original_name: str,
                        find_next_include: FindIncludeFunc,
                        include_reader: IncludeReaderFactory=ReadIncludeClass) \
                        -> PreprocessorResult:
    """Preprocesses include statements in a file.

    If ``original_text`` is ``None``, the text is read from the file
    ``original_name``. An empty ``original_text`` is likewise read from
    that file if the file exists. If no such file exists, the empty
    text is processed as it is (yielding an empty document) instead of
    failing on the attempt to open a non-existent file.

    :param original_text: The original source file (if already read from disk)
    :param original_name: The file-name of the original source
    :param find_next_include: The function to find the next include-statement
    :param include_reader: A factory that returns a function that retrieves the
        content from an included file.
    :return: the result of the preprocessing: (original document,
        processed document, source mapping, (possibly empty) list of errors)
    :raises OSError: if ``original_text`` is ``None`` and the file
        ``original_name`` cannot be read
    :raises ValueError: if a circular include is detected
    """
    # An empty in-memory document is not the same as a missing one. It only
    # falls back to the file on disk if that file actually exists.
    read_from_disk = original_text is None \
        or (not original_text and os.path.isfile(original_name))
    if read_from_disk:
        with open(original_name, 'r', encoding='utf-8') as f:
            original_text = f.read()
    include_map, result = generate_include_map(
        original_name, original_text, find_next_include, include_reader)
    mapping_func = functools.partial(srcmap_includes, inclmap=include_map)
    return PreprocessorResult(original_text, result, mapping_func, [])