Source code for preprocess

# preprocess.py - preprocessing of source files for DHParser
#
# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
#                 Bavarian Academy of Sciences an Humanities (badw.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.  See the License for the specific language governing
# permissions and limitations under the License.


"""
Module ``preprocess`` contains functions for preprocessing source
code before the parsing stage as well as source mapping facilities
to map the locations of parser and compiler errors to the
non-preprocessed source text. (See :py:class:`~error.SourceMap`)

Preprocessing (and source mapping of errors) are useful
in cases where a syntax or certain syntactical features (like marking
blocks with indentation for example), cannot be described completely
with context-free grammars.
"""

from __future__ import annotations

import functools
import os
from typing import Union, Optional, Callable, Tuple, List, Dict, Any, \
    NamedTuple

from DHParser.error import Error, add_source_locations
from DHParser.stringview import StringView
from DHParser.toolkit import re, TypeAlias, LazyRE


__all__ = ('RX_TOKEN_NAME',
           'BEGIN_TOKEN',
           'TOKEN_DELIMITER',
           'END_TOKEN',
           'IncludeInfo',
           'FindIncludeFunc',
           'IncludeReaderFunc',
           'DeriveFileNameFunc',
           'PreprocessorFunc',
           'PreprocessorFactory',
           'PreprocessorResult',
           'result_from_mapping',
           'Tokenizer',
           'make_token',
           'strip_tokens',
           'SourceLocation',
           'SourceMap',
           'SourceMapFunc',
           'gen_neutral_srcmap_func',
           'source_map',
           'apply_src_mappings',
           'nil_preprocessor',
           'nil_preprocessor_factory',
           'chain_preprocessors',
           'prettyprint_tokenized',
           'tokenized_to_original_mapping',
           'make_preprocessor',
           'gen_find_include_func',
           'ReadIncludeClass',
           'ReadIncludeOnce',
           'preprocess_includes')


#######################################################################
#
# Types and constants
#
#######################################################################

BEGIN_TOKEN = '\x1b'
TOKEN_DELIMITER = '\x1c'
END_TOKEN = '\x1d'
RESERVED_TOKEN_CHARS = BEGIN_TOKEN + TOKEN_DELIMITER + END_TOKEN

RX_TOKEN_NAME = LazyRE(r'\w+')
RX_TOKEN_ARGUMENT = LazyRE(r'[^\x1b\x1c\x1d]*')
RX_TOKEN = LazyRE(r'\x1b(?P<name>\w+)\x1c(?P<argument>[^\x1b\x1c\x1d]*)\x1d')


[docs]
class IncludeInfo(NamedTuple):
    begin: int
    length: int
    file_name: str
    __module__ = __name__  # required for cython/pickle compatibility


def has_includes(sm: SourceMap) -> bool:
    return any(fname != sm.original_name for fname in sm.file_names)


[docs]
class SourceLocation(NamedTuple):
    """A particular location in the original, not preprocessed source code.

    :ivar original_name: The original source filename. If the document is composed of
        a master file and (possibly nested) includes this will be the name of the file
        that the position is related to.
    :ivar original_text: The original, i.e. not yet preprocessed text-content of the
        file the position relates to.
    :ivar pos: The location within original_text.
    """
    original_name: str          # the file name (or path or uri) of the source code
    original_text: Union[str, StringView]  # the source code itself
    pos: int                    # a position within the code
    __module__ = __name__       # needed for cython compatibility


SourceMapFunc: TypeAlias = Union[Callable[[int], SourceLocation], functools.partial]


[docs]
class PreprocessorResult(NamedTuple):
    original_text: Union[str, StringView]
    preprocessed_text: Union[str, StringView]
    back_mapping: SourceMapFunc
    errors: List[Error]
    __module__ = __name__  # required for cython/pickle compatibility


def result_from_mapping(mapping: SourceMap,
              original_text: Union[str, StringView],
              processed_text: Union[str, StringView],
              errors: List[Error]) -> PreprocessorResult:
    mapping.validate()
    mapper = functools.partial(source_map, srcmap=mapping)
    return PreprocessorResult(original_text, processed_text, mapper, errors)

FindIncludeFunc: TypeAlias = Union[Callable[[str, int], IncludeInfo],   # (document: str,  start: int)
                                   functools.partial]
IncludeReaderFunc = Callable[[str], str]
IncludeReaderFactory = Callable[[], IncludeReaderFunc]
DeriveFileNameFunc: TypeAlias = Union[Callable[[str], str], functools.partial]  # include name -> file name
PreprocessorFunc: TypeAlias = Union[Callable[[str, str], PreprocessorResult],  # (text: str, filename: str) -> result
                                    functools.partial]
PreprocessorFactory: TypeAlias = Callable[[], PreprocessorFunc]
Tokenizer: TypeAlias = Union[Callable[[str], Tuple[str, List[Error]]],
                             functools.partial]


#######################################################################
#
# Chaining of preprocessors
#
#######################################################################



[docs]
def nil_preprocessor(original_text: str, original_name: str) -> PreprocessorResult:
    """
    A preprocessor that does nothing, i.e. just returns the input.
    """
    def neutral_back_mapping(pos: int) -> SourceLocation:
        return SourceLocation(original_name, original_text, pos)

    return PreprocessorResult(original_text, original_text, neutral_back_mapping, [])



def nil_preprocessor_factory() -> PreprocessorFunc:
    return nil_preprocessor


def _apply_preprocessors(original_text: str, original_name: str,
                         preprocessors: Tuple[PreprocessorFunc, ...]) \
        -> PreprocessorResult:
    """
    Applies several preprocessing functions sequentially to a source text
    and returns the preprocessed text as well as a function that maps
    text-positions in the processed text onto the corresponding position
    in the original source test.
    """
    processed = original_text
    mapping_chain = []
    error_list = []
    for prep in preprocessors:
        _, processed, mapping_func, errors = prep(processed, original_name)
        if errors:
            if mapping_chain:
                chain = mapping_chain.copy()
                chain.reverse()
            else:
                chain = [gen_neutral_srcmap_func(original_text, original_name)]
            add_source_locations(errors, functools.partial(apply_src_mappings, mappings=chain))
        mapping_chain.append(mapping_func)
        error_list.extend(errors)
    mapping_chain.reverse()
    return PreprocessorResult(
        original_text, processed,
        functools.partial(apply_src_mappings, mappings=mapping_chain),
        error_list)



[docs]
def chain_preprocessors(*preprocessors) -> PreprocessorFunc:
    """
    Merges a sequence of preprocessor functions in to a single function.
    """
    if any(prep is preprocess_includes for prep in preprocessors[1:]):
        raise ValueError("The preprocessor for include files must be applied first, "
                         "and there can be no more than one preprocessor for includes.")
    return functools.partial(_apply_preprocessors, preprocessors=preprocessors)



#######################################################################
#
# Tokenization support
#
# In DHParser the source text is usually not tokenized, but,
# optionally, it can be enriched by tokens (or parts of it replaced
# by tokens) to, say, indicate beginnings and endings of indented
# or quoted blocks that are difficult to capture with an EBNF-parser.
#
######################################################################



[docs]
def make_token(token: str, argument: str = '') -> str:
    """
    Turns the ``token`` and ``argument`` into a special token that
    will be caught by the ``PreprocessorToken``-parser.

    This function is a support function that should be used by
    preprocessors to inject preprocessor tokens into the source text.
    """
    assert RX_TOKEN_NAME.match(token)
    assert RX_TOKEN_ARGUMENT.match(argument)

    return BEGIN_TOKEN + token + TOKEN_DELIMITER + argument + END_TOKEN




[docs]
def prettyprint_tokenized(tokenized: str) -> str:
    """Returns a pretty-printable version of a document that contains tokens."""
    return tokenized.replace('\x1b', '<').replace('\x1c', '|').replace('\x1d', '>')




[docs]
def strip_tokens(tokenized: str) -> str:
    """Replaces all tokens with the token's arguments."""
    result = []
    pos = 0
    match = RX_TOKEN.search(tokenized, pos)
    while match:
        start, end = match.span()
        result.append(tokenized[pos:start])
        result.append(match.groupdict()['argument'])
        pos = end
        match = RX_TOKEN.search(tokenized, pos)
    result.append(tokenized[pos:])
    return ''.join(result)



#######################################################################
#
# Source Maps - mapping source code positions between different
#               transformations of the source text
#
#######################################################################


[docs]
class SourceMap(NamedTuple):
    """Class SourceMap captures a mapping from the preprocessed source code (that is
    possibly also stitched together from different files) to the original source files
    and source positions. It is possible to use more than one source map (see
    :py:func:`apply_src_mappings`). Thus, several preprocessing stages can be applied
    in sequence and the positions, say where errors occurred, can still be back-propagated
    to the original input file(s).

    :ivar original_name: The original source filename. If the source allows includes,
        this should be the name of the master file.
    :ivar positions: A list of locations in the processed file. Each location is to be
        understood as a marker from which on a different the position in the processed
        file must be shifted by a different offset to gain the position in the original
        file. The first element in the list of positions should always be 0 and
        contain as its last element the length of the processed source plus 1 (or higher).
        (+1 allows the location to exceed the end of the text by 1 which makes writing
         algorithms easier that if the location was not allowed to point beyond the end
         of the text.)
    :ivar offsets: The list of offsets corresponding to the positions. For each position
        entry positions[n], the corresponding offsets value offsets[n] contains
        the offset (positive or negative or zero) that will be added to all locations
        in the half-open interval [ positions[n], positions[n + 1] [
    :ivar file_names: A list of file names corresponding to the positions, i.e. for each
        position[n] the name of the file that the text from this position just until
        before the next position was taken from.
    :ivar originals_dict: A dictionary, mapping the file-names to their text-content in
        form of a :py:class:`~stringview.StringView`-object.
    """
    original_name: str          # nome or path or uri of the original source file
    positions: List[int]        # a list of locations
    offsets: List[int]          # the corresponding offsets to be added from these locations onward
    file_names: List[str]       # list of file_names to which the source locations relate
    originals_dict: Dict[str, Union[str, StringView]]  # File names => (included) source texts
    __module__ = __name__       # needed for cython compatibility

    def validate(self) -> SourceMap:   # actually returns Self
        if len(self.positions) != len(self.offsets) != len(self.file_names):
            raise ValueError("The length of the lists of positions, offsets and file names "
                             "must be equal.")
        if len(self.positions) < 2:
            raise ValueError("The length of the lists of positions, offsets and file names "
                             "must be at least 2.")
        if self.positions[0] != 0:
            raise ValueError("The first element of the list of positions must be 0.")
        if not all(self.positions[i] < self.positions[i + 1]
                   for i in range(len(self.positions) - 1)):
            raise ValueError("The list of positions must be strictly increasing.")
        return self




[docs]
def gen_neutral_srcmap_func(original_text: Union[StringView, str], original_name: str = '') -> SourceMapFunc:
    """Generates a source map function that maps positions to itself."""
    if not original_name:  original_name = 'UNKNOWN_FILE'
    # return lambda pos: SourceLocation(original_name, original_text, pos)
    return functools.partial(SourceLocation, original_name, original_text)




[docs]
def source_map(position: int, srcmap: SourceMap) -> SourceLocation:
    """
    Maps a position in a (pre-)processed text to its corresponding
    position in the original document according to the given source map.

    :param  position: the position in the processed text
    :param  srcmap:  the source map, i.e. a mapping of locations to offset values
        and source texts.
    :returns:  the mapped position
    """
    assert len(srcmap.positions) == len(srcmap.offsets) == len(srcmap.file_names)
    # assert set(srcmap.file_names) == set(srcmap.originals_dict.keys())
    import bisect
    i = bisect.bisect_right(srcmap.positions, position)
    if 0 < i < len(srcmap.positions):
        original_name = srcmap.file_names[i - 1]
        return SourceLocation(
            original_name,
            srcmap.originals_dict[original_name],
            min(position + srcmap.offsets[i - 1], srcmap.positions[i] + srcmap.offsets[i]))
    raise ValueError(f"Position {position} seems is out of range "
                     f"[{srcmap.positions[0]}, {srcmap.positions[-1]}[ "
                     f"or source map ist corrupted.")




[docs]
def apply_src_mappings(position: int, mappings: List[SourceMapFunc]) -> SourceLocation:
    """
    Sequentially apply a number of mapping functions to a source position.
    In the context of source mapping, the source position usually is a
    position within a preprocessed source text and mappings should therefore
    be a list of reverse-mappings in reversed order.
    """
    assert mappings
    filename, text = '', ''
    for mapping in mappings:
        filename, text, position = mapping(position)
    return SourceLocation(filename, text, position)




[docs]
def tokenized_to_original_mapping(tokenized_text: str,
                                  original_text: str,
                                  original_name: str = 'UNKNOWN_FILE') -> SourceMap:
    """
    Generates a source map for mapping positions in a text that has
    been enriched with token markers to their original positions.

    :param tokenized_text:  the source text enriched with token markers
    :param original_text: the original source text
    :param original_name:  the name or path or uri of the original source file
    :returns:  a source map, i.e. a list of positions and a list of corresponding
        offsets. The list of positions is ordered from smallest to highest.
        An offset is valid for its associated position and all following
        positions until (and excluding) the next position in the list of
        positions.
    """
    positions, offsets = [0], [0]
    o = 0
    i = tokenized_text.find(BEGIN_TOKEN)
    e = -2
    while i >= 0:
        d = tokenized_text.find(TOKEN_DELIMITER, i)
        e = tokenized_text.find(END_TOKEN, i)
        assert 0 <= d < e
        o -= (d - i + 2)
        positions.extend([d + 1, e + 1])
        offsets.extend([o + 1, o])
        i = tokenized_text.find(BEGIN_TOKEN, e + 1)
    if e + 1 < len(tokenized_text):
        positions.append(len(tokenized_text) + 1)
        offsets.append(offsets[-1])

    # specific condition for preprocessor tokens
    assert all(offsets[i] > offsets[i + 1] for i in range(len(offsets) - 2))

    L = len(positions)
    return SourceMap(
        original_name, positions, offsets, [original_name] * L, {original_name: original_text}).\
        validate()




[docs]
def make_preprocessor(tokenizer: Tokenizer) -> PreprocessorFunc:
    """Generates a preprocessor function from a "naive" tokenizer, i.e.
    a function that merely adds preprocessor tokens to a source text and
    returns the modified source.
    """
    def preprocessor(original_text: str, original_name: str, *args) \
            -> PreprocessorResult:
        tokenized_text, errors = tokenizer(original_text)
        srcmap = tokenized_to_original_mapping(tokenized_text, original_text, original_name)
        mapping = functools.partial(source_map, srcmap=srcmap)
        return PreprocessorResult(original_text, tokenized_text, mapping, errors)
    return preprocessor



#######################################################################
#
# Includes - support for chaining source texts via an in clude command
#
#######################################################################



[docs]
def gen_find_include_func(rx: Union[str, Any],
                          comment_rx: Optional[Union[str, Any]] = None,
                          derive_file_name: DeriveFileNameFunc = lambda name: name) \
                          -> FindIncludeFunc:
    """Generates a function to find include-statements in a file.

    :param rx: A regular expression (either as string or compiled
        regular expression) to catch the names of the includes in
        a document. The expression should catch
    :param comment_rx: The regular expression for comments. (This
        should always either be NEVER_MATCH_PATTERN or exactly the
        same as the comment-regular rexpression defined in the grammar!)
    """
    if isinstance(rx, str):  rx = re.compile(rx)
    if isinstance(comment_rx, str):  comment_rx = re.compile(comment_rx)

    def find_include(text: str, begin: int) -> IncludeInfo:
        nonlocal rx
        m = rx.search(text, begin)
        if m:
            begin = m.start()
            file_name = derive_file_name(m.group('name'))
            return IncludeInfo(begin, m.end() - begin, file_name)
        else:
            return IncludeInfo(-1, 0, '')

    def find_comment(text: str, begin: int) -> Tuple[int, int]:
        nonlocal rx
        m = comment_rx.search(text, begin)
        return m.span() if m else (-1, -2)

    def meta_find_include(text: str, begin: int) -> IncludeInfo:
        a, b = find_comment(text, begin)
        info = find_include(text, begin)
        k, length, name = info
        while a < b <= k:
            a, b = find_comment(text, b)
        while (a < k < b) or (a < k + length < b):
            info = find_include(text, b)
            k, length, name = info
            while a < b <= k:
                a, b = find_comment(text, b)
        return info

    return find_include if comment_rx is None else meta_find_include



class ReadIncludeClass:
    def __call__(self, include_name: str) -> str:
        return self.read_include(include_name)
    def read_include(self, include_name: str) -> str:
        with open(include_name, 'r', encoding='utf-8') as f:
            return f.read()


class ReadIncludeOnce(ReadIncludeClass):
    def __init__(self):
        self.already_included = set()

    def __call__(self, include_name: str) -> str:
        if include_name in self.already_included:
            return ""     # don't include the same file twice
        else:
            self.already_included.add(include_name)
            return self.read_include(include_name)


def generate_include_map(original_name: str,
                         original_text: str,
                         find_next_include: FindIncludeFunc,
                         include_reader: IncludeReaderFactory = ReadIncludeClass) \
                         -> Tuple[SourceMap, str]:
    file_names: set = set()

    def generate_map(source_name, source_text, find_next) -> Tuple[SourceMap, str]:
        nonlocal file_names
        map = SourceMap(source_name, [0], [0], [source_name], {source_name: source_text})
        result = []

        if source_name in file_names:
            raise ValueError(f'Circular include of {source_name} detected!')
        file_names.add(source_name)

        dirname = os.path.dirname(source_name)
        original_pointer = 0
        original_offset = 0
        result_pointer = 0
        last_begin = -1
        begin, length, include_name = find_next(source_text, 0)
        include_name = os.path.join(dirname, include_name)
        read_include = include_reader()
        while begin >= 0:
            assert begin > last_begin
            source_delta = begin - original_pointer
            original_pointer += source_delta
            result_pointer += source_delta
            included_text = read_include(include_name)
            inner_map, inner_text = generate_map(include_name, included_text, find_next)
            assert len(inner_map.positions) == len(inner_map.offsets) == len(inner_map.file_names)
            for i in range(len(inner_map.positions)):
                inner_map.positions[i] += result_pointer
                inner_map.offsets[i] -= result_pointer
            if source_delta == 0:
                map.file_names.pop()
                map.positions.pop()
                map.offsets.pop()
            else:
                result.append(source_text[original_pointer - source_delta: original_pointer])
            map.file_names.extend(inner_map.file_names[:-1])
            map.positions.extend(inner_map.positions[:-1])
            map.offsets.extend(inner_map.offsets[:-1])
            map.originals_dict.update(inner_map.originals_dict)
            result.append(inner_text)
            inner_length = len(inner_text)
            result_pointer += inner_length
            map.file_names.append(source_name)
            map.positions.append(result_pointer)
            original_pointer += length
            original_offset += length - inner_length
            map.offsets.append(original_offset)
            begin, length, include_name = find_next(source_text, original_pointer)
            include_name = os.path.join(dirname, include_name)
        rest = source_text[original_pointer:]
        if rest:
            result.append(rest)
            map.positions.append(map.positions[-1] + len(rest))
            map.offsets.append(original_offset)
            map.file_names.append(source_name)
        file_names.remove(source_name)
        # map.file_offsets = [-offset for offset in map.offsets]  # only for debugging!
        return map, ''.join(result)

    return generate_map(original_name, original_text, find_next_include)


def srcmap_includes(position: int, inclmap: SourceMap) -> SourceLocation:
    import bisect
    i = bisect.bisect_right(inclmap.positions, position)
    if i:
        source_name = inclmap.file_names[i - 1]
        return SourceLocation(
            source_name,
            inclmap.originals_dict[source_name],
            position + inclmap.offsets[i - 1])
    raise ValueError



[docs]
def preprocess_includes(original_text: Optional[str],
                        original_name: str,
                        find_next_include: FindIncludeFunc,
                        include_reader: IncludeReaderFactory=ReadIncludeClass) \
                        -> PreprocessorResult:
    """Preprocesses include statements in a file.

    :param original_text: The original source file (if already read from disk)
    :param original_name: The file-name of the original source
    :param find_next_include: The function to find the next include-statement
    :param include_reader: A factory that returns a function that retrieves the
        content from an included file.
    :return: the result of the preprocessing: (original document,
        processed document, source mapping, (possibly empty) list of errors)
    """
    if not original_text:
        with open(original_name, 'r', encoding='utf-8') as f:
            original_text = f.read()
    include_map, result = generate_include_map(
        original_name, original_text, find_next_include, include_reader)
    mapping_func = functools.partial(srcmap_includes, inclmap=include_map)
    return PreprocessorResult(original_text, result, mapping_func, [])