Source code for stringview

# stringview.py - a string class where slices are views not copies as
#                    with the standard Python strings.
#
# stringview.pxd - declarations for the cython Python to C compiler
#                    to speed up handling of StringViews.
#
# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
#                 Bavarian Academy of Sciences an Humanities (badw.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.  See the License for the specific language governing
# permissions and limitations under the License.

"""
StringView provides string-slicing without copying.
Slicing Python-strings always yields copies of a segment of the original
string. See: https://mail.python.org/pipermail/python-dev/2008-May/079699.html
However, this becomes costly (in terms of space and as a consequence also
time) when parsing longer documents. Unfortunately, Python's `memoryview`
does not work for unicode strings. Hence, the StringView class.

It is recommended to compile this module with the Cython-compiler for
speedup. The module comes with a ``stringview.pxd`` that contains some type
declarations to more fully exploit the benefits of the Cython-compiler.
"""

from __future__ import annotations

from typing import Optional, Union, Iterable, Tuple, List, cast
try:
    from typing import TypeAlias
except ImportError:
    from DHParser.externallibs.typing_extensions import TypeAlias

try:
    import cython
    cython_optimized: bool = cython.compiled
except (NameError, ImportError):
    import DHParser.externallibs.shadow_cython as cython
    cython_optimized = False


__all__ = ('StringView', 'real_indices', 'EMPTY_STRING_VIEW', 'TextBuffer')


def first_char(text: str,  begin: cython.int, end: cython.int, chars: str) -> int:
    """Returns the index of the first non-whitespace character in string
     `text` within the bounds [begin, end].
    """
    while begin < end and text[begin] in chars:
        begin += 1
    return begin


def last_char(text: str, begin: cython.int, end: cython.int, chars: str) -> cython.int:
    """Returns the index of the last non-whitespace character in string
    `text` within the bounds [begin, end].
    """
    while end > begin and text[end - 1] in chars:
        end -= 1
    return end


def pack_index(index: cython.int, length: cython.int) -> cython.int:
    """Transforms `index` into a positive index counting from the beginning
    of the string, capping it at the boundaries [0, len].
    Examples:
    >>> pack_index(-1, 5)
    4
    >>> pack_index(6, 5)
    5
    >>> pack_index(-7, 5)
    0
    """
    # assert length >= 0
    index = index if index >= 0 else index + length
    return 0 if index < 0 else length if index > length else index


def fast_real_indices(begin: Optional[int],
                      end: Optional[int],
                      length: cython.int) -> Tuple[cython.int, cython.int]:
    """Returns the tuple of real (i.e. positive) indices from the slice
    indices `begin`,  `end`, assuming a string of size `length`.
    """
    cbegin: cython.int = 0 if begin is None else begin
    cend: cython.int = length if end is None else end
    return pack_index(cbegin, length), pack_index(cend, length)


[docs] def real_indices(begin: Optional[int], end: Optional[int], length: cython.int) -> Tuple[cython.int, cython.int]: """Python callable real-indices function for testing.""" return fast_real_indices(begin, end, length)
[docs] class StringView: # collections.abc.Sized """ A rudimentary StringView class, just enough for the use cases in parse.py. The difference between a StringView and the python builtin strings is that StringView-objects do slicing without copying, i.e. slices are just a view on a section of the sliced string. """ __slots__ = ['_text', '_begin', '_end', '_len', '_fullstring'] @cython.locals(_begin=cython.int, _end=cython.int, _len=cython.int) def __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None: # assert isinstance(text, str) self._text = text # type: str _begin, _end = fast_real_indices(begin, end, len(text)) _len = _end - _begin # type: int if _len < 0: _len = 0 self._begin = _begin # type: int self._end = _end # type: int self._len = _len # type: int self._fullstring = '' # type: str def __bool__(self) -> bool: return self._len != 0 # self._end > self._begin # and bool(self.text) def __len__(self) -> int: return self._len def __str__(self) -> str: """PERFORMANCE WARNING: This creates a copy of the string-slice!""" _fullstring = self._fullstring # type: str if _fullstring or self._len == 0: # optimization: avoid slicing/copying return _fullstring # since the slice is being copied now, anyway, the copy might # as well be stored in the string view # return self.text[self.begin:self.end] # use this for debugging! _fullstring = self._text[self._begin:self._end] self._fullstring = _fullstring return _fullstring def __repr__(self) -> str: """PERFORMANCE WARNING: This creates a copy of the string-slice!""" return repr(str(self)) @cython.locals(_len=cython.int) def __eq__(self, other) -> bool: """PERFORMANCE WARNING: This creates copies of the compared string-slices!""" # one string copy could be avoided by using find... # return len(other) == self._len and str(self) == str(other) _len = self._len if len(other) == _len: if _len == 0: return True if isinstance(other, StringView) \ and self._text is other._text and self._begin == other._begin: return True _fullstring = self._fullstring # type: str if _fullstring: return _fullstring == str(other) _fullstring = self._text[self._begin:self._end] self._fullstring = _fullstring return _fullstring == str(other) return False def __hash__(self) -> int: """PERFORMANCE WARNING: This creates a copy of the string-slice!""" return hash(str(self)) def __add__(self, other) -> Union[str, StringView]: if isinstance(other, str): return str(self) + other else: return StringView(str(self) + str(other)) def __radd__(self, other) -> Union[str, StringView]: if isinstance(other, str): return other + str(self) else: return StringView(str(other) + str(self)) @cython.locals(start=cython.int, stop=cython.int, _begin=cython.int, _index=cython.int) def __getitem__(self, index: Union[slice, int]) -> StringView: # assert isinstance(index, slice), "As of now, StringView only allows slicing." # assert index.step is None or index.step == 1, \ # "Step sizes other than 1 are not yet supported by StringView" try: start, stop = fast_real_indices(index.start, index.stop, self._len) _begin = self._begin return StringView(self._text, _begin + start, _begin + stop) except AttributeError: _index = index if _index >= self._len: raise IndexError("StringView index %i out of range 0 - %i" % (_index, self._len)) _begin = self._begin return StringView(self._text, self._begin + _index, self._begin + _index + 1) # return self._text[self._begin + index] # leads to type errors
[docs] def get_text(self) -> str: """Returns the underlying string.""" return self._text
[docs] @cython.locals(_start=cython.int, _end=cython.int) def count(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int: """Returns the number of non-overlapping occurrences of substring `sub` in StringView S[start:end]. Optional arguments start and end are interpreted as in slice notation. """ if self._fullstring: if cython_optimized: return self._fullstring.count(sub, start or 0, self._len if end is None else end) else: return self._fullstring.count(sub, start, end) elif start is None and end is None: return self._text.count(sub, self._begin, self._end) else: _start, _end = fast_real_indices(start, end, self._len) return self._text.count(sub, self._begin + _start, self._begin + _end)
[docs] @cython.locals(_start=cython.int, _end=cython.int, _begin=cython.int) def find(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int: """Returns the lowest index in S where substring `sub` is found, such that `sub` is contained within S[start:end]. Optional arguments `start` and `end` are interpreted as in slice notation. Returns -1 on failure. """ if self._fullstring: if cython_optimized: return self._fullstring.find(sub, start or 0, self._len if end is None else end) else: return self._fullstring.find(sub, start, end) elif start is None and end is None: _begin = self._begin # type: int return max(self._text.find(sub, _begin, self._end) - _begin, -1) else: _begin = self._begin # type: int _start, _end = fast_real_indices(start, end, self._len) return max(self._text.find(sub, self._begin + _start, _begin + _end) - _begin, -1)
[docs] @cython.locals(_start=cython.int, _end=cython.int, _begin=cython.int) def rfind(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int: """Returns the highest index in S where substring `sub` is found, such that `sub` is contained within S[start:end]. Optional arguments `start` and `end` are interpreted as in slice notation. Returns -1 on failure. """ if self._fullstring: if cython_optimized: return self._fullstring.rfind(sub, start or 0, self._len if end is None else end) else: return self._fullstring.rfind(sub, start, end) if start is None and end is None: _begin = self._begin # type: int return max(self._text.rfind(sub, _begin, self._end) - _begin, -1) else: _begin = self._begin # type: int _start, _end = fast_real_indices(start, end, self._len) return max(self._text.rfind(sub, _begin + _start, _begin + _end) - _begin, -1)
[docs] def startswith(self, prefix: str, start: cython.int = 0, end: Optional[int] = None) -> bool: """Return True if S starts with the specified prefix, False otherwise. With optional `start`, test S beginning at that position. With optional `end`, stop comparing S at that position. """ start += self._begin end = self._end if end is None else self._begin + end return self._text.startswith(prefix, start, end)
[docs] def endswith(self, suffix: str, start: cython.int = 0, end: Optional[int] = None) -> bool: """Return True if S ends with the specified suffix, False otherwise. With optional `start`, test S beginning at that position. With optional `end`, stop comparing S at that position. """ start += self._begin end = self._end if end is None else self._begin + end return self._text.endswith(suffix, start, end)
[docs] def match(self, regex, flags: cython.int = 0): """Executes `regex.match` on the StringView object and returns the result, which is either a match-object or None. Keep in mind that match.end(), match.span() etc. are mapped to the underlying text, not the StringView-object!!! """ return regex.match(self._text, pos=self._begin, endpos=self._end)
[docs] def index(self, absolute_index: cython.int) -> int: """Converts an index for a string watched by a StringView object to an index relative to the string view object, e.g.:: >>> import re >>> sv = StringView('xxIxx')[2:3] >>> match = sv.match(re.compile('I')) >>> match.end() 3 >>> sv.index(match.end()) 1 """ return absolute_index - self._begin
[docs] @cython.locals(_begin=cython.int) def indices(self, absolute_indices: Iterable[int]) -> Tuple[int, ...]: """Converts indices for a string watched by a StringView object to indices relative to the string view object. See also: `sv_index()` """ _begin = self._begin # type: int return tuple(index - _begin for index in absolute_indices)
[docs] @cython.locals(_begin=cython.int) def search(self, regex, start: Optional[int] = None, end: Optional[int] = None): """Executes regex.search on the StringView object and returns the result, which is either a match-object or None. Keep in mind that match.end(), match.span() etc. are mapped to the underlying text, not the StringView-object!!! """ _begin = self._begin # type: int start = _begin if start is None else max(_begin, min(_begin + start, self._end)) end = self._end if end is None else max(_begin, min(_begin + end, self._end)) return regex.search(self._text, start, end)
[docs] def finditer(self, regex): """Executes regex.finditer on the StringView object and returns the iterator of match objects. Keep in mind that match.end(), match.span() etc. are mapped to the underlying text, not the StringView-object!!! """ return regex.finditer(self._text, pos=self._begin, endpos=self._end)
[docs] @cython.locals(begin=cython.int, end=cython.int) def strip(self, chars: str = ' \n\r\t') -> StringView: """Returns a copy of the StringView `self` with leading and trailing whitespace removed. """ begin = first_char(self._text, self._begin, self._end, chars) - self._begin end = last_char(self._text, self._begin, self._end, chars) - self._begin return self if begin == 0 and end == self._len else self[begin:end]
[docs] @cython.locals(begin=cython.int) def lstrip(self, chars=' \n\t') -> StringView: """Returns a copy of `self` with leading whitespace removed.""" begin = first_char(self._text, self._begin, self._end, chars) - self._begin return self if begin == 0 else self[begin:]
[docs] @cython.locals(end=cython.int) def rstrip(self, chars=' \n\t') -> StringView: """Returns a copy of `self` with trailing whitespace removed.""" end = last_char(self._text, self._begin, self._end, chars) - self._begin return self if end == self._len else self[:end]
[docs] @cython.locals(length=cython.int, i=cython.int, k=cython.int) def split(self, sep=None): # -> List[Union[StringView, str]]: """Returns a list of the words in `self`, using `sep` as the delimiter string. If `sep` is not specified or is None, any whitespace string is a separator and empty strings are removed from the result. """ if self._fullstring: return self._fullstring.split(sep) else: pieces = [] # type: List[Union['StringView', str]] length = len(sep) k = 0 i = self.find(sep, k) # while i >= 0: # pieces.append(self._text[self._begin + k: self._begin + i]) # k = i + length # i = self.find(sep, k) # pieces.append(self._text[self._begin + k: self._end]) while i >= 0: pieces.append(self[k:i]) k = i + length i = self.find(sep, k) pieces.append(self[k:]) return pieces
[docs] def replace(self, old, new) -> str: """Returns a string where `old` is replaced by `new`.""" return str(self).replace(old, new)
EMPTY_STRING_VIEW = StringView('')
[docs] class TextBuffer: """TextBuffer class manages a copy of an edited text for a language server. The text can be changed via incremental edits. TextBuffer keeps track of the state of the complete text at any point in time. It works line oriented and lines of text can be retrieved via indexing or slicing. """ def __init__(self, text: Union[str, StringView], version: cython.int = 0): self._text = text # type: Union[str, StringView] self._buffer = [] # type: List[Union[str, StringView]] self.version = version if version >= 0 else 0 def _lazy_init(self): self._buffer = [line.strip('\r') for line in self._text.split('\n')] def __getitem__(self, index: Union[slice, int]) \ -> Union[List[Union[str, StringView]], str, StringView]: if not self._buffer: self._lazy_init() return self._buffer.__getitem__(index) def __str__(self) -> str: if self._text: return str(self._text) return str(self.snapshot('\n')) def __len__(self) -> int: if self._text: return len(self._text) else: return sum(len(line) for line in self._buffer) + len(self._buffer) - 1 @property def buffer(self) -> List[Union[str, StringView]]: if not self._buffer: self._lazy_init() return self._buffer def lines(self) -> int: if not self._buffer: self._lazy_init() return len(self._buffer)
[docs] def update(self, l1: cython.int, c1: cython.int, l2: cython.int, c2: cython.int, replacement: Union[str, StringView]): """Replaces the text-range from line and column (l1, c1) to line and column (l2, c2) with the replacement-string. """ if not self._buffer: self._lazy_init() lines = [line.strip('\r') for line in replacement.split('\n')] head = self._buffer[l1][:c1] tail = self._buffer[l2][c2:] lines[0] = head + lines[0] lines[-1] += tail self._buffer[l1:l2 + 1] = lines self._text = '' # invalidate single-string copy self.version += 1
[docs] def text_edits(self, edits: Union[list, dict], version: cython.int = -1): """Incorporates the one or more text-edits or change-events into the text. A Text-Edit is a dictionary of this form:: {"range": {"start": {"line": 0, "character": 0 }, "end": {"line": 0, "character": 0 } }, "newText": "..."} In case of a change-event, the key "newText" is replaced by "text". """ def edit(ed: dict): """Weaves a single edit into the text-buffer.""" rng = ed["range"] start = rng["start"] end = rng["end"] try: replacement = ed['text'] except KeyError: replacement = ed['newText'] self.update(start["line"], start["character"], end["line"], end["character"], replacement) if isinstance(edits, list): for ed in edits: edit(ed) else: edit(cast(dict, edits)) if version >= 0: self.version = version
[docs] def snapshot(self, eol: str = '\n') -> Union[str, StringView]: """Returns the current state of the entire text, using the given end of line marker (``\\n`` or ``\\r\\n``)""" if self._text: return self._text self._text = eol.join(str(line) for line in self._buffer) return self._text