# stringview.py - a string class where slices are views not copies as
# with the standard Python strings.
#
# stringview.pxd - declarations for the cython Python to C compiler
# to speed up handling of StringViews.
#
# Copyright 2016 by Eckhart Arnold (arnold@badw.de)
# Bavarian Academy of Sciences an Humanities (badw.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.
"""
StringView provides string-slicing without copying.
Slicing Python-strings always yields copies of a segment of the original
string. See: https://mail.python.org/pipermail/python-dev/2008-May/079699.html
However, this becomes costly (in terms of space and as a consequence also
time) when parsing longer documents. Unfortunately, Python's `memoryview`
does not work for unicode strings. Hence, the StringView class.
It is recommended to compile this module with the Cython-compiler for
speedup. The module comes with a ``stringview.pxd`` that contains some type
declarations to more fully exploit the benefits of the Cython-compiler.
"""
from __future__ import annotations
from typing import Optional, Union, Iterable, Tuple, List, cast
try:
from typing import TypeAlias
except ImportError:
from DHParser.externallibs.typing_extensions import TypeAlias
try:
import cython
cython_optimized: bool = cython.compiled
except (NameError, ImportError):
import DHParser.externallibs.shadow_cython as cython
cython_optimized = False
__all__ = ('StringView', 'real_indices', 'EMPTY_STRING_VIEW', 'TextBuffer')
def first_char(text: str, begin: cython.int, end: cython.int, chars: str) -> int:
"""Returns the index of the first non-whitespace character in string
`text` within the bounds [begin, end].
"""
while begin < end and text[begin] in chars:
begin += 1
return begin
def last_char(text: str, begin: cython.int, end: cython.int, chars: str) -> cython.int:
"""Returns the index of the last non-whitespace character in string
`text` within the bounds [begin, end].
"""
while end > begin and text[end - 1] in chars:
end -= 1
return end
def pack_index(index: cython.int, length: cython.int) -> cython.int:
"""Transforms `index` into a positive index counting from the beginning
of the string, capping it at the boundaries [0, len].
Examples:
>>> pack_index(-1, 5)
4
>>> pack_index(6, 5)
5
>>> pack_index(-7, 5)
0
"""
# assert length >= 0
index = index if index >= 0 else index + length
return 0 if index < 0 else length if index > length else index
def fast_real_indices(begin: Optional[int],
end: Optional[int],
length: cython.int) -> Tuple[cython.int, cython.int]:
"""Returns the tuple of real (i.e. positive) indices from the slice
indices `begin`, `end`, assuming a string of size `length`.
"""
cbegin: cython.int = 0 if begin is None else begin
cend: cython.int = length if end is None else end
return pack_index(cbegin, length), pack_index(cend, length)
[docs]
def real_indices(begin: Optional[int],
end: Optional[int],
length: cython.int) -> Tuple[cython.int, cython.int]:
"""Python callable real-indices function for testing."""
return fast_real_indices(begin, end, length)
[docs]
class StringView: # collections.abc.Sized
"""
A rudimentary StringView class, just enough for the use cases
in parse.py. The difference between a StringView and the python
builtin strings is that StringView-objects do slicing without
copying, i.e. slices are just a view on a section of the sliced
string.
"""
__slots__ = ['_text', '_begin', '_end', '_len', '_fullstring']
@cython.locals(_begin=cython.int, _end=cython.int, _len=cython.int)
def __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None:
# assert isinstance(text, str)
self._text = text # type: str
_begin, _end = fast_real_indices(begin, end, len(text))
_len = _end - _begin # type: int
if _len < 0:
_len = 0
self._begin = _begin # type: int
self._end = _end # type: int
self._len = _len # type: int
self._fullstring = '' # type: str
def __bool__(self) -> bool:
return self._len != 0 # self._end > self._begin # and bool(self.text)
def __len__(self) -> int:
return self._len
def __str__(self) -> str:
"""PERFORMANCE WARNING: This creates a copy of the string-slice!"""
_fullstring = self._fullstring # type: str
if _fullstring or self._len == 0: # optimization: avoid slicing/copying
return _fullstring
# since the slice is being copied now, anyway, the copy might
# as well be stored in the string view
# return self.text[self.begin:self.end] # use this for debugging!
_fullstring = self._text[self._begin:self._end]
self._fullstring = _fullstring
return _fullstring
def __repr__(self) -> str:
"""PERFORMANCE WARNING: This creates a copy of the string-slice!"""
return repr(str(self))
@cython.locals(_len=cython.int)
def __eq__(self, other) -> bool:
"""PERFORMANCE WARNING: This creates copies of the compared string-slices!"""
# one string copy could be avoided by using find...
# return len(other) == self._len and str(self) == str(other)
_len = self._len
if len(other) == _len:
if _len == 0:
return True
if isinstance(other, StringView) \
and self._text is other._text and self._begin == other._begin:
return True
_fullstring = self._fullstring # type: str
if _fullstring:
return _fullstring == str(other)
_fullstring = self._text[self._begin:self._end]
self._fullstring = _fullstring
return _fullstring == str(other)
return False
def __hash__(self) -> int:
"""PERFORMANCE WARNING: This creates a copy of the string-slice!"""
return hash(str(self))
def __add__(self, other) -> Union[str, StringView]:
if isinstance(other, str):
return str(self) + other
else:
return StringView(str(self) + str(other))
def __radd__(self, other) -> Union[str, StringView]:
if isinstance(other, str):
return other + str(self)
else:
return StringView(str(other) + str(self))
@cython.locals(start=cython.int, stop=cython.int, _begin=cython.int, _index=cython.int)
def __getitem__(self, index: Union[slice, int]) -> StringView:
# assert isinstance(index, slice), "As of now, StringView only allows slicing."
# assert index.step is None or index.step == 1, \
# "Step sizes other than 1 are not yet supported by StringView"
try:
start, stop = fast_real_indices(index.start, index.stop, self._len)
_begin = self._begin
return StringView(self._text, _begin + start, _begin + stop)
except AttributeError:
_index = index
if _index >= self._len:
raise IndexError("StringView index %i out of range 0 - %i" % (_index, self._len))
_begin = self._begin
return StringView(self._text, self._begin + _index, self._begin + _index + 1)
# return self._text[self._begin + index] # leads to type errors
[docs]
def get_text(self) -> str:
"""Returns the underlying string."""
return self._text
[docs]
@cython.locals(_start=cython.int, _end=cython.int)
def count(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int:
"""Returns the number of non-overlapping occurrences of substring
`sub` in StringView S[start:end]. Optional arguments start and end
are interpreted as in slice notation.
"""
if self._fullstring:
if cython_optimized:
return self._fullstring.count(sub, start or 0, self._len if end is None else end)
else:
return self._fullstring.count(sub, start, end)
elif start is None and end is None:
return self._text.count(sub, self._begin, self._end)
else:
_start, _end = fast_real_indices(start, end, self._len)
return self._text.count(sub, self._begin + _start, self._begin + _end)
[docs]
@cython.locals(_start=cython.int, _end=cython.int, _begin=cython.int)
def find(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int:
"""Returns the lowest index in S where substring `sub` is found,
such that `sub` is contained within S[start:end]. Optional
arguments `start` and `end` are interpreted as in slice notation.
Returns -1 on failure.
"""
if self._fullstring:
if cython_optimized:
return self._fullstring.find(sub, start or 0, self._len if end is None else end)
else:
return self._fullstring.find(sub, start, end)
elif start is None and end is None:
_begin = self._begin # type: int
return max(self._text.find(sub, _begin, self._end) - _begin, -1)
else:
_begin = self._begin # type: int
_start, _end = fast_real_indices(start, end, self._len)
return max(self._text.find(sub, self._begin + _start, _begin + _end) - _begin, -1)
[docs]
@cython.locals(_start=cython.int, _end=cython.int, _begin=cython.int)
def rfind(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int:
"""Returns the highest index in S where substring `sub` is found,
such that `sub` is contained within S[start:end]. Optional
arguments `start` and `end` are interpreted as in slice notation.
Returns -1 on failure.
"""
if self._fullstring:
if cython_optimized:
return self._fullstring.rfind(sub, start or 0, self._len if end is None else end)
else:
return self._fullstring.rfind(sub, start, end)
if start is None and end is None:
_begin = self._begin # type: int
return max(self._text.rfind(sub, _begin, self._end) - _begin, -1)
else:
_begin = self._begin # type: int
_start, _end = fast_real_indices(start, end, self._len)
return max(self._text.rfind(sub, _begin + _start, _begin + _end) - _begin, -1)
[docs]
def startswith(self,
prefix: str,
start: cython.int = 0,
end: Optional[int] = None) -> bool:
"""Return True if S starts with the specified prefix, False otherwise.
With optional `start`, test S beginning at that position.
With optional `end`, stop comparing S at that position.
"""
start += self._begin
end = self._end if end is None else self._begin + end
return self._text.startswith(prefix, start, end)
[docs]
def endswith(self,
suffix: str,
start: cython.int = 0,
end: Optional[int] = None) -> bool:
"""Return True if S ends with the specified suffix, False otherwise.
With optional `start`, test S beginning at that position.
With optional `end`, stop comparing S at that position.
"""
start += self._begin
end = self._end if end is None else self._begin + end
return self._text.endswith(suffix, start, end)
[docs]
def match(self, regex, flags: cython.int = 0):
"""Executes `regex.match` on the StringView object and returns the
result, which is either a match-object or None. Keep in mind that
match.end(), match.span() etc. are mapped to the underlying text,
not the StringView-object!!!
"""
return regex.match(self._text, pos=self._begin, endpos=self._end)
[docs]
def index(self, absolute_index: cython.int) -> int:
"""Converts an index for a string watched by a StringView object
to an index relative to the string view object, e.g.::
>>> import re
>>> sv = StringView('xxIxx')[2:3]
>>> match = sv.match(re.compile('I'))
>>> match.end()
3
>>> sv.index(match.end())
1
"""
return absolute_index - self._begin
[docs]
@cython.locals(_begin=cython.int)
def indices(self, absolute_indices: Iterable[int]) -> Tuple[int, ...]:
"""Converts indices for a string watched by a StringView object
to indices relative to the string view object. See also: `sv_index()`
"""
_begin = self._begin # type: int
return tuple(index - _begin for index in absolute_indices)
[docs]
@cython.locals(_begin=cython.int)
def search(self, regex, start: Optional[int] = None, end: Optional[int] = None):
"""Executes regex.search on the StringView object and returns the
result, which is either a match-object or None. Keep in mind that
match.end(), match.span() etc. are mapped to the underlying text,
not the StringView-object!!!
"""
_begin = self._begin # type: int
start = _begin if start is None else max(_begin, min(_begin + start, self._end))
end = self._end if end is None else max(_begin, min(_begin + end, self._end))
return regex.search(self._text, start, end)
[docs]
def finditer(self, regex):
"""Executes regex.finditer on the StringView object and returns the
iterator of match objects. Keep in mind that match.end(), match.span()
etc. are mapped to the underlying text, not the StringView-object!!!
"""
return regex.finditer(self._text, pos=self._begin, endpos=self._end)
[docs]
@cython.locals(begin=cython.int, end=cython.int)
def strip(self, chars: str = ' \n\r\t') -> StringView:
"""Returns a copy of the StringView `self` with leading and trailing
whitespace removed.
"""
begin = first_char(self._text, self._begin, self._end, chars) - self._begin
end = last_char(self._text, self._begin, self._end, chars) - self._begin
return self if begin == 0 and end == self._len else self[begin:end]
[docs]
@cython.locals(begin=cython.int)
def lstrip(self, chars=' \n\t') -> StringView:
"""Returns a copy of `self` with leading whitespace removed."""
begin = first_char(self._text, self._begin, self._end, chars) - self._begin
return self if begin == 0 else self[begin:]
[docs]
@cython.locals(end=cython.int)
def rstrip(self, chars=' \n\t') -> StringView:
"""Returns a copy of `self` with trailing whitespace removed."""
end = last_char(self._text, self._begin, self._end, chars) - self._begin
return self if end == self._len else self[:end]
[docs]
@cython.locals(length=cython.int, i=cython.int, k=cython.int)
def split(self, sep=None): # -> List[Union[StringView, str]]:
"""Returns a list of the words in `self`, using `sep` as the
delimiter string. If `sep` is not specified or is None, any
whitespace string is a separator and empty strings are
removed from the result.
"""
if self._fullstring:
return self._fullstring.split(sep)
else:
pieces = [] # type: List[Union['StringView', str]]
length = len(sep)
k = 0
i = self.find(sep, k)
# while i >= 0:
# pieces.append(self._text[self._begin + k: self._begin + i])
# k = i + length
# i = self.find(sep, k)
# pieces.append(self._text[self._begin + k: self._end])
while i >= 0:
pieces.append(self[k:i])
k = i + length
i = self.find(sep, k)
pieces.append(self[k:])
return pieces
[docs]
def replace(self, old, new) -> str:
"""Returns a string where `old` is replaced by `new`."""
return str(self).replace(old, new)
EMPTY_STRING_VIEW = StringView('')
[docs]
class TextBuffer:
"""TextBuffer class manages a copy of an edited text for a language
server. The text can be changed via incremental edits. TextBuffer
keeps track of the state of the complete text at any point in time.
It works line oriented and lines of text can be retrieved via
indexing or slicing.
"""
def __init__(self, text: Union[str, StringView], version: cython.int = 0):
self._text = text # type: Union[str, StringView]
self._buffer = [] # type: List[Union[str, StringView]]
self.version = version if version >= 0 else 0
def _lazy_init(self):
self._buffer = [line.strip('\r') for line in self._text.split('\n')]
def __getitem__(self, index: Union[slice, int]) \
-> Union[List[Union[str, StringView]], str, StringView]:
if not self._buffer:
self._lazy_init()
return self._buffer.__getitem__(index)
def __str__(self) -> str:
if self._text:
return str(self._text)
return str(self.snapshot('\n'))
def __len__(self) -> int:
if self._text:
return len(self._text)
else:
return sum(len(line) for line in self._buffer) + len(self._buffer) - 1
@property
def buffer(self) -> List[Union[str, StringView]]:
if not self._buffer:
self._lazy_init()
return self._buffer
def lines(self) -> int:
if not self._buffer:
self._lazy_init()
return len(self._buffer)
[docs]
def update(self, l1: cython.int, c1: cython.int, l2: cython.int, c2: cython.int,
replacement: Union[str, StringView]):
"""Replaces the text-range from line and column (l1, c1) to
line and column (l2, c2) with the replacement-string.
"""
if not self._buffer:
self._lazy_init()
lines = [line.strip('\r') for line in replacement.split('\n')]
head = self._buffer[l1][:c1]
tail = self._buffer[l2][c2:]
lines[0] = head + lines[0]
lines[-1] += tail
self._buffer[l1:l2 + 1] = lines
self._text = '' # invalidate single-string copy
self.version += 1
[docs]
def text_edits(self, edits: Union[list, dict], version: cython.int = -1):
"""Incorporates the one or more text-edits or change-events into the text.
A Text-Edit is a dictionary of this form::
{"range": {"start": {"line": 0, "character": 0 },
"end": {"line": 0, "character": 0 } },
"newText": "..."}
In case of a change-event, the key "newText" is replaced by "text".
"""
def edit(ed: dict):
"""Weaves a single edit into the text-buffer."""
rng = ed["range"]
start = rng["start"]
end = rng["end"]
try:
replacement = ed['text']
except KeyError:
replacement = ed['newText']
self.update(start["line"], start["character"],
end["line"], end["character"], replacement)
if isinstance(edits, list):
for ed in edits:
edit(ed)
else:
edit(cast(dict, edits))
if version >= 0:
self.version = version
[docs]
def snapshot(self, eol: str = '\n') -> Union[str, StringView]:
"""Returns the current state of the entire text, using the given
end of line marker (``\\n`` or ``\\r\\n``)"""
if self._text:
return self._text
self._text = eol.join(str(line) for line in self._buffer)
return self._text