Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[stdlib] Add _SplitlinesIter #3858

Open
wants to merge 8 commits into
base: nightly
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 95 additions & 44 deletions stdlib/src/utils/string_slice.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ from memory import UnsafePointer, memcmp, memcpy, Span
from memory.memory import _memcmp_impl_unconstrained

from utils.format import _CurlyEntryFormattable, _FormatCurlyEntry

from os import abort
from ._utf8_validation import _is_valid_utf8

alias StaticString = StringSlice[StaticConstantOrigin]
Expand Down Expand Up @@ -166,6 +166,79 @@ fn _memrmem[
return UnsafePointer[Scalar[type]]()


@value
struct _SplitlinesIter[
martinvuyk marked this conversation as resolved.
Show resolved Hide resolved
is_mutable: Bool, //,
origin: Origin[is_mutable],
forward: Bool = True,
]:
"""Iterator for `StringSlice` over unicode linebreaks.

Parameters:
is_mutable: Whether the slice is mutable.
origin: The origin of the underlying string data.
forward: The iteration direction. `False` is backwards.
"""

alias `\r` = UInt8(ord("\r"))
alias `\n` = UInt8(ord("\n"))

var index: Int
var ptr: UnsafePointer[Byte]
var length: Int
var keepends: Bool

fn __iter__(self) -> Self:
# Iterator protocol hook: the iterator serves as its own iterable, so it
# simply returns `self`.
return self

fn __next__(mut self) -> StringSlice[origin]:
# Returns the next line as a `StringSlice[origin]` and advances
# `self.index` past that line's terminator. Only the `forward` direction
# is implemented; the backward branch fails at compile time.
# highly performance sensitive code, benchmark before touching
@parameter
if forward:
# `eol_start` scans ahead from the current position until it lands on a
# line terminator or reaches `self.length`.
var eol_start = self.index
var eol_length = 0

while eol_start < self.length:
var b0 = self.ptr[eol_start]
# Presumably the byte length of the UTF-8 sequence introduced by `b0`
# -- confirm against `_utf8_first_byte_sequence_length`.
var char_len = _utf8_first_byte_sequence_length(b0)
debug_assert(
eol_start + char_len <= self.length,
"corrupted sequence causing unsafe memory access",
)
var isnewline = unlikely(
_is_newline_char(self.ptr, eol_start, b0, char_len)
)
# Multiplication by `int(...)` is used instead of branching:
# `char_end` is the index just past the current character when it is a
# newline, and 0 otherwise.
var char_end = int(isnewline) * (eol_start + char_len)
# `next_idx` collapses to 0 when `char_end` is not inside the buffer,
# so 0 doubles as the "no following byte" marker tested below.
var next_idx = char_end * int(char_end < self.length)
# True only for a "\r" that is immediately followed by "\n"; it can
# never be true for a non-newline because `next_idx` is 0 in that case.
var is_r_n = b0 == Self.`\r` and next_idx != 0 and self.ptr[
next_idx
] == Self.`\n`
# Terminator width in bytes: the newline character itself, plus one
# extra byte for the "\n" of a "\r\n" pair. Stays 0 for non-newlines.
eol_length = int(isnewline) * char_len + int(is_r_n)
if isnewline:
break
eol_start += char_len

# The line content spans `self.index` up to `eol_start`; the terminator
# bytes are counted only when `keepends` is set.
var str_len = eol_start - self.index + int(
self.keepends
) * eol_length
var s = StringSlice[origin](
ptr=self.ptr + self.index, length=str_len
)
# The next call resumes right after the terminator.
self.index = eol_start + eol_length
return s
else:
# Instantiating the backward direction is rejected through
# `constrained[False, ...]`.
# NOTE(review): the `abort` return presumably exists only to satisfy the
# declared return type -- confirm.
constrained[False, "reversed splitlines not yet implemented"]()
return abort[StringSlice[origin]]()

@always_inline
fn __has_next__(self) -> Bool:
    """Report whether the iterator can still produce another line.

    Returns:
        For forward iteration, True while `index` is below `length`. For
        backward iteration, True while `index` is above zero.
    """

    @parameter
    if not forward:
        return self.index > 0
    else:
        return self.index < self.length


@value
struct _StringSliceIter[
is_mutable: Bool, //,
Expand Down Expand Up @@ -216,6 +289,24 @@ struct _StringSliceIter[
else:
return self.index > 0

fn splitlines(
    owned self: _StringSliceIter[forward=True], *, keepends: Bool = False
) -> _SplitlinesIter[origin, forward=True]:
    """Iterate lazily over the lines of the string data. Line boundaries
    correspond to Python's
    [universal newlines:](
    https://docs.python.org/3/library/stdtypes.html#str.splitlines)
    `"\\r\\n"` and `"\\t\\n\\v\\f\\r\\x1c\\x1d\\x1e\\x85\\u2028\\u2029"`.

    Args:
        keepends: If True, each yielded slice keeps its line break.

    Returns:
        A forward iterator yielding one StringSlice per line, starting from
        this iterator's current index.
    """
    # NOTE(review): Python's `str.splitlines` table does not list `\t` as a
    # line boundary -- confirm whether `_is_newline_char` really treats it as
    # one, or whether the separator list in the docstring is off.
    alias LineIter = _SplitlinesIter[origin, forward=True]
    return LineIter(self.index, self.ptr, self.length, keepends)

fn __len__(self) -> Int:
@parameter
if forward:
Expand Down Expand Up @@ -1027,62 +1118,22 @@ struct StringSlice[is_mutable: Bool, //, origin: Origin[is_mutable]](
offset += b_len
return length != 0

fn splitlines[
O: ImmutableOrigin, //
](self: StringSlice[O], keepends: Bool = False) -> List[StringSlice[O]]:
fn splitlines(self, keepends: Bool = False) -> List[StringSlice[origin]]:
"""Split the string at line boundaries. This corresponds to Python's
[universal newlines:](
https://docs.python.org/3/library/stdtypes.html#str.splitlines)
`"\\r\\n"` and `"\\t\\n\\v\\f\\r\\x1c\\x1d\\x1e\\x85\\u2028\\u2029"`.

Parameters:
O: The immutable origin.

Args:
keepends: If True, line breaks are kept in the resulting strings.

Returns:
A List of Strings containing the input split by line boundaries.
"""

# highly performance sensitive code, benchmark before touching
alias `\r` = UInt8(ord("\r"))
alias `\n` = UInt8(ord("\n"))

output = List[StringSlice[O]](capacity=128) # guessing
var ptr = self.unsafe_ptr()
var length = self.byte_length()
var offset = 0

while offset < length:
var eol_start = offset
var eol_length = 0

while eol_start < length:
var b0 = ptr[eol_start]
var char_len = _utf8_first_byte_sequence_length(b0)
debug_assert(
eol_start + char_len <= length,
"corrupted sequence causing unsafe memory access",
)
var isnewline = unlikely(
_is_newline_char(ptr, eol_start, b0, char_len)
)
var char_end = int(isnewline) * (eol_start + char_len)
var next_idx = char_end * int(char_end < length)
var is_r_n = b0 == `\r` and next_idx != 0 and ptr[
next_idx
] == `\n`
eol_length = int(isnewline) * char_len + int(is_r_n)
if isnewline:
break
eol_start += char_len

var str_len = eol_start - offset + int(keepends) * eol_length
var s = StringSlice[O](ptr=ptr + offset, length=str_len)
var output = List[StringSlice[origin]](capacity=128) # guessing
for s in self.__iter__().splitlines(keepends=keepends):
output.append(s)
offset = eol_start + eol_length

return output^


Expand Down
3 changes: 2 additions & 1 deletion stdlib/test/python/my_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ def __init__(self, bar):

class AbstractPerson(ABC):
@abstractmethod
def method(self): ...
def method(self):
...


def my_function(name):
Expand Down
Loading