From 898562b49a5912a7076b22a8afe6100d0992f73d Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 10 Dec 2024 20:41:27 -0300 Subject: [PATCH 1/5] Add _SplitlinesIter Signed-off-by: martinvuyk --- stdlib/src/utils/string_slice.mojo | 139 ++++++++++++++++++++--------- stdlib/test/python/my_module.py | 3 +- 2 files changed, 97 insertions(+), 45 deletions(-) diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 03ed9be490..6752998b5d 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -31,7 +31,7 @@ from memory import UnsafePointer, memcmp, memcpy, Span from memory.memory import _memcmp_impl_unconstrained from utils.format import _CurlyEntryFormattable, _FormatCurlyEntry - +from os import abort from ._utf8_validation import _is_valid_utf8 alias StaticString = StringSlice[StaticConstantOrigin] @@ -166,6 +166,79 @@ fn _memrmem[ return UnsafePointer[Scalar[type]]() +@value +struct _SplitlinesIter[ + is_mutable: Bool, //, + origin: Origin[is_mutable], + forward: Bool = True, +]: + """Iterator for `StringSlice` over unicode linebreaks. + + Parameters: + is_mutable: Whether the slice is mutable. + origin: The origin of the underlying string data. + forward: The iteration direction. `False` is backwards. + """ + + alias `\r` = UInt8(ord("\r")) + alias `\n` = UInt8(ord("\n")) + + var index: Int + var ptr: UnsafePointer[Byte] + var length: Int + var keepends: Bool + + fn __iter__(self) -> Self: + return self + + fn __next__(mut self) -> StringSlice[origin]: + # highly performance sensitive code, benchmark before touching + @parameter + if forward: + var eol_start = self.index + var eol_length = 0 + + while eol_start < self.length: + var b0 = self.ptr[eol_start] + var char_len = _utf8_first_byte_sequence_length(b0) + debug_assert( + eol_start + char_len <= self.length, + "corrupted sequence causing unsafe memory access", + ) + var isnewline = unlikely( + _is_newline_char(self.ptr, eol_start, b0, char_len) + ) + var char_end = int(isnewline) * (eol_start + char_len) + var next_idx = char_end * int(char_end < self.length) + var is_r_n = b0 == Self.`\r` and next_idx != 0 and self.ptr[ + next_idx + ] == Self.`\n` + eol_length = int(isnewline) * char_len + int(is_r_n) + if isnewline: + break + eol_start += char_len + + var str_len = eol_start - self.index + int( + self.keepends + ) * eol_length + var s = StringSlice[origin]( + ptr=self.ptr + self.index, length=str_len + ) + self.index = eol_start + eol_length + return s + else: + constrained[False, "reversed splitlines not yet implemented"]() + return abort[StringSlice[origin]]() + + @always_inline + fn __has_next__(self) -> Bool: + @parameter + if forward: + return self.index < self.length + else: + return self.index > 0 + + @value struct _StringSliceIter[ is_mutable: Bool, //, @@ -216,6 +289,24 @@ struct _StringSliceIter[ else: return self.index > 0 + fn splitlines( + owned self: _StringSliceIter[forward=True], *, keepends: Bool = False + ) -> _SplitlinesIter[origin, forward=True]: + """Split the string at line boundaries. This corresponds to Python's + [universal newlines:]( + https://docs.python.org/3/library/stdtypes.html#str.splitlines) + `"\\r\\n"` and `"\\t\\n\\v\\f\\r\\x1c\\x1d\\x1e\\x85\\u2028\\u2029"`. + + Args: + keepends: If True, line breaks are kept in the resulting strings. + + Returns: + An iterator of StringSlices over the input split by line boundaries. + """ + return _SplitlinesIter[origin, True]( + self.index, self.ptr, self.length, keepends + ) + fn __len__(self) -> Int: @parameter if forward: @@ -1027,17 +1118,12 @@ struct StringSlice[is_mutable: Bool, //, origin: Origin[is_mutable]]( offset += b_len return length != 0 - fn splitlines[ - O: ImmutableOrigin, // - ](self: StringSlice[O], keepends: Bool = False) -> List[StringSlice[O]]: + fn splitlines(self, keepends: Bool = False) -> List[StringSlice[origin]]: """Split the string at line boundaries. This corresponds to Python's [universal newlines:]( https://docs.python.org/3/library/stdtypes.html#str.splitlines) `"\\r\\n"` and `"\\t\\n\\v\\f\\r\\x1c\\x1d\\x1e\\x85\\u2028\\u2029"`. - Parameters: - O: The immutable origin. - Args: keepends: If True, line breaks are kept in the resulting strings. @@ -1045,44 +1131,9 @@ struct StringSlice[is_mutable: Bool, //, origin: Origin[is_mutable]]( A List of Strings containing the input split by line boundaries. """ - # highly performance sensitive code, benchmark before touching - alias `\r` = UInt8(ord("\r")) - alias `\n` = UInt8(ord("\n")) - - output = List[StringSlice[O]](capacity=128) # guessing - var ptr = self.unsafe_ptr() - var length = self.byte_length() - var offset = 0 - - while offset < length: - var eol_start = offset - var eol_length = 0 - - while eol_start < length: - var b0 = ptr[eol_start] - var char_len = _utf8_first_byte_sequence_length(b0) - debug_assert( - eol_start + char_len <= length, - "corrupted sequence causing unsafe memory access", - ) - var isnewline = unlikely( - _is_newline_char(ptr, eol_start, b0, char_len) - ) - var char_end = int(isnewline) * (eol_start + char_len) - var next_idx = char_end * int(char_end < length) - var is_r_n = b0 == `\r` and next_idx != 0 and ptr[ - next_idx - ] == `\n` - eol_length = int(isnewline) * char_len + int(is_r_n) - if isnewline: - break - eol_start += char_len - - var str_len = eol_start - offset + int(keepends) * eol_length - var s = StringSlice[O](ptr=ptr + offset, length=str_len) + var output = List[StringSlice[origin]](capacity=128) # guessing + for s in self.__iter__().splitlines(keepends=keepends): output.append(s) - offset = eol_start + eol_length - return output^ diff --git a/stdlib/test/python/my_module.py b/stdlib/test/python/my_module.py index 8147b0a382..c78c39556e 100644 --- a/stdlib/test/python/my_module.py +++ b/stdlib/test/python/my_module.py @@ -25,7 +25,8 @@ def __init__(self, bar): class AbstractPerson(ABC): @abstractmethod - def method(self): ... + def method(self): + ... def my_function(name): From d2bcadfd2a27f618106416d7f5289a5b4261e823 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Wed, 11 Dec 2024 14:21:46 -0300 Subject: [PATCH 2/5] unformat my_module.py Signed-off-by: martinvuyk --- stdlib/test/python/my_module.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/stdlib/test/python/my_module.py b/stdlib/test/python/my_module.py index c78c39556e..8147b0a382 100644 --- a/stdlib/test/python/my_module.py +++ b/stdlib/test/python/my_module.py @@ -25,8 +25,7 @@ def __init__(self, bar): class AbstractPerson(ABC): @abstractmethod - def method(self): - ... + def method(self): ... def my_function(name): From 257989b0aa63c2c3d2c47822895751dc5cabca03 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Thu, 26 Dec 2024 10:01:27 -0300 Subject: [PATCH 3/5] add implementation again Signed-off-by: martinvuyk --- .../src/collections/string/string_slice.mojo | 135 ++++++++++++------ 1 file changed, 94 insertions(+), 41 deletions(-) diff --git a/stdlib/src/collections/string/string_slice.mojo b/stdlib/src/collections/string/string_slice.mojo index f632e806fd..49ccf32df1 100644 --- a/stdlib/src/collections/string/string_slice.mojo +++ b/stdlib/src/collections/string/string_slice.mojo @@ -31,7 +31,7 @@ from memory.memory import _memcmp_impl_unconstrained from sys import bitwidthof, simdwidthof from sys.intrinsics import unlikely, likely from utils.stringref import StringRef, _memmem -from os import PathLike +from os import PathLike, abort alias StaticString = StringSlice[StaticConstantOrigin] """An immutable static string slice.""" @@ -165,6 +165,78 @@ fn _memrmem[ return UnsafePointer[Scalar[type]]() +@value +struct _SplitlinesIter[ + is_mutable: Bool, //, + origin: Origin[is_mutable], + forward: Bool = True, +]: + """Iterator for `StringSlice` over unicode linebreaks. + Parameters: + is_mutable: Whether the slice is mutable. + origin: The origin of the underlying string data. + forward: The iteration direction. `False` is backwards. + """ + + alias `\r` = UInt8(ord("\r")) + alias `\n` = UInt8(ord("\n")) + + var index: Int + var ptr: UnsafePointer[Byte] + var length: Int + var keepends: Bool + + fn __iter__(self) -> Self: + return self + + fn __next__(mut self) -> StringSlice[origin]: + # highly performance sensitive code, benchmark before touching + @parameter + if forward: + var eol_start = self.index + var eol_length = 0 + + while eol_start < self.length: + var b0 = self.ptr[eol_start] + var char_len = _utf8_first_byte_sequence_length(b0) + debug_assert( + eol_start + char_len <= self.length, + "corrupted sequence causing unsafe memory access", + ) + var isnewline = unlikely( + _is_newline_char(self.ptr, eol_start, b0, char_len) + ) + var char_end = int(isnewline) * (eol_start + char_len) + var next_idx = char_end * int(char_end < self.length) + var is_r_n = b0 == Self.`\r` and next_idx != 0 and self.ptr[ + next_idx + ] == Self.`\n` + eol_length = int(isnewline) * char_len + int(is_r_n) + if isnewline: + break + eol_start += char_len + + var str_len = eol_start - self.index + int( + self.keepends + ) * eol_length + var s = StringSlice[origin]( + ptr=self.ptr + self.index, length=str_len + ) + self.index = eol_start + eol_length + return s + else: + constrained[False, "reversed splitlines not yet implemented"]() + return abort[StringSlice[origin]]() + + @always_inline + fn __has_next__(self) -> Bool: + @parameter + if forward: + return self.index < self.length + else: + return self.index > 0 + + @value struct _StringSliceIter[ mut: Bool, //, @@ -230,6 +302,24 @@ struct _StringSliceIter[ Span[Byte, ImmutableAnyOrigin](ptr=self.ptr, length=self.index) ) + fn splitlines( + owned self: _StringSliceIter[forward=True], *, keepends: Bool = False + ) -> _SplitlinesIter[origin, forward=True]: + """Split the string at line boundaries. This corresponds to Python's + [universal newlines:]( + https://docs.python.org/3/library/stdtypes.html#str.splitlines) + `"\\r\\n"` and `"\\t\\n\\v\\f\\r\\x1c\\x1d\\x1e\\x85\\u2028\\u2029"`. + + Args: + keepends: If True, line breaks are kept in the resulting strings. + + Returns: + An iterator of StringSlices over the input split by line boundaries. + """ + return _SplitlinesIter[origin, True]( + self.index, self.ptr, self.length, keepends + ) + @value @register_passable("trivial") @@ -1089,9 +1179,7 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]]( offset += b_len return length != 0 - fn splitlines[ - O: ImmutableOrigin, // - ](self: StringSlice[O], keepends: Bool = False) -> List[StringSlice[O]]: + fn splitlines(self, keepends: Bool = False) -> List[Self]: """Split the string at line boundaries. This corresponds to Python's [universal newlines:]( https://docs.python.org/3/library/stdtypes.html#str.splitlines) @@ -1107,44 +1195,9 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]]( A List of Strings containing the input split by line boundaries. """ - # highly performance sensitive code, benchmark before touching - alias `\r` = UInt8(ord("\r")) - alias `\n` = UInt8(ord("\n")) - - output = List[StringSlice[O]](capacity=128) # guessing - var ptr = self.unsafe_ptr() - var length = self.byte_length() - var offset = 0 - - while offset < length: - var eol_start = offset - var eol_length = 0 - - while eol_start < length: - var b0 = ptr[eol_start] - var char_len = _utf8_first_byte_sequence_length(b0) - debug_assert( - eol_start + char_len <= length, - "corrupted sequence causing unsafe memory access", - ) - var isnewline = unlikely( - _is_newline_char(ptr, eol_start, b0, char_len) - ) - var char_end = int(isnewline) * (eol_start + char_len) - var next_idx = char_end * int(char_end < length) - var is_r_n = b0 == `\r` and next_idx != 0 and ptr[ - next_idx - ] == `\n` - eol_length = int(isnewline) * char_len + int(is_r_n) - if isnewline: - break - eol_start += char_len - - var str_len = eol_start - offset + int(keepends) * eol_length - var s = StringSlice[O](ptr=ptr + offset, length=str_len) + var output = List[Self](capacity=128) # guessing + for s in self.__iter__().splitlines(keepends=keepends): output.append(s) - offset = eol_start + eol_length - return output^ fn count(self, substr: StringSlice) -> Int: From 50cffdf16d79e53a4f950c8a59e298f396a7c734 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Thu, 26 Dec 2024 10:02:23 -0300 Subject: [PATCH 4/5] fix docstring Signed-off-by: martinvuyk --- stdlib/src/collections/string/string_slice.mojo | 1 + 1 file changed, 1 insertion(+) diff --git a/stdlib/src/collections/string/string_slice.mojo b/stdlib/src/collections/string/string_slice.mojo index 49ccf32df1..25f50d2d19 100644 --- a/stdlib/src/collections/string/string_slice.mojo +++ b/stdlib/src/collections/string/string_slice.mojo @@ -172,6 +172,7 @@ struct _SplitlinesIter[ forward: Bool = True, ]: """Iterator for `StringSlice` over unicode linebreaks. + Parameters: is_mutable: Whether the slice is mutable. origin: The origin of the underlying string data. From cd2728b2b1c8f2822e258888bbb7739f6fe44530 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Thu, 26 Dec 2024 10:23:22 -0300 Subject: [PATCH 5/5] fix docstring Signed-off-by: martinvuyk --- stdlib/src/collections/string/string_slice.mojo | 3 --- 1 file changed, 3 deletions(-) diff --git a/stdlib/src/collections/string/string_slice.mojo b/stdlib/src/collections/string/string_slice.mojo index 25f50d2d19..a9aabe49a3 100644 --- a/stdlib/src/collections/string/string_slice.mojo +++ b/stdlib/src/collections/string/string_slice.mojo @@ -1186,9 +1186,6 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]]( https://docs.python.org/3/library/stdtypes.html#str.splitlines) `"\\r\\n"` and `"\\t\\n\\v\\f\\r\\x1c\\x1d\\x1e\\x85\\u2028\\u2029"`. - Parameters: - O: The immutable origin. - Args: keepends: If True, line breaks are kept in the resulting strings.