Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[stdlib] Fix String.split() implementations #3528

Draft
wants to merge 138 commits into
base: nightly
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
138 commits
Select commit Hold shift + click to select a range
e971d52
fix string split()
martinvuyk Sep 22, 2024
f717b0a
fix string split(separator)
martinvuyk Sep 22, 2024
c8ac4df
fix string split(separator)
martinvuyk Sep 22, 2024
51ab01a
retry github CI
martinvuyk Sep 22, 2024
d11f381
Apply suggestions from code review
martinvuyk Sep 23, 2024
40ff065
retry github CI
martinvuyk Sep 23, 2024
6401227
Merge branch 'nightly' into fix-split-implementations
martinvuyk Sep 23, 2024
43c6fea
add suggestions from @JoeLoser
martinvuyk Sep 23, 2024
eb4c50a
add testcase suggested by @msaelices
martinvuyk Sep 23, 2024
94a68f2
retry github CI
martinvuyk Sep 23, 2024
8a23eee
Merge branch 'nightly' into fix-split-implementations
martinvuyk Sep 24, 2024
3373dad
retry github CI
martinvuyk Sep 24, 2024
62511da
Merge branch 'fix-split-implementations' of github.com:martinvuyk/moj…
martinvuyk Sep 24, 2024
4f80a08
Merge branch 'nightly' into fix-split-implementations
martinvuyk Sep 25, 2024
8865894
Merge branch 'nightly' into fix-split-implementations
martinvuyk Sep 26, 2024
2c9119e
Merge branch 'nightly' into fix-split-implementations
martinvuyk Sep 29, 2024
7a62da4
Merge branch 'nightly' into fix-split-implementations
martinvuyk Sep 30, 2024
25f61c4
Merge branch 'nightly' into fix-split-implementations
martinvuyk Oct 1, 2024
718a471
Merge branch 'nightly' into fix-split-implementations
martinvuyk Oct 1, 2024
b977841
refactor a bit
martinvuyk Oct 1, 2024
5229c17
fix detail
martinvuyk Oct 1, 2024
1550667
fix detail
martinvuyk Oct 1, 2024
a4fdf46
aggressive optimizations
martinvuyk Oct 2, 2024
4c753ce
fix detail
martinvuyk Oct 2, 2024
b752e85
some more branch reduction
martinvuyk Oct 2, 2024
1bd7568
fix detail
martinvuyk Oct 2, 2024
e6dfd38
fix detail
martinvuyk Oct 2, 2024
dd0ac67
fix detail
martinvuyk Oct 2, 2024
be62c8f
refactor StringSlice constructor
martinvuyk Oct 2, 2024
cba8a6b
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Oct 2, 2024
6240387
refactor StringSlice constructor
martinvuyk Oct 2, 2024
2956b90
maybe some more overhead
martinvuyk Oct 2, 2024
b27da93
refactor output
martinvuyk Oct 3, 2024
1dca960
Merge branch 'nightly' into fix-split-implementations
martinvuyk Oct 6, 2024
0cada62
fix check_licences.mojo reference
martinvuyk Oct 13, 2024
e38a462
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Oct 13, 2024
4deca4d
refactor split implementations
martinvuyk Oct 13, 2024
026544a
fix detail
martinvuyk Oct 13, 2024
22544db
fix detail
martinvuyk Oct 13, 2024
e28de55
fix detail
martinvuyk Oct 13, 2024
4d49ee8
fix detail
martinvuyk Oct 13, 2024
d7d9e1c
fix detail
martinvuyk Oct 13, 2024
f5fb7e9
fix detail
martinvuyk Oct 13, 2024
02e184e
fix detail
martinvuyk Oct 14, 2024
1656317
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Oct 14, 2024
8fbf054
fix detail
martinvuyk Oct 14, 2024
495eb0a
refactor to use span instead of unsafe pointer and len
martinvuyk Oct 14, 2024
4935246
change split to not raise and return separated characters on empty se…
martinvuyk Oct 14, 2024
4769e0b
fix detail
martinvuyk Oct 14, 2024
c419112
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Oct 14, 2024
07d23f6
fix details
martinvuyk Oct 14, 2024
e7bb811
add some traits and finally get it to work
martinvuyk Oct 15, 2024
4918971
fix details
martinvuyk Oct 15, 2024
e93a96e
fix details
martinvuyk Oct 15, 2024
c93f3ac
fix details
martinvuyk Oct 15, 2024
1b60734
fix details
martinvuyk Oct 15, 2024
c723949
add suggestion from @soraros
martinvuyk Oct 15, 2024
c37b55d
add suggestion from @soraros
martinvuyk Oct 15, 2024
972af10
add suggestion from @soraros
martinvuyk Oct 15, 2024
2ce7815
remove redundancy
martinvuyk Oct 15, 2024
4e962fe
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Oct 16, 2024
cb6ba44
refactor and make StringSlice.split() return List[Self]
martinvuyk Oct 16, 2024
b54acac
delete stringref split
martinvuyk Oct 16, 2024
c736b71
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Oct 16, 2024
b7edcad
Merge branch 'nightly' into fix-split-implementations
martinvuyk Oct 16, 2024
fb9f956
Merge branch 'fix-split-implementations' of github.com:martinvuyk/moj…
martinvuyk Oct 16, 2024
e01ad6a
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Oct 17, 2024
8943ae0
add missing traits
martinvuyk Oct 17, 2024
ae0d0d0
add better tests and fix with shady use of rebind
martinvuyk Oct 17, 2024
1eb2065
refactor and add funcs that use new string list builder
martinvuyk Oct 18, 2024
8bcd82b
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Oct 18, 2024
b25d5dc
fix conflicts
martinvuyk Oct 18, 2024
fb5562e
fix detail
martinvuyk Oct 18, 2024
fe746bf
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Oct 19, 2024
969ba59
rename to_string to to_string_list
martinvuyk Oct 19, 2024
0982fea
even more performance gains
martinvuyk Oct 19, 2024
0a7c277
finallyclearclearclear
martinvuyk Oct 20, 2024
d5dc9d2
refactor some details
martinvuyk Oct 20, 2024
3d8dd84
fix details
martinvuyk Oct 20, 2024
3fe9834
fix details
martinvuyk Oct 20, 2024
98d326a
retry github CI
martinvuyk Oct 20, 2024
c8e8877
even more small perf improvements
martinvuyk Oct 20, 2024
4913d11
fix details
martinvuyk Oct 20, 2024
2080c72
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Oct 20, 2024
a792b79
fix details
martinvuyk Oct 20, 2024
d4e20e1
add suggestion by @soraros
martinvuyk Oct 20, 2024
fcf0c6d
add suggestions by @soraros
martinvuyk Oct 21, 2024
6c30c88
Merge branch 'nightly' into fix-split-implementations
martinvuyk Oct 21, 2024
eb6d9d4
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Oct 30, 2024
e6b8efb
fix after merge
martinvuyk Oct 30, 2024
d9ce8a3
reduce benchmark num_repetitions to 1
martinvuyk Oct 30, 2024
2804141
fix detail
martinvuyk Oct 30, 2024
3cbb3b7
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Oct 30, 2024
4e23ab0
unify as_bytes_read and as_bytes_write
martinvuyk Oct 31, 2024
d398eee
fix details
martinvuyk Oct 31, 2024
02ab469
fix details
martinvuyk Oct 31, 2024
805dc26
fix details
martinvuyk Oct 31, 2024
dfb0ad3
fix details
martinvuyk Oct 31, 2024
aec96ad
fix details
martinvuyk Oct 31, 2024
77c0305
update changelog
martinvuyk Oct 31, 2024
9292585
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Oct 31, 2024
0c9a089
add suggestions
martinvuyk Nov 1, 2024
518dffa
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Nov 1, 2024
3be0878
mojo format
martinvuyk Nov 1, 2024
1ec7169
undo iadd and add changes
martinvuyk Nov 5, 2024
c357887
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Nov 5, 2024
8f51bca
revert get_immutable() -> read()
martinvuyk Nov 5, 2024
31e4b96
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Nov 12, 2024
31b9bad
fix unsafe ptr constructors
martinvuyk Nov 12, 2024
338afa6
fix unsafe ptr constructors
martinvuyk Nov 12, 2024
c0ff78d
fix unsafe ptr constructors
martinvuyk Nov 12, 2024
8917f7f
remove generic as_bytes
martinvuyk Nov 15, 2024
ff81054
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Nov 15, 2024
7b71d5a
fix remaining use of as_bytes[]
martinvuyk Nov 15, 2024
1bc9672
fix origin casting
martinvuyk Nov 15, 2024
bd2fa1e
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Dec 6, 2024
bc9f438
remove all generic origin shenanigans
martinvuyk Dec 6, 2024
91f8527
fix dangling import
martinvuyk Dec 6, 2024
4de5220
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Dec 7, 2024
5a0ed66
fix details
martinvuyk Dec 7, 2024
cec5e63
fix details
martinvuyk Dec 7, 2024
9aa8f4d
fix details
martinvuyk Dec 7, 2024
6b51a32
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Dec 15, 2024
01e8c97
refactor to use bitwise operations
martinvuyk Dec 16, 2024
3b367ab
remove bit utils
martinvuyk Dec 16, 2024
e53db2f
remove bit utils
martinvuyk Dec 16, 2024
8d6288e
remove bit utils
martinvuyk Dec 16, 2024
6b35221
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Dec 19, 2024
ba2c415
reduce the scope of the PR
martinvuyk Dec 19, 2024
79a6745
add tests that use stringslice assert_equal from #3898
martinvuyk Dec 19, 2024
954d39d
reduce the scope of the PR
martinvuyk Dec 19, 2024
a77d64d
even more perf, update benchmark numbers
martinvuyk Dec 20, 2024
e14e60a
fix details in benchmark
martinvuyk Dec 20, 2024
68ccd9a
Merge remote-tracking branch 'upstream/nightly' into fix-split-implem…
martinvuyk Jan 9, 2025
d70b32b
fix after merge
martinvuyk Jan 9, 2025
027b5ed
fix detail
martinvuyk Jan 9, 2025
324d2f2
fix details
martinvuyk Jan 9, 2025
69b7747
fix detail
martinvuyk Jan 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion stdlib/benchmarks/collections/bench_string.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ from random import random_si64, seed
from benchmark import Bench, BenchConfig, Bencher, BenchId, Unit, keep, run

from collections.string._utf8_validation import _is_valid_utf8
from collections.string import StringSlice


# ===-----------------------------------------------------------------------===#
Expand Down Expand Up @@ -111,7 +112,7 @@ fn bench_string_split[
@always_inline
@parameter
fn call_fn() raises:
var res: List[String]
var res: List[StringSlice[__origin_of(items)]]

@parameter
if sequence:
Expand Down
102 changes: 71 additions & 31 deletions stdlib/src/builtin/string_literal.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ from collections.string.string_slice import (
StringSlice,
StaticString,
_StringSliceIter,
_to_string_list,
to_string_list,
_split,
)
from hashlib._hasher import _HashableWithHasher, _Hasher
from memory import UnsafePointer, memcpy, Span
Expand Down Expand Up @@ -439,20 +440,20 @@ struct StringLiteral(
return self.__str__()

fn __iter__(ref self) -> _StringSliceIter[StaticConstantOrigin]:
"""Return an iterator over the string literal.
"""Iterate over the string unicode characters.

Returns:
An iterator over the string.
An iterator of references to the string unicode characters.
"""
return _StringSliceIter[StaticConstantOrigin](
ptr=self.unsafe_ptr(), length=self.byte_length()
)

fn __reversed__(self) -> _StringSliceIter[StaticConstantOrigin, False]:
"""Iterate backwards over the string, returning immutable references.
"""Iterate backwards over the string unicode characters.

Returns:
A reversed iterator over the string.
A reversed iterator of references to the string unicode characters.
"""
return _StringSliceIter[StaticConstantOrigin, False](
ptr=self.unsafe_ptr(), length=self.byte_length()
Expand Down Expand Up @@ -524,21 +525,18 @@ struct StringLiteral(
Returns:
A string slice pointing to this static string literal.
"""

# FIXME(MSTDL-160):
# Enforce UTF-8 encoding in StringLiteral so this is actually
# guaranteed to be valid.
return StaticString(ptr=self.unsafe_ptr(), length=self.byte_length())
return StaticString(self)

@always_inline
fn as_bytes(self) -> Span[Byte, StaticConstantOrigin]:
"""
Returns a contiguous Span of the bytes owned by this string.
"""Returns a contiguous slice of the bytes owned by this string.

Returns:
A contiguous slice pointing to the bytes owned by this string.
"""

Notes:
This does not include the trailing null terminator.
"""
return Span[Byte, StaticConstantOrigin](
ptr=self.unsafe_ptr(), length=self.byte_length()
)
Expand All @@ -553,7 +551,6 @@ struct StringLiteral(
Notes:
This does not include the trailing null terminator.
"""
# Does NOT include the NUL terminator.
return Span[Byte, __origin_of(self)](
ptr=self.unsafe_ptr(), length=self.byte_length()
)
Expand Down Expand Up @@ -602,11 +599,11 @@ struct StringLiteral(
`start`. If not found, returns -1.

Args:
substr: The substring to find.
start: The offset from which to find.
substr: The substring to find.
start: The offset from which to find.

Returns:
The offset of `substr` relative to the beginning of the string.
The offset of `substr` relative to the beginning of the string.
"""
return self.as_string_slice().find(substr, start=start)

Expand Down Expand Up @@ -694,13 +691,36 @@ struct StringLiteral(
elems.each[add_elt]()
return result

fn split(self, sep: String, maxsplit: Int = -1) raises -> List[String]:
"""Split the string literal by a separator.
@always_inline
fn split(self, sep: StringSlice, maxsplit: Int) -> List[StaticString]:
"""Split the string literal by a separator, limited by `maxsplit`.

This forwards to `split(sep, maxsplit)` on the `StaticString` returned by
`as_string_slice()`; no splitting logic lives in this method itself.

Args:
sep: The string to split on. As the last example below shows, an
empty separator is accepted.
maxsplit: The maximum number of splits to perform. As the examples
below show, `maxsplit=1` yields at most two items, with the
unsplit remainder as the last one.

Returns:
A List of `StaticString` slices containing the input split by the
separator.

Examples:

```mojo
# Splitting with maxsplit
_ = "1,2,3".split(",", maxsplit=1) # ['1', '2,3']
# Splitting with starting or ending separators
_ = ",1,2,3,".split(",", maxsplit=1) # ['', '1,2,3,']
_ = "123".split("", maxsplit=1) # ['', '123']
```
.
"""
return self.as_string_slice().split(sep, maxsplit)

@always_inline
fn split(self, sep: StringSlice) -> List[StaticString]:
"""Split the string by a separator.

Args:
sep: The string to split on.

Returns:
A List of Strings containing the input split by the separator.
Expand All @@ -712,20 +732,40 @@ struct StringLiteral(
_ = "hello world".split(" ") # ["hello", "world"]
# Splitting adjacent separators
_ = "hello,,world".split(",") # ["hello", "", "world"]
# Splitting with starting or ending separators
_ = ",1,2,3,".split(",") # ['', '1', '2', '3', '']
_ = "123".split("") # ['', '1', '2', '3', '']
```
.
"""
return self.as_string_slice().split(sep)

@always_inline
fn split(self, *, maxsplit: Int) -> List[StaticString]:
"""Split the string by every Whitespace separator.

Args:
maxsplit: The maximum amount of items to split from String.

Returns:
A List of Strings containing the input split by the separator.

Examples:

```mojo
# Splitting with maxsplit
_ = "1,2,3".split(",", 1) # ['1', '2,3']
_ = "1 2 3".split(maxsplit=1) # ['1', '2 3']
```
.
"""
return str(self).split(sep, maxsplit)
return self.as_string_slice().split(maxsplit=maxsplit)

fn split(self, sep: NoneType = None, maxsplit: Int = -1) -> List[String]:
"""Split the string literal by every whitespace separator.
@always_inline
fn split(self, sep: NoneType = None) -> List[StaticString]:
"""Split the string by every Whitespace separator.

Args:
sep: None.
maxsplit: The maximum amount of items to split from string. Defaults
to unlimited.

Returns:
A List of Strings containing the input split by the separator.
Expand All @@ -736,16 +776,16 @@ struct StringLiteral(
# Splitting an empty string or filled with whitespaces
_ = " ".split() # []
_ = "".split() # []

# Splitting a string with leading, trailing, and middle whitespaces
_ = " hello world ".split() # ["hello", "world"]
# Splitting adjacent universal newlines:
_ = "hello \\t\\n\\v\\f\\r\\x1c\\x1d\\x1e\\x85\\u2028\\u2029world".split()
# ["hello", "world"]
_ = (
"hello \\t\\n\\v\\f\\r\\x1c\\x1d\\x1e\\x85\\u2028\\u2029world"
).split() # ["hello", "world"]
```
.
"""
return str(self).split(sep, maxsplit)
return self.as_string_slice().split()

fn splitlines(self, keepends: Bool = False) -> List[String]:
"""Split the string literal at line boundaries. This corresponds to Python's
Expand All @@ -759,7 +799,7 @@ struct StringLiteral(
Returns:
A List of Strings containing the input split by line boundaries.
"""
return _to_string_list(self.as_string_slice().splitlines(keepends))
return to_string_list(self.as_string_slice().splitlines(keepends))

fn count(self, substr: String) -> Int:
"""Return the number of non-overlapping occurrences of substring
Expand Down
Loading
Loading