diff --git a/Package.swift b/Package.swift index 7d73dd742..7566193c8 100644 --- a/Package.swift +++ b/Package.swift @@ -184,10 +184,10 @@ let targets: [CustomTarget] = [ kind: .testSupport, name: "_CollectionsTestSupport", dependencies: ["_CollectionsUtilities"]), - .target( - kind: .test, - name: "CollectionsTestSupportTests", - dependencies: ["_CollectionsTestSupport"]), +// .target( +// kind: .test, +// name: "CollectionsTestSupportTests", +// dependencies: ["_CollectionsTestSupport"]), .target( kind: .hidden, name: "_CollectionsUtilities", @@ -226,52 +226,52 @@ let targets: [CustomTarget] = [ name: "BitCollections", dependencies: ["_CollectionsUtilities"], exclude: ["CMakeLists.txt"]), - .target( - kind: .test, - name: "BitCollectionsTests", - dependencies: [ - "BitCollections", "_CollectionsTestSupport", "OrderedCollections" - ]), +// .target( +// kind: .test, +// name: "BitCollectionsTests", +// dependencies: [ +// "BitCollections", "_CollectionsTestSupport", "OrderedCollections" +// ]), .target( kind: .exported, name: "DequeModule", dependencies: ["_CollectionsUtilities"], exclude: ["CMakeLists.txt"]), - .target( - kind: .test, - name: "DequeTests", - dependencies: ["DequeModule", "_CollectionsTestSupport"]), +// .target( +// kind: .test, +// name: "DequeTests", +// dependencies: ["DequeModule", "_CollectionsTestSupport"]), .target( kind: .exported, name: "HashTreeCollections", dependencies: ["_CollectionsUtilities"], exclude: ["CMakeLists.txt"]), - .target( - kind: .test, - name: "HashTreeCollectionsTests", - dependencies: ["HashTreeCollections", "_CollectionsTestSupport"]), +// .target( +// kind: .test, +// name: "HashTreeCollectionsTests", +// dependencies: ["HashTreeCollections", "_CollectionsTestSupport"]), .target( kind: .exported, name: "HeapModule", dependencies: ["_CollectionsUtilities"], exclude: ["CMakeLists.txt"]), - .target( - kind: .test, - name: "HeapTests", - dependencies: ["HeapModule", "_CollectionsTestSupport"]), +// .target( +// kind: 
.test, +// name: "HeapTests", +// dependencies: ["HeapModule", "_CollectionsTestSupport"]), .target( kind: .exported, name: "OrderedCollections", dependencies: ["_CollectionsUtilities"], exclude: ["CMakeLists.txt"]), - .target( - kind: .test, - name: "OrderedCollectionsTests", - dependencies: ["OrderedCollections", "_CollectionsTestSupport"]), +// .target( +// kind: .test, +// name: "OrderedCollectionsTests", +// dependencies: ["OrderedCollections", "_CollectionsTestSupport"]), .target( kind: .exported, @@ -279,20 +279,20 @@ let targets: [CustomTarget] = [ dependencies: ["_CollectionsUtilities"], directory: "RopeModule", exclude: ["CMakeLists.txt"]), - .target( - kind: .test, - name: "RopeModuleTests", - dependencies: ["_RopeModule", "_CollectionsTestSupport"]), +// .target( +// kind: .test, +// name: "RopeModuleTests", +// dependencies: ["_RopeModule", "_CollectionsTestSupport"]), .target( kind: .exported, name: "SortedCollections", dependencies: ["_CollectionsUtilities"], directory: "SortedCollections"), - .target( - kind: .test, - name: "SortedCollectionsTests", - dependencies: ["SortedCollections", "_CollectionsTestSupport"]), +// .target( +// kind: .test, +// name: "SortedCollectionsTests", +// dependencies: ["SortedCollections", "_CollectionsTestSupport"]), .target( kind: .exported, diff --git a/Sources/Future/UTF8Span/UTF8EncodingError.swift b/Sources/Future/UTF8Span/UTF8EncodingError.swift new file mode 100644 index 000000000..02fc5c088 --- /dev/null +++ b/Sources/Future/UTF8Span/UTF8EncodingError.swift @@ -0,0 +1,241 @@ +extension Unicode.UTF8 { + /** + + The kind and location of a UTF-8 encoding error. 
+ + Valid UTF-8 is represented by this table: + + ``` + ╔════════════════════╦════════╦════════╦════════╦════════╗ + ║ Scalar value ║ Byte 0 ║ Byte 1 ║ Byte 2 ║ Byte 3 ║ + ╠════════════════════╬════════╬════════╬════════╬════════╣ + ║ U+0000..U+007F ║ 00..7F ║ ║ ║ ║ + ║ U+0080..U+07FF ║ C2..DF ║ 80..BF ║ ║ ║ + ║ U+0800..U+0FFF ║ E0 ║ A0..BF ║ 80..BF ║ ║ + ║ U+1000..U+CFFF ║ E1..EC ║ 80..BF ║ 80..BF ║ ║ + ║ U+D000..U+D7FF ║ ED ║ 80..9F ║ 80..BF ║ ║ + ║ U+E000..U+FFFF ║ EE..EF ║ 80..BF ║ 80..BF ║ ║ + ║ U+10000..U+3FFFF ║ F0 ║ 90..BF ║ 80..BF ║ 80..BF ║ + ║ U+40000..U+FFFFF ║ F1..F3 ║ 80..BF ║ 80..BF ║ 80..BF ║ + ║ U+100000..U+10FFFF ║ F4 ║ 80..8F ║ 80..BF ║ 80..BF ║ + ╚════════════════════╩════════╩════════╩════════╩════════╝ + ``` + + ### Classifying errors + + An *unexpected continuation* is when a continuation byte (`10xxxxxx`) occurs + in a position that should be the start of a new scalar value. Unexpected + continuations can often occur when the input contains arbitrary data + instead of textual content. An unexpected continuation at the start of + input might mean that the input was not correctly sliced along scalar + boundaries or that it does not contain UTF-8. + + A *truncated scalar* is a multi-byte sequence that is the start of a valid + multi-byte scalar but is cut off before ending correctly. A truncated + scalar at the end of the input might mean that only part of the entire + input was received. + + A *surrogate code point* (`U+D800..U+DFFF`) is invalid UTF-8. Surrogate + code points are used by UTF-16 to encode scalars in the supplementary + planes. Their presence may mean the input was encoded in a different 8-bit + encoding, such as CESU-8, WTF-8, or Java's Modified UTF-8. + + An *invalid non-surrogate code point* is any code point higher than + `U+10FFFF`. This can often occur when the input is arbitrary data instead + of textual content. 
+ + An *overlong encoding* occurs when a scalar value that could have been + encoded using fewer bytes is encoded in a longer byte sequence. Overlong + encodings are invalid UTF-8 and can lead to security issues if not + correctly detected: + + - https://nvd.nist.gov/vuln/detail/CVE-2008-2938 + - https://nvd.nist.gov/vuln/detail/CVE-2000-0884 + + An overlong encoding of `NUL`, `0xC0 0x80`, is used in Java's Modified + UTF-8 but is invalid UTF-8. Overlong encoding errors often catch attempts + to bypass security measures. + + ### Reporting the range of the error + + The range of the error reported follows the *Maximal subpart of an + ill-formed subsequence* algorithm in which each error is either one byte + long or ends before the first byte that is disallowed. See "U+FFFD + Substitution of Maximal Subparts" in the Unicode Standard. Unicode started + recommending this algorithm in version 6, and it is adopted by the W3C. + + The maximal subpart algorithm will produce a single multi-byte range for a + truncated scalar (a multi-byte sequence that is the start of a valid + multi-byte scalar but is cut off before ending correctly). For all other + errors (including overlong encodings, surrogates, and invalid code + points), it will produce an error per byte. + + Since overlong encodings, surrogates, and invalid code points are erroneous + by the second byte (at the latest), the above definition produces the same + ranges as defining such a sequence as a truncated scalar error followed by + unexpected continuation byte errors. The more semantically-rich + classification is reported. + + For example, a surrogate code point sequence `ED A0 80` will be reported + as three `.surrogateCodePointByte` errors rather than a `.truncatedScalar` + followed by two `.unexpectedContinuationByte` errors. + + Other commonly reported error ranges can be constructed from this result. + For example, PEP 383's error-per-byte can be constructed by mapping over + the reported range. 
Similarly, a single error for the longest + invalid byte range can be constructed by joining adjacent error ranges. + + ``` + ╔═════════════════╦══════╦═════╦═════╦═════╦═════╦═════╦═════╦══════╗ + ║ ║ 61 ║ F1 ║ 80 ║ 80 ║ E1 ║ 80 ║ C2 ║ 62 ║ + ╠═════════════════╬══════╬═════╬═════╬═════╬═════╬═════╬═════╬══════╣ + ║ Longest range ║ U+61 ║ err ║ ║ ║ ║ ║ ║ U+62 ║ + ║ Maximal subpart ║ U+61 ║ err ║ ║ ║ err ║ ║ err ║ U+62 ║ + ║ Error per byte ║ U+61 ║ err ║ err ║ err ║ err ║ err ║ err ║ U+62 ║ + ╚═════════════════╩══════╩═════╩═════╩═════╩═════╩═════╩═════╩══════╝ + ``` + + */ + @frozen + public struct EncodingError: Error, Sendable, Hashable, Codable { + /// The kind of encoding error + public var kind: Unicode.UTF8.EncodingError.Kind + + /// The range of offsets into our input containing the error + public var range: Range + + @_alwaysEmitIntoClient + public init( + _ kind: Unicode.UTF8.EncodingError.Kind, + _ range: some RangeExpression + ) { + self.kind = kind + self.range = range.relative(to: Int.min..U+10FFFF`) sequence + @_alwaysEmitIntoClient + public static var invalidNonSurrogateCodePointByte: Self { + .init(rawValue: 2) + } + + /// A byte in an overlong encoding sequence + @_alwaysEmitIntoClient + public static var overlongEncodingByte: Self { + .init(rawValue: 3) + } + + /// A multi-byte sequence that is the start of a valid multi-byte scalar + /// but is cut off before ending correctly + @_alwaysEmitIntoClient + public static var truncatedScalar: Self { + .init(rawValue: 4) + } + } +} + +@_unavailableInEmbedded +extension UTF8.EncodingError.Kind: CustomStringConvertible { + public var description: String { + switch self { + case .invalidNonSurrogateCodePointByte: + ".invalidNonSurrogateCodePointByte" + case .overlongEncodingByte: + ".overlongEncodingByte" + case .surrogateCodePointByte: + ".surrogateCodePointByte" + case .truncatedScalar: + ".truncatedScalar" + case .unexpectedContinuationByte: + ".unexpectedContinuationByte" + default: + 
fatalError("unreachable") + } + } +} + +@_unavailableInEmbedded +extension UTF8.EncodingError: CustomStringConvertible { + public var description: String { + "UTF8.EncodingError(\(kind), \(range))" + } +} + +extension UTF8 { + public // For demo purposes + static func _checkAllErrors( + _ s: some Sequence + ) -> some Sequence { + // TODO: Span fast path + // TODO: Fixed size buffer for non-contig inputs + // TODO: Lifetime-dependent result variant + let cus = Array(s) + return cus.withUnsafeBytes { + var bufPtr = $0 + var start = 0 + var errors: Array = [] + + // Remember the previous error, so that we can + // apply it to subsequent bytes instead of reporting + // just `.unexpectedContinuationByte`. + var priorError: UTF8.EncodingError? = nil + while true { + do throws(UTF8.EncodingError) { + _ = try bufPtr.baseAddress!._validateUTF8(limitedBy: bufPtr.count) + return errors + } catch { + let adjustedRange = + error.range.lowerBound + start ..< error.range.upperBound + start + + let kind: UTF8.EncodingError.Kind + if let prior = priorError, + prior.range.upperBound == adjustedRange.lowerBound, + error.kind == .unexpectedContinuationByte + { + kind = prior.kind + } else { + kind = error.kind + } + let adjustedErr = UTF8.EncodingError(kind, adjustedRange) + priorError = adjustedErr + + let errEnd = error.range.upperBound + start += errEnd + bufPtr = .init(rebasing: bufPtr[errEnd...]) + errors.append(adjustedErr) + } + } + } + } +} diff --git a/Sources/Future/UTF8Span/UTF8Span.swift b/Sources/Future/UTF8Span/UTF8Span.swift new file mode 100644 index 000000000..858817c57 --- /dev/null +++ b/Sources/Future/UTF8Span/UTF8Span.swift @@ -0,0 +1,231 @@ + +@frozen +public struct UTF8Span: Copyable, ~Escapable { + public var unsafeBaseAddress: UnsafeRawPointer + + /* + A bit-packed count and flags (such as isASCII) + + ╔═══════╦═════╦═════╦═════╦══════════╦═══════╗ + ║ b63 ║ b62 ║ b61 ║ b60 ║ b59:56 ║ b55:0 ║ + ╠═══════╬═════╬═════╬═════╬══════════╬═══════╣ + ║ ASCII ║ NFC ║ SSC ║ 
NUL ║ reserved ║ count ║ + ╚═══════╩═════╩═════╩═════╩══════════╩═══════╝ + + ASCII means the contents are all-ASCII (<=0x7F). + NFC means contents are in normal form C for fast comparisons. + SSC means single-scalar Characters (i.e. grapheme clusters): every + `Character` holds only a single `Unicode.Scalar`. + NUL means the contents are a null-terminated C string (that is, + there is a guaranteed, borrowed NULL byte after the end of `count`). + + TODO: NUL means both no-interior and null-terminator, so does this + mean that String doesn't ever set it because we don't want to scan + for interior nulls? I think this is the only viable option... + + TODO: Contains-newline would be useful for Regex `.` + + Question: Should we have null-termination support? + A null-terminated UTF8Span has a NUL byte after its contents + and contains no interior NULs. How would we ensure the + NUL byte is exclusively borrowed by us? + + */ + @usableFromInline + internal var _countAndFlags: UInt64 + + @inlinable @inline(__always) + init( + _unsafeAssumingValidUTF8 start: UnsafeRawPointer, + _countAndFlags: UInt64, + owner: borrowing Owner + ) -> dependsOn(owner) Self { + self.unsafeBaseAddress = start + self._countAndFlags = _countAndFlags + + _invariantCheck() + } +} + +extension UTF8Span { + public init( + validating codeUnits: Span + ) throws(UTF8.EncodingError) -> dependsOn(codeUnits) Self { + self.unsafeBaseAddress = .init(codeUnits._start) + + let count = codeUnits._count + let isASCII = try unsafeBaseAddress._validateUTF8(limitedBy: count) + + self._countAndFlags = UInt64(truncatingIfNeeded: count) + if isASCII { + _setIsASCII() + } + _internalInvariant(self.count == codeUnits.count) + } + + @_alwaysEmitIntoClient + public init( + validatingUnsafe codeUnits: UnsafeBufferPointer, + owner: borrowing Owner + ) throws(UTF8.EncodingError) -> dependsOn(owner) Self { + try self.init( + validating: Span(unsafeElements: codeUnits, owner: owner)) + } + + // Question: do we want raw 
versions? + @_alwaysEmitIntoClient + public init( + validatingUnsafeRaw codeUnits: UnsafeRawBufferPointer, + owner: borrowing Owner + ) throws(UTF8.EncodingError) -> dependsOn(owner) Self { + try self.init( + validating: Span(unsafeBytes: codeUnits, owner: owner)) + } + + // Question: do we want separate count versions? + @_alwaysEmitIntoClient + public init( + validatingUnsafeStart start: UnsafePointer, + count: Int, + owner: borrowing Owner + ) throws(UTF8.EncodingError) -> dependsOn(owner) Self { + try self.init( + validating: Span(unsafeStart: start, count: count, owner: owner)) + } + + @_alwaysEmitIntoClient + public init( + validatingUnsafeStart start: UnsafeRawPointer, + count: Int, + owner: borrowing Owner + ) throws(UTF8.EncodingError) -> dependsOn(owner) Self { + try self.init( + validating: Span(unsafeStart: start, byteCount: count, owner: owner)) + } + + // Question: Do we do a raw version? String doesn't have one + // Also, should we do a UnsafePointer version, it's + // annoying to not have one sometimes...? + @_alwaysEmitIntoClient + public init( + validatingUnsafeRawCString nullTerminatedUTF8: UnsafeRawPointer, + owner: borrowing Owner + ) throws(UTF8.EncodingError) -> dependsOn(owner) Self { + // TODO: is there a better way? 
+ try self.init( + validatingUnsafeCString: nullTerminatedUTF8.assumingMemoryBound( + to: CChar.self + ), + owner: owner) + _internalInvariant(self.isNullTerminatedCString) + } + + @_alwaysEmitIntoClient + public init( + validatingUnsafeCString nullTerminatedUTF8: UnsafePointer, + owner: borrowing Owner + ) throws(UTF8.EncodingError) -> dependsOn(owner) Self { + let len = UTF8._nullCodeUnitOffset(in: nullTerminatedUTF8) + try self.init( + validatingUnsafeStart: UnsafeRawPointer(nullTerminatedUTF8), + count: len, + owner: owner) + self._setIsNullTerminatedCString(true) + } +} + + +// MARK: Canonical comparison + +@_unavailableInEmbedded +extension UTF8Span { + // HACK: working around lack of internals + internal var _str: String { unsafeBaseAddress._str(0.. Bool { + self._str == other._str + } + + /// Whether `self` orders less than `other` under Unicode Canonical + /// Equivalence using normalized code-unit order (in NFC). + public func isCanonicallyLessThan( + _ other: UTF8Span + ) -> Bool { + self._str < other._str + } +} + + + +// MARK: String + + +extension UTF8Span { + /// Calls a closure with a pointer to the viewed contiguous storage. + /// + /// The buffer pointer passed as an argument to `body` is valid only + /// during the execution of `withUnsafeBufferPointer(_:)`. + /// Do not store or return the pointer for later use. + /// + /// - Parameter body: A closure with an `UnsafeBufferPointer` parameter + /// that points to the viewed contiguous storage. If `body` has + /// a return value, that value is also used as the return value + /// for the `withUnsafeBufferPointer(_:)` method. The closure's + /// parameter is valid only for the duration of its execution. + /// - Returns: The return value of the `body` closure parameter. 
+ @_alwaysEmitIntoClient + borrowing public func withUnsafeBufferPointer< + E: Error, Result: ~Copyable & ~Escapable + >( + _ body: (_ buffer: borrowing UnsafeBufferPointer) throws(E) -> Result + ) throws(E) -> dependsOn(self) Result { + try body(unsafeBaseAddress._ubp(0.. UTF8Span { + let span = self[ + uncheckedOffsets: start ..< start &+ length + ].view(as: UInt8.self) + return try UTF8Span(validating: span) + } + + // TODO: Below are contingent on how we want to handle NUL-termination + public func parseNullTerminatedUTF8() throws -> UTF8Span { + fatalError() + } +} + +// TODO: Below is contingent on a Cursor or Iterator type +extension RawSpan.Cursor { + public mutating func parseUTF8(length: Int) throws -> UTF8Span { + fatalError() + } + public mutating func parseNullTerminatedUTF8() throws -> UTF8Span { + fatalError() + } +} +#endif + +// TODO: cString var, or something like that diff --git a/Sources/Future/UTF8Span/UTF8SpanBits.swift b/Sources/Future/UTF8Span/UTF8SpanBits.swift new file mode 100644 index 000000000..e04b5f5e8 --- /dev/null +++ b/Sources/Future/UTF8Span/UTF8SpanBits.swift @@ -0,0 +1,187 @@ +extension UTF8Span { + /// Returns whether the validated contents were all-ASCII. This is checked at + /// initialization time and remembered. + @_alwaysEmitIntoClient + public var isASCII: Bool { + 0 != _countAndFlags & Self._asciiBit + } + + + /// Returns whether the contents are known to be NFC. This is not + /// always checked at initialization time and is set by `checkForNFC`. + @_unavailableInEmbedded + @_alwaysEmitIntoClient + public var isKnownNFC: Bool { + 0 != _countAndFlags & Self._nfcBit + } + + /// Returns whether the contents are a null-terminated C string. 
If true, there + /// is a guaranteed null byte after the end of `count` and no null bytes stored + /// within the span + @_alwaysEmitIntoClient + public var isNullTerminatedCString: Bool { + 0 != _countAndFlags & Self._nullTerminatedCStringBit + } + + // Takes a parameter because `extracting` may need to un-set the bit + @_alwaysEmitIntoClient + internal mutating func _setIsNullTerminatedCString(_ value: Bool) { + if value { + _countAndFlags |= Self._nullTerminatedCStringBit + } else { + _countAndFlags &= ~Self._nullTerminatedCStringBit + } + _invariantCheck() + } + + // Set the isASCII bit to true (also isNFC) + @_alwaysEmitIntoClient + internal mutating func _setIsASCII() { + self._countAndFlags |= Self._asciiBit | Self._nfcBit + } + + /// Do a scan checking for whether the contents are in Normal Form C. + /// When the contents are in NFC, canonical equivalence checks are much + /// faster. + /// + /// `quickCheck` will check for a subset of NFC contents using the + /// NFCQuickCheck algorithm, which is faster than the full normalization + /// algorithm. However, it cannot detect all NFC contents. + /// + /// Updates the `isKnownNFC` bit. + @_unavailableInEmbedded + public mutating func checkForNFC( + quickCheck: Bool + ) -> Bool { + if isKnownNFC { return true } + + if quickCheck { + var cur = 0 + while cur < count { + let (s, next) = decodeNextScalar(cur) + cur = next + if s.value < 0x300 { + continue + } + // TODO: Check (internal) Unicode NFCQuickCheck=YES property + return false + } + self._countAndFlags |= Self._nfcBit + return true + } + + // TODO: use faster internal algorithm + let normalized = _str._nfcCodeUnits + guard unsafeBaseAddress._urbp( + 0.. Bool { + if isKnownSingleScalarCharacters { return true } + + if quickCheck { + var idx = 0 + var currentScalar: Unicode.Scalar? 
= nil + while idx < count { + let (scalar, next) = decodeNextScalar(idx) + + if let cur = currentScalar { + guard _quickHasGraphemeBreakBetween(cur, scalar) else { + return false + } + } + + currentScalar = scalar + idx = next + } + + self._countAndFlags |= Self._singleScalarCharactersBit + return true + } + + var idx = 0 + while idx < count { + let nextIdx = nextCharacterStart(uncheckedAssumingAligned: idx) + guard nextIdx == nextScalarStart(idx) else { + return false + } + idx = nextIdx + } + + self._countAndFlags |= Self._singleScalarCharactersBit + return true + } +} + +extension UTF8Span { + @_alwaysEmitIntoClient @inline(__always) + internal static var _asciiBit: UInt64 { + 0x8000_0000_0000_0000 + } + + @_alwaysEmitIntoClient @inline(__always) + internal static var _nfcBit: UInt64 { + 0x4000_0000_0000_0000 + } + + @_alwaysEmitIntoClient @inline(__always) + internal static var _singleScalarCharactersBit: UInt64 { + 0x2000_0000_0000_0000 + } + + @_alwaysEmitIntoClient @inline(__always) + internal static var _nullTerminatedCStringBit: UInt64 { + 0x1000_0000_0000_0000 + } + + @_alwaysEmitIntoClient @inline(__always) + internal static var _countMask: UInt64 { + 0x00FF_FFFF_FFFF_FFFF + } + + @_alwaysEmitIntoClient @inline(__always) + internal static var _flagsMask: UInt64 { + 0xFF00_0000_0000_0000 + } + + @_alwaysEmitIntoClient + public var count: Int { + Int(truncatingIfNeeded: _countAndFlags & Self._countMask) + } + +// @_alwaysEmitIntoClient @inline(__always) +// internal var _end: UnsafeRawPointer { +// unsafeBaseAddress.advanced(by: _byteCount) +// } +} + + diff --git a/Sources/Future/UTF8Span/UTF8SpanCollectionLike.swift b/Sources/Future/UTF8Span/UTF8SpanCollectionLike.swift new file mode 100644 index 000000000..accafad9e --- /dev/null +++ b/Sources/Future/UTF8Span/UTF8SpanCollectionLike.swift @@ -0,0 +1,186 @@ +// TODO: add the isASCII fast paths + +extension UTF8Span { + /// Accesses the byte at the specified `position`. 
+ /// + /// - Parameter position: The offset of the byte to access. `position` + /// must be greater or equal to zero, and less than `count`. + @_alwaysEmitIntoClient + public subscript(_ position: Int) -> UInt8 { + precondition(boundsCheck(position)) + return self[unchecked: position] + } + + /// Accesses the byte at the specified `position`. + /// + /// This subscript does not validate `position`; this is an unsafe operation. + /// + /// - Parameter position: The offset of the element to access. `position` + /// must be greater or equal to zero, and less than `count`. + @_alwaysEmitIntoClient + public subscript(unchecked position: Int) -> UInt8 { + _internalInvariant(boundsCheck(position)) + return unsafeBaseAddress._loadByte(position) + } + + /// Constructs a new `UTF8Span` span over the bytes within the supplied + /// range of positions within this span. + /// + /// `bounds` must be scalar aligned. + /// + /// The returned span's first item is always at offset 0; unlike buffer + /// slices, extracted spans do not generally share their indices with the + /// span from which they are extracted. + /// + /// - Parameter bounds: A valid range of positions. Every position in + /// this range must be within the bounds of this `Span`. + /// + /// - Returns: A `UTF8Span` over the bytes within `bounds`. + @_alwaysEmitIntoClient + public func extracting(_ bounds: some RangeExpression) -> Self { + let bounds = bounds.relative(to: Int.min.. + ) -> Self { + let bounds = bounds.relative(to: Int.min.. + ) -> Self { + let bounds = bounds.relative(to: Int.min.. Bool { + guard count == other.count else { + return false + } + for i in 0..) -> Bool { + var idx = 0 + for elt in other { + guard idx < count, self[unchecked: idx] == elt else { + return false + } + idx += 1 + } + return idx == count + } + + /// Whether this span has the same `Unicode.Scalar`s as `other`. 
+ @_alwaysEmitIntoClient + public func scalarsEqual( + to other: some Sequence + ) -> Bool { + var idx = 0 + for elt in other { + guard idx < count else { return false } + let (scalar, next) = decodeNextScalar(uncheckedAssumingAligned: idx) + guard scalar == elt else { return false } + idx = next + } + return idx == count + } + + /// Whether this span has the same `Character`s as `other`. + @_unavailableInEmbedded + @_alwaysEmitIntoClient + public func charactersEqual( + to other: some Sequence + ) -> Bool { + var idx = 0 + for elt in other { + guard idx < count else { return false } + let (scalar, next) = decodeNextCharacter( + uncheckedAssumingAligned: idx) + guard scalar == elt else { return false } + idx = next + } + return idx == count + } + + @_alwaysEmitIntoClient + public var isEmpty: Bool { + count == 0 + } +} + +extension UTF8Span: ContiguousStorage { + @_alwaysEmitIntoClient + public var storage: Span { + Span( + unsafeStart: unsafeBaseAddress, + byteCount: count, + owner: self) + } +} diff --git a/Sources/Future/UTF8Span/UTF8SpanFundamentals.swift b/Sources/Future/UTF8Span/UTF8SpanFundamentals.swift new file mode 100644 index 000000000..b761cc754 --- /dev/null +++ b/Sources/Future/UTF8Span/UTF8SpanFundamentals.swift @@ -0,0 +1,545 @@ +// Core Scalar API +extension UTF8Span { + /// Whether `i` is on a boundary between Unicode scalar values. + @_alwaysEmitIntoClient + public func isScalarAligned(_ i: Int) -> Bool { + if i == count || i == 0 { return true } + precondition(boundsCheck(i)) + return isScalarAligned(unchecked: i) + } + + /// Whether `i` is on a boundary between Unicode scalar values. + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. 
+ @_alwaysEmitIntoClient + public func isScalarAligned(unchecked i: Int) -> Bool { + if i == count || i == 0 { return true } + _internalInvariant(boundsCheck(i)) + return unsafeBaseAddress._isScalarAligned(i) + } + + /// Whether `range`'s bounds are aligned to `Unicode.Scalar` boundaries. + @_alwaysEmitIntoClient + public func isScalarAligned(_ range: Range) -> Bool { + isScalarAligned(range.lowerBound) && isScalarAligned(range.upperBound) + } + + /// Whether `range`'s bounds are aligned to `Unicode.Scalar` boundaries. + /// + /// This function does not validate that `range` is within the span's bounds; + /// this is an unsafe operation. + @_alwaysEmitIntoClient + public func isScalarAligned(unchecked range: Range) -> Bool { + isScalarAligned(unchecked: range.lowerBound) + && isScalarAligned(unchecked: range.upperBound) + } + + /// Returns the start of the next `Unicode.Scalar` after the one starting at + /// `i`, or the end of the span if `i` denotes the final scalar. + /// + /// `i` must be scalar-aligned. + @_alwaysEmitIntoClient + public func nextScalarStart(_ i: Int) -> Int { + precondition(boundsCheck(i)) + return nextScalarStart(unchecked: i) + } + + /// Returns the start of the next `Unicode.Scalar` after the one starting at + /// `i`, or the end of the span if `i` denotes the final scalar. + /// + /// `i` must be scalar-aligned. + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. + @_alwaysEmitIntoClient + public func nextScalarStart(unchecked i: Int) -> Int { + _internalInvariant(boundsCheck(i)) + precondition(isScalarAligned(i)) + return nextScalarStart(uncheckedAssumingAligned: i) + } + + /// Returns the start of the next `Unicode.Scalar` after the one starting at + /// `i`, or the end of the span if `i` denotes the final scalar. + /// + /// `i` must be scalar-aligned. + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. 
+ /// + /// This function does not validate that `i` is scalar-aligned; this is an + /// unsafe operation if `i` isn't. + @_alwaysEmitIntoClient + public func nextScalarStart( + uncheckedAssumingAligned i: Int + ) -> Int { + _internalInvariant(boundsCheck(i)) + _internalInvariant(isScalarAligned(i)) + return unsafeBaseAddress._nextScalarStart(i) + } + + /// Returns the start of the `Unicode.Scalar` ending at `i`, i.e. the scalar + /// before the one starting at `i` or the last scalar if `i` is the end of + /// the span. + /// + /// `i` must be scalar-aligned. + @_alwaysEmitIntoClient + public func previousScalarStart(_ i: Int) -> Int { + precondition(boundsCheck(i&-1)) + return previousScalarStart(unchecked: i) + } + + /// Returns the start of the `Unicode.Scalar` ending at `i`, i.e. the scalar + /// before the one starting at `i` or the last scalar if `i` is the end of + /// the span. + /// + /// `i` must be scalar-aligned. + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. + @_alwaysEmitIntoClient + public func previousScalarStart(unchecked i: Int) -> Int { + _internalInvariant(boundsCheck(i&-1)) + precondition(isScalarAligned(i)) + return previousScalarStart(uncheckedAssumingAligned: i) + } + + /// Returns the start of the `Unicode.Scalar` ending at `i`, i.e. the scalar + /// before the one starting at `i` or the last scalar if `i` is the end of + /// the span. + /// + /// `i` must be scalar-aligned. + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. + /// + /// + /// This function does not validate that `i` is scalar-aligned; this is an + /// unsafe operation if `i` isn't. 
+ @_alwaysEmitIntoClient + public func previousScalarStart( + uncheckedAssumingAligned i: Int + ) -> Int { + _internalInvariant(boundsCheck(i&-1)) + _internalInvariant(isScalarAligned(i)) + return unsafeBaseAddress._previousScalarStart(i) + } + + /// Decode the `Unicode.Scalar` starting at `i`. Return it and the start of + /// the next scalar. + /// + /// `i` must be scalar-aligned. + @_alwaysEmitIntoClient + public func decodeNextScalar( + _ i: Int + ) -> (Unicode.Scalar, nextScalarStart: Int) { + precondition(boundsCheck(i)) + return decodeNextScalar(unchecked: i) + } + + /// Decode the `Unicode.Scalar` starting at `i`. Return it and the start of + /// the next scalar. + /// + /// `i` must be scalar-aligned. + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. + @_alwaysEmitIntoClient + public func decodeNextScalar( + unchecked i: Int + ) -> (Unicode.Scalar, nextScalarStart: Int) { + _internalInvariant(boundsCheck(i)) + precondition(isScalarAligned(i)) + return decodeNextScalar(uncheckedAssumingAligned: i) + } + + /// Decode the `Unicode.Scalar` starting at `i`. Return it and the start of + /// the next scalar. + /// + /// `i` must be scalar-aligned. + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. + /// + /// + /// This function does not validate that `i` is scalar-aligned; this is an + /// unsafe operation if `i` isn't. + @_alwaysEmitIntoClient + public func decodeNextScalar( + uncheckedAssumingAligned i: Int + ) -> (Unicode.Scalar, nextScalarStart: Int) { + _internalInvariant(boundsCheck(i)) + _internalInvariant(isScalarAligned(i)) + return unsafeBaseAddress._decodeScalar(startingAt: i) + } + + /// Decode the `Unicode.Scalar` ending at `i`, i.e. the previous scalar. + /// Return it and the start of that scalar. + /// + /// `i` must be scalar-aligned. 
+ @_alwaysEmitIntoClient + public func decodePreviousScalar( + _ i: Int + ) -> (Unicode.Scalar, previousScalarStart: Int) { + precondition(boundsCheck(i &- 1)) + return decodePreviousScalar(unchecked: i) + } + + /// Decode the `Unicode.Scalar` ending at `i`, i.e. the previous scalar. + /// Return it and the start of that scalar. + /// + /// `i` must be scalar-aligned. + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. + @_alwaysEmitIntoClient + public func decodePreviousScalar( + unchecked i: Int + ) -> (Unicode.Scalar, previousScalarStart: Int) { + _internalInvariant(boundsCheck(i &- 1)) + precondition(isScalarAligned(i)) + return decodePreviousScalar(uncheckedAssumingAligned: i) + } + + /// Decode the `Unicode.Scalar` ending at `i`, i.e. the previous scalar. + /// Return it and the start of that scalar. + /// + /// `i` must be scalar-aligned. + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. + /// + /// + /// This function does not validate that `i` is scalar-aligned; this is an + /// unsafe operation if `i` isn't. + @_alwaysEmitIntoClient + public func decodePreviousScalar( + uncheckedAssumingAligned i: Int + ) -> (Unicode.Scalar, previousScalarStart: Int) { + _internalInvariant(boundsCheck(i &- 1)) + _internalInvariant(isScalarAligned(i)) + return unsafeBaseAddress._decodeScalar(endingAt: i) + } +} + +// Derived Scalar API +extension UTF8Span { + /// Find the nearest scalar-aligned position `<= i`. + @_alwaysEmitIntoClient + public func scalarAlignBackwards(_ i: Int) -> Int { + precondition(boundsCheck(i)) + return scalarAlignBackwards(unchecked: i) + } + + /// Find the nearest scalar-aligned position `<= i`. + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. 
+ @_alwaysEmitIntoClient + public func scalarAlignBackwards(unchecked i: Int) -> Int { + _internalInvariant(boundsCheck(i)) + return unsafeBaseAddress._scalarAlign(i) + } + + /// Find the nearest scalar-aligned position `>= i`. + @_alwaysEmitIntoClient + public func scalarAlignForwards(_ i: Int) -> Int { + var i = i + while _slowPath(!isScalarAligned(i)) { + i &+= 1 + } + return i + } + + /// Find the nearest scalar-aligned position `>= i`. + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. + @_alwaysEmitIntoClient + public func scalarAlignForwards(unchecked i: Int) -> Int { + var i = i + while _slowPath(!isScalarAligned(unchecked: i)) { + i &+= 1 + } + return i + } +} + +// Core Character API +extension UTF8Span { + // TODO: Single-scalar fast paths + + /// Whether `i` is on a boundary between `Character`s (i.e. grapheme + /// clusters). + @_alwaysEmitIntoClient + public func isCharacterAligned(_ i: Int) -> Bool { + if i == count || i == 0 { return true } + precondition(boundsCheck(i)) + return isCharacterAligned(unchecked: i) + } + + /// Whether `i` is on a boundary between `Character`s (i.e. grapheme + /// clusters). + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. + @_alwaysEmitIntoClient + public func isCharacterAligned(unchecked i: Int) -> Bool { + if i == count || i == 0 { return true } + _internalInvariant(boundsCheck(i)) + return unsafeBaseAddress._isCharacterAligned(i, limitedBy: count) + } + + /// Returns the start of the next `Character` (i.e. grapheme cluster) after + /// the one starting at `i`, or the end of the span if `i` denotes the final + /// `Character`. + /// + /// `i` must be `Character`-aligned. + @_alwaysEmitIntoClient + public func nextCharacterStart(_ i: Int) -> Int { + precondition(boundsCheck(i)) + return nextCharacterStart(unchecked: i) + } + + /// Returns the start of the next `Character` (i.e. 
grapheme cluster) after + /// the one starting at `i`, or the end of the span if `i` denotes the final + /// `Character`. + /// + /// `i` must be `Character`-aligned. + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. + @_alwaysEmitIntoClient + public func nextCharacterStart(unchecked i: Int) -> Int { + _internalInvariant(boundsCheck(i)) + precondition(isCharacterAligned(i)) + return nextCharacterStart(uncheckedAssumingAligned: i) + } + + /// Returns the start of the next `Character` (i.e. grapheme cluster) after + /// the one starting at `i`, or the end of the span if `i` denotes the final + /// `Character`. + /// + /// `i` must be `Character`-aligned. + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. + /// + /// This function does not validate that `i` is `Character`-aligned; this is + /// an unsafe operation if `i` isn't. + @_alwaysEmitIntoClient + public func nextCharacterStart( + uncheckedAssumingAligned i: Int + ) -> Int { + _internalInvariant(boundsCheck(i)) + _internalInvariant(isCharacterAligned(i)) + return unsafeBaseAddress._nextCharacterStart(i, limitedBy: count) + } + + /// Returns the start of the `Character` (i.e. grapheme cluster) ending at + /// `i`, i.e. the `Character` before the one starting at `i` or the last + /// `Character` if `i` is the end of the span. + /// + /// `i` must be `Character`-aligned. + @_alwaysEmitIntoClient + public func previousCharacterStart(_ i: Int) -> Int { + precondition(boundsCheck(i&-1)) + return previousCharacterStart(unchecked: i) + } + + /// Returns the start of the `Character` (i.e. grapheme cluster) ending at + /// `i`, i.e. the `Character` before the one starting at `i` or the last + /// `Character` if `i` is the end of the span. + /// + /// `i` must be `Character`-aligned. 
+ /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. + @_alwaysEmitIntoClient + public func previousCharacterStart(unchecked i: Int) -> Int { + _internalInvariant(boundsCheck(i&-1)) + precondition(isCharacterAligned(i)) + return previousCharacterStart(uncheckedAssumingAligned: i) + } + + /// Returns the start of the `Character` (i.e. grapheme cluster) ending at + /// `i`, i.e. the `Character` before the one starting at `i` or the last + /// `Character` if `i` is the end of the span. + /// + /// `i` must be `Character`-aligned. + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. + /// + /// This function does not validate that `i` is `Character`-aligned; this is + /// an unsafe operation if `i` isn't. + @_alwaysEmitIntoClient + public func previousCharacterStart( + uncheckedAssumingAligned i: Int + ) -> Int { + _internalInvariant(boundsCheck(i&-1)) + _internalInvariant(isCharacterAligned(i)) + return unsafeBaseAddress._previousCharacterStart(i, limitedBy: count) + } + + /// Decode the `Character` starting at `i` Return it and the start of the + /// next `Character`. + /// + /// `i` must be `Character`-aligned. + @_alwaysEmitIntoClient + public func decodeNextCharacter( + _ i: Int + ) -> (Character, nextCharacterStart: Int) { + precondition(boundsCheck(i)) + return decodeNextCharacter(unchecked: i) + } + + /// Decode the `Character` starting at `i` Return it and the start of the + /// next `Character`. + /// + /// `i` must be `Character`-aligned. + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. 
+ @_alwaysEmitIntoClient + public func decodeNextCharacter( + unchecked i: Int + ) -> (Character, nextCharacterStart: Int) { + _internalInvariant(boundsCheck(i)) + precondition(isCharacterAligned(i)) + return decodeNextCharacter(uncheckedAssumingAligned: i) + } + + /// Decode the `Character` starting at `i` Return it and the start of the + /// next `Character`. + /// + /// `i` must be `Character`-aligned. + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. + /// + /// This function does not validate that `i` is `Character`-aligned; this is + /// an unsafe operation if `i` isn't. + @_alwaysEmitIntoClient + public func decodeNextCharacter( + uncheckedAssumingAligned i: Int + ) -> (Character, nextCharacterStart: Int) { + _internalInvariant(boundsCheck(i)) + _internalInvariant(isCharacterAligned(i)) + return unsafeBaseAddress._decodeCharacter( + startingAt: i, limitedBy: count) + } + + /// Decode the `Character` (i.e. grapheme cluster) ending at `i`, i.e. the + /// previous `Character`. Return it and the start of that `Character`. + /// + /// `i` must be `Character`-aligned. + @_alwaysEmitIntoClient + public func decodePreviousCharacter(_ i: Int) -> (Character, Int) { + precondition(boundsCheck(i &- 1)) + return decodePreviousCharacter(unchecked: i) + } + + /// Decode the `Character` (i.e. grapheme cluster) ending at `i`, i.e. the + /// previous `Character`. Return it and the start of that `Character`. + /// + /// `i` must be `Character`-aligned. + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. + @_alwaysEmitIntoClient + public func decodePreviousCharacter( + unchecked i: Int + ) -> (Character, Int) { + _internalInvariant(boundsCheck(i &- 1)) + precondition(isCharacterAligned(i)) + return decodePreviousCharacter(uncheckedAssumingAligned: i) + } + + /// Decode the `Character` (i.e. grapheme cluster) ending at `i`, i.e. 
the + /// previous `Character`. Return it and the start of that `Character`. + /// + /// `i` must be `Character`-aligned. + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. + /// + /// This function does not validate that `i` is `Character`-aligned; this is + /// an unsafe operation if `i` isn't. + @_alwaysEmitIntoClient + public func decodePreviousCharacter( + uncheckedAssumingAligned i: Int + ) -> (Character, Int) { + _internalInvariant(boundsCheck(i &- 1)) + _internalInvariant(isCharacterAligned(i)) + return unsafeBaseAddress._decodeCharacter( + endingAt: i, limitedBy: count) + } + +} + +// Derived Character API +extension UTF8Span { + /// Find the nearest `Character` (i.e. grapheme cluster)-aligned position + /// that is `<= i`. + @_alwaysEmitIntoClient + public func characterAlignBackwards(_ i: Int) -> Int { + precondition(i == count || boundsCheck(i)) + return characterAlignBackwards(unchecked: i) + } + + /// Find the nearest `Character` (i.e. grapheme cluster)-aligned position + /// that is `<= i`. + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. + @_alwaysEmitIntoClient + public func characterAlignBackwards(unchecked i: Int) -> Int { + _internalInvariant(i == count || boundsCheck(i)) + var i = i + while _slowPath(!isCharacterAligned(unchecked: i)) { + i &-= 1 + } + return i + } + + /// Find the nearest `Character` (i.e. grapheme cluster)-aligned position + /// that is `>= i`. + @_alwaysEmitIntoClient + public func characterAlignForwards(_ i: Int) -> Int { + precondition(i == count || boundsCheck(i)) + return characterAlignForwards(unchecked: i) + } + + /// Find the nearest `Character` (i.e. grapheme cluster)-aligned position + /// that is `>= i`. + /// + /// This function does not validate that `i` is within the span's bounds; + /// this is an unsafe operation. 
+ @_alwaysEmitIntoClient + public func characterAlignForwards(unchecked i: Int) -> Int { + _internalInvariant(i == count || boundsCheck(i)) + var i = i + while _slowPath(!isCharacterAligned(unchecked: i)) { + i &+= 1 + } + return i + } +} + +// TODO: public? +extension UTF8Span { + /// Whether `i` is in bounds + @_alwaysEmitIntoClient + public func boundsCheck(_ i: Int) -> Bool { + i >= 0 && i < count + } + /// Whether `bounds` is in bounds + @_alwaysEmitIntoClient + public func boundsCheck(_ bounds: Range) -> Bool { + boundsCheck(bounds.lowerBound) + && boundsCheck(bounds.upperBound &- 1) + } +} + +// Future work: UTF-16 support when we get views diff --git a/Sources/Future/UTF8Span/UTF8SpanStaticString.swift b/Sources/Future/UTF8Span/UTF8SpanStaticString.swift new file mode 100644 index 000000000..a69f3eba8 --- /dev/null +++ b/Sources/Future/UTF8Span/UTF8SpanStaticString.swift @@ -0,0 +1,30 @@ + +#if false +extension UTF8Span { + @_alwaysEmitIntoClient + public init( + _ ss: StaticString + ) -> dependsOn(immortal) Self { + if ss.hasPointerRepresentation { + self.init( + _unsafeAssumingValidUTF8: UnsafeRawPointer(ss.utf8Start), + _countAndFlags: UInt64(ss.utf8CodeUnitCount), // TODO: isASCII + owner: ss + ) + } else { + fatalError("TODO: spill to stack, would this beed a coroutine? also, no longer immortal") + } + } +} + +extension UTF8Span: ExpressibleByStringLiteral { + public init(stringLiteral: StaticString) { + fatalError() + } +} +#endif + +/// +/// +/// + diff --git a/Sources/Future/UTF8Span/UTF8SpanString.swift b/Sources/Future/UTF8Span/UTF8SpanString.swift new file mode 100644 index 000000000..337db1163 --- /dev/null +++ b/Sources/Future/UTF8Span/UTF8SpanString.swift @@ -0,0 +1,95 @@ +#if false +extension StaticString { + public var utf8Span: UTF8Span { + _read { + fatalError() + } + } +} +#endif + +@_unavailableInEmbedded +extension String { + // NOTE: If `self` is lazily bridged NSString, or is in a small-string + // form, memory may be allocated... 
+ public var utf8Span: UTF8Span { + _read { + + + var copy = self + copy.makeContiguousUTF8() + copy.append("12345678901234567890") // make native + copy.removeLast(20) // remove what we just did + + // crazy unsafe + let buffer = utf8.withContiguousStorageIfAvailable({ $0 })! + + let span = Span(unsafeElements: buffer, owner: copy) + yield UTF8Span( + _unsafeAssumingValidUTF8: .init(span._start), + _countAndFlags: UInt64(span.count), // TODO: set the flags + owner: span) + + +/* + The below doesn't work, utf8Span[0] returns "0" instead of the first byte + + let a = ContiguousArray(self.utf8) + let span = Span( + unsafeStart: a._baseAddressIfContiguous!, count: a.count, owner: a + ) + yield UTF8Span( + _unsafeAssumingValidUTF8: .init(span._start), + _countAndFlags: UInt64(span.count), // TODO: set the flags + owner: span) +*/ + +#if false + +// let span: Span +// var copy = self +// copy.makeContiguousUTF8() +// + +// copy.withUTF8 { +// span = +// } + if count < 16 { // Wrong way to know whether the String is smol +// if _guts.isSmall { +// let /*@addressable*/ rawStorage = _guts.asSmall._storage +// let span = RawSpan( +// unsafeRawPointer: UnsafeRawPointer(Builtin.adressable(rawStorage)), +// count: MemoryLayout<_SmallString.RawBitPattern>.size, +// owner: self +// ) +// yield span.view(as: UTF8.CodeUnit.self) + + let a = ContiguousArray(self.utf8) +// yield a.storage + span = Span( + unsafeStart: a._baseAddressIfContiguous!, count: 1, owner: a + ) + } + else if let buffer = utf8.withContiguousStorageIfAvailable({ $0 }) { + // this is totally wrong, but there is a way with stdlib-internal API + span = Span(unsafeElements: buffer, owner: self) + } + else { // copy non-fast code units if we don't have eager bridging + let a = ContiguousArray(self.utf8) +// yield a.storage + span = Span( + unsafeStart: a._baseAddressIfContiguous!, count: 1, owner: a + ) + } + + // TODO: set null-terminated bit + +// let span = self.utf8.storage + yield UTF8Span( + 
_unsafeAssumingValidUTF8: .init(span._start), + _countAndFlags: UInt64(span.count), // TODO: set the flags + owner: span) +#endif + } + } +} diff --git a/Sources/Future/UTF8Span/UTF8SpanViews.swift b/Sources/Future/UTF8Span/UTF8SpanViews.swift new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/Sources/Future/UTF8Span/UTF8SpanViews.swift @@ -0,0 +1 @@ + diff --git a/Sources/Future/UTF8Span/stdlibDuplicates.swift b/Sources/Future/UTF8Span/stdlibDuplicates.swift new file mode 100644 index 000000000..97b942e85 --- /dev/null +++ b/Sources/Future/UTF8Span/stdlibDuplicates.swift @@ -0,0 +1,253 @@ +/* + + Duplicates of functions currently in the stdlib + + */ + + +import Builtin + +@inlinable @inline(__always) +internal func _utf8ScalarLength(_ x: UInt8) -> Int { + _internalInvariant(!UTF8.isContinuation(x)) + if UTF8.isASCII(x) { return 1 } + // TODO(String micro-performance): check codegen + return (~x).leadingZeroBitCount +} + +@usableFromInline @_transparent +internal func _internalInvariant( + _ condition: @autoclosure () -> Bool, + _ message: @autoclosure () -> String = String(), + file: StaticString = #file, line: UInt = #line +) { + assert(condition(), message()) +} + +@inlinable +@inline(__always) +internal func _decodeUTF8(_ x: UInt8) -> Unicode.Scalar { + _internalInvariant(UTF8.isASCII(x)) + return Unicode.Scalar(_unchecked: UInt32(x)) +} + +@inlinable +@inline(__always) +internal func _decodeUTF8(_ x: UInt8, _ y: UInt8) -> Unicode.Scalar { + _internalInvariant(_utf8ScalarLength(x) == 2) + _internalInvariant(UTF8.isContinuation(y)) + let x = UInt32(x) + let value = ((x & 0b0001_1111) &<< 6) | _continuationPayload(y) + return Unicode.Scalar(_unchecked: value) +} + +@inlinable +@inline(__always) +internal func _decodeUTF8( + _ x: UInt8, _ y: UInt8, _ z: UInt8 +) -> Unicode.Scalar { + _internalInvariant(_utf8ScalarLength(x) == 3) + _internalInvariant(UTF8.isContinuation(y) && UTF8.isContinuation(z)) + let x = UInt32(x) + let value = ((x & 
0b0000_1111) &<< 12) + | (_continuationPayload(y) &<< 6) + | _continuationPayload(z) + return Unicode.Scalar(_unchecked: value) +} + +@inlinable +@inline(__always) +internal func _decodeUTF8( + _ x: UInt8, _ y: UInt8, _ z: UInt8, _ w: UInt8 +) -> Unicode.Scalar { + _internalInvariant(_utf8ScalarLength(x) == 4) + _internalInvariant( + UTF8.isContinuation(y) && UTF8.isContinuation(z) + && UTF8.isContinuation(w)) + let x = UInt32(x) + let value = ((x & 0b0000_1111) &<< 18) + | (_continuationPayload(y) &<< 12) + | (_continuationPayload(z) &<< 6) + | _continuationPayload(w) + return Unicode.Scalar(_unchecked: value) +} + +@inlinable +@inline(__always) +internal func _continuationPayload(_ x: UInt8) -> UInt32 { + return UInt32(x & 0x3F) +} + +extension Unicode.Scalar { + @inlinable + init(_unchecked value: UInt32) { + // Hacked together + self = Builtin.reinterpretCast(value) + } +} + +@inlinable @inline(__always) +internal func _utf8ScalarLength( + _ utf8: UnsafeBufferPointer, endingAt i: Int + ) -> Int { + var len = 1 + while UTF8.isContinuation(utf8[i &- len]) { + len &+= 1 + } + _internalInvariant(len == _utf8ScalarLength(utf8[i &- len])) + return len +} + + +internal func _quickHasGraphemeBreakBetween( + _ lhs: Unicode.Scalar, _ rhs: Unicode.Scalar +) -> Bool { + + // CR-LF is a special case: no break between these + if lhs == Unicode.Scalar(_CR) && rhs == Unicode.Scalar(_LF) { + return false + } + + // Whether the given scalar, when it appears paired with another scalar + // satisfying this property, has a grapheme break between it and the other + // scalar. + func hasBreakWhenPaired(_ x: Unicode.Scalar) -> Bool { + // TODO: This doesn't generate optimal code, tune/re-write at a lower + // level. + // + // NOTE: Order of case ranges affects codegen, and thus performance. All + // things being equal, keep existing order below. 
+ switch x.value { + // Unified CJK Han ideographs, common and some supplemental, amongst + // others: + // U+3400 ~ U+A4CF + case 0x3400...0xa4cf: return true + + // Repeat sub-300 check, this is beneficial for common cases of Latin + // characters embedded within non-Latin script (e.g. newlines, spaces, + // proper nouns and/or jargon, punctuation). + // + // NOTE: CR-LF special case has already been checked. + case 0x0000...0x02ff: return true + + // Non-combining kana: + // U+3041 ~ U+3096 + // U+30A1 ~ U+30FC + case 0x3041...0x3096: return true + case 0x30a1...0x30fc: return true + + // Non-combining modern (and some archaic) Cyrillic: + // U+0400 ~ U+0482 (first half of Cyrillic block) + case 0x0400...0x0482: return true + + // Modern Arabic, excluding extenders and prependers: + // U+061D ~ U+064A + case 0x061d...0x064a: return true + + // Precomposed Hangul syllables: + // U+AC00 ~ U+D7AF + case 0xac00...0xd7af: return true + + // Common general use punctuation, excluding extenders: + // U+2010 ~ U+2029 + case 0x2010...0x2029: return true + + // CJK punctuation characters, excluding extenders: + // U+3000 ~ U+3029 + case 0x3000...0x3029: return true + + // Full-width forms: + // U+FF01 ~ U+FF9D + case 0xFF01...0xFF9D: return true + + default: return false + } + } + return hasBreakWhenPaired(lhs) && hasBreakWhenPaired(rhs) +} + + +private var _CR: UInt8 { return 0x0d } +private var _LF: UInt8 { return 0x0a } + +internal func _allASCII(_ input: UnsafeBufferPointer) -> Bool { + if input.isEmpty { return true } + + // NOTE: Avoiding for-in syntax to avoid bounds checks + // + // TODO(String performance): SIMD-ize + // + let count = input.count + var ptr = UnsafeRawPointer(input.baseAddress._unsafelyUnwrappedUnchecked) + + let asciiMask64 = 0x8080_8080_8080_8080 as UInt64 + let asciiMask32 = UInt32(truncatingIfNeeded: asciiMask64) + let asciiMask16 = UInt16(truncatingIfNeeded: asciiMask64) + let asciiMask8 = UInt8(truncatingIfNeeded: asciiMask64) + + let end128 
= ptr + count & ~(MemoryLayout<(UInt64, UInt64)>.stride &- 1) + let end64 = ptr + count & ~(MemoryLayout.stride &- 1) + let end32 = ptr + count & ~(MemoryLayout.stride &- 1) + let end16 = ptr + count & ~(MemoryLayout.stride &- 1) + let end = ptr + count + + + while ptr < end128 { + let pair = ptr.loadUnaligned(as: (UInt64, UInt64).self) + let result = (pair.0 | pair.1) & asciiMask64 + guard result == 0 else { return false } + ptr = ptr + MemoryLayout<(UInt64, UInt64)>.stride + } + + // If we had enough bytes for two iterations of this, we would have hit + // the loop above, so we only need to do this once + if ptr < end64 { + let value = ptr.loadUnaligned(as: UInt64.self) + guard value & asciiMask64 == 0 else { return false } + ptr = ptr + MemoryLayout.stride + } + + if ptr < end32 { + let value = ptr.loadUnaligned(as: UInt32.self) + guard value & asciiMask32 == 0 else { return false } + ptr = ptr + MemoryLayout.stride + } + + if ptr < end16 { + let value = ptr.loadUnaligned(as: UInt16.self) + guard value & asciiMask16 == 0 else { return false } + ptr = ptr + MemoryLayout.stride + } + + if ptr < end { + let value = ptr.loadUnaligned(fromByteOffset: 0, as: UInt8.self) + guard value & asciiMask8 == 0 else { return false } + } + _internalInvariant(ptr == end || ptr + 1 == end) + return true +} + + +extension Optional { + /// - Returns: `unsafelyUnwrapped`. + /// + /// This version is for internal stdlib use; it avoids any checking + /// overhead for users, even in Debug builds. 
+ @inlinable + internal var _unsafelyUnwrappedUnchecked: Wrapped { + @inline(__always) + get { + if let x = self { + return x + } + _internalInvariant(false, "_unsafelyUnwrappedUnchecked of nil optional") + } + } +} + +extension Unicode.Scalar { + internal static var _replacementCharacter: Unicode.Scalar { + return Unicode.Scalar(_unchecked: 0xFFFD) + } +} + diff --git a/Sources/Future/UTF8Span/stdlibHelpers.swift b/Sources/Future/UTF8Span/stdlibHelpers.swift new file mode 100644 index 000000000..f203f1a61 --- /dev/null +++ b/Sources/Future/UTF8Span/stdlibHelpers.swift @@ -0,0 +1,189 @@ +/* + + Additional helpers build on stdlibDuplicates.swift + + */ + +import Builtin + +extension UnsafeRawPointer { + @_alwaysEmitIntoClient + func _loadByte(_ i: Int) -> UInt8 { + _internalInvariant(i >= 0) + return (self+i).loadUnaligned(as: UInt8.self) + } + + @_alwaysEmitIntoClient + func _isUTF8Continuation(_ i: Int) -> Bool { + UTF8.isContinuation(_loadByte(i)) + } + + @_alwaysEmitIntoClient + func _isScalarAligned(_ i: Int) -> Bool { + _internalInvariant(i >= 0) + return !_isUTF8Continuation(i) + } + + @_alwaysEmitIntoClient + func _nextScalarStart(_ i: Int) -> Int { + i &+ _utf8ScalarLength(_loadByte(i)) + } + + // NOTE: Adaptation of `_decodeScalar` to work on URP + @_alwaysEmitIntoClient + internal func _decodeScalar( + startingAt i: Int + ) -> (Unicode.Scalar, nextScalarStart: Int) { + let cu0 = _loadByte(i) + let len = _utf8ScalarLength(cu0) + let next = len &+ i + switch len { + case 1: return (_decodeUTF8(cu0), next) + case 2: return (_decodeUTF8(cu0, _loadByte(i &+ 1)), next) + case 3: return ( + _decodeUTF8(cu0, _loadByte(i &+ 1), _loadByte(i &+ 2)), next + ) + case 4: + return ( + _decodeUTF8( + cu0, _loadByte(i &+ 1), _loadByte(i &+ 2), _loadByte(i &+ 3) + ), + next + ) + default: Builtin.unreachable() + } + } + + @_alwaysEmitIntoClient + internal func _decodeScalar( + endingAt i: Int + ) -> (Unicode.Scalar, previousScalarStart: Int) { + // TODO: no need to double 
load the bytes... + let start = _previousScalarStart(i) + return (_decodeScalar(startingAt: start).0, start) + } + + @_alwaysEmitIntoClient + internal func _previousScalarStart(_ i: Int) -> Int { + var prev = i &- 1 + _internalInvariant(prev >= 0) + while _isUTF8Continuation(prev) { + prev &-= 1 + _internalInvariant(prev >= 0) + } + _internalInvariant(i == prev + _utf8ScalarLength(_loadByte(prev))) + return prev + } + + @_alwaysEmitIntoClient + internal func _scalarAlign(_ i: Int) -> Int { + var i = i + while _slowPath(!_isScalarAligned(i)) { + i &-= 1 + } + return i + } +} + +extension UnsafeRawPointer { + // TODO: ASCII fast path wrappers around ufi functions + + // TODO: hook up to real grapheme breaking + internal func _urbp(_ range: Range) -> UnsafeRawBufferPointer { + .init(start: self + range.lowerBound, count: range.count) + } + + @_alwaysEmitIntoClient + func _ubp(_ range: Range) -> UnsafeBufferPointer { + UnsafeBufferPointer( + start: UnsafePointer((self+range.lowerBound)._rawValue), + count: range.count) + } + + internal func _str(_ range: Range) -> String { + String(decoding: _urbp(range) , as: UTF8.self) + } + + @usableFromInline + internal func _isCharacterAligned( + _ i: Int, + limitedBy end: Int + ) -> Bool { + _internalInvariant(i >= 0 && i <= end) + if i == 0 || i == end { + return true + } + + // TODO: call internals instead + let str = _str(0.. Int { + _internalInvariant((0.. Int { + _internalInvariant(_isCharacterAligned(i, limitedBy: end)) + + // TODO: call internals instead + let str = _str(0.. (Character, nextCharacterStart: Int) { + let nextStart = _nextCharacterStart(i, limitedBy: end) + return (Character(_str(i.. (Character, nextCharacterStart: Int) { + let start = _previousCharacterStart(i, limitedBy: end) + _internalInvariant(start >= 0) + + return (Character(_str(start..) 
+ } + + // Returns isASCII + // TODO: return more values + internal func _validateUTF8( + limitedBy end: Int + ) throws(UTF8.EncodingError) -> Bool { + switch validateUTF8(_ubp(0.. Bool { + return (0xC2...0xF4).contains(x) +} + +private func _isNotOverlong_F0(_ x: UInt8) -> Bool { + return (0x90...0xBF).contains(x) +} + +private func _isNotInvalid_F4(_ x: UInt8) -> Bool { + return UTF8.isContinuation(x) && x <= 0x8F +} + +private func _isNotOverlong_E0(_ x: UInt8) -> Bool { + return (0xA0...0xBF).contains(x) +} + +private func _isNotInvalid_ED(_ x: UInt8) -> Bool { + return UTF8.isContinuation(x) && x <= 0x9F +} + +internal struct UTF8ExtraInfo: Equatable { + public var isASCII: Bool +} + +@inline(never) // slow-path +private func _diagnoseInvalidUTF8MultiByteLeading( + _ x: UInt8 +) -> UTF8.EncodingError.Kind { + _internalInvariant(x >= 0x80) + _internalInvariant(!_isUTF8MultiByteLeading(x)) + switch x { + case 0x80...0xBF: + return .unexpectedContinuationByte + case 0xC0..<0xC2: + return .overlongEncodingByte + default: + _internalInvariant(x > 0xF4) + return .invalidNonSurrogateCodePointByte + } +} + +internal enum UTF8ValidationResult { + case success(UTF8ExtraInfo) + case error( + kind: UTF8.EncodingError.Kind, toBeReplaced: Range + ) +} + +extension UTF8ValidationResult: Equatable {} + +internal func validateUTF8(_ buf: UnsafeBufferPointer) -> UTF8ValidationResult { + if _allASCII(buf) { + return .success(UTF8ExtraInfo(isASCII: true)) + } + + var iter = buf.makeIterator() + var lastValidIndex = buf.startIndex + + @inline(__always) func guarantee( + _ f: (UInt8) -> Bool, + _ err: UTF8.EncodingError.Kind + ) throws(UTF8.EncodingError.Kind) { + guard let cu = iter.next() else { + throw .truncatedScalar + } + guard f(cu) else { + throw err + } + } + @inline(__always) func guaranteeContinuation( + ) throws(UTF8.EncodingError.Kind) { + try guarantee(UTF8.isContinuation, .truncatedScalar) + } + + func _legacyInvalidLengthCalculation(_ _buffer: (_storage: UInt32, 
())) -> Int { + // function body copied from UTF8.ForwardParser._invalidLength + if _buffer._storage & 0b0__1100_0000__1111_0000 + == 0b0__1000_0000__1110_0000 { + // 2-byte prefix of 3-byte sequence. The top 5 bits of the decoded result + // must be nonzero and not a surrogate + let top5Bits = _buffer._storage & 0b0__0010_0000__0000_1111 + if top5Bits != 0 && top5Bits != 0b0__0010_0000__0000_1101 { return 2 } + } + else if _buffer._storage & 0b0__1100_0000__1111_1000 + == 0b0__1000_0000__1111_0000 + { + // Prefix of 4-byte sequence. The top 5 bits of the decoded result + // must be nonzero and no greater than 0b0__0100_0000 + let top5bits = UInt16(_buffer._storage & 0b0__0011_0000__0000_0111) + if top5bits != 0 && top5bits.byteSwapped <= 0b0__0000_0100__0000_0000 { + return _buffer._storage & 0b0__1100_0000__0000_0000__0000_0000 + == 0b0__1000_0000__0000_0000__0000_0000 ? 3 : 2 + } + } + return 1 + } + + func _legacyNarrowIllegalRange(buf: Slice>) -> Range { + var reversePacked: UInt32 = 0 + if let third = buf.dropFirst(2).first { + reversePacked |= UInt32(third) + reversePacked <<= 8 + } + if let second = buf.dropFirst().first { + reversePacked |= UInt32(second) + reversePacked <<= 8 + } + reversePacked |= UInt32(buf.first!) + let _buffer: (_storage: UInt32, x: ()) = (reversePacked, ()) + let invalids = _legacyInvalidLengthCalculation(_buffer) + return buf.startIndex ..< buf.startIndex + invalids + } + + func findInvalidRange(_ buf: Slice>) -> Range { + var endIndex = buf.startIndex + var iter = buf.makeIterator() + _ = iter.next() + while let cu = iter.next(), UTF8.isContinuation(cu) { + endIndex += 1 + // Unicode's Maximal subpart of an ill-formed subsequence will yield + // at most 3 bytes of error. 
+ if buf.distance(from: buf.startIndex, to: endIndex) >= 3 { + break + } + } + let illegalRange = Range(buf.startIndex...endIndex) + _internalInvariant(illegalRange.clamped(to: (buf.startIndex.., firstKnownBrokenRange: Range) -> String { + _internalInvariant(!input.isEmpty, "empty input doesn't need to be repaired") + _internalInvariant(firstKnownBrokenRange.clamped(to: input.indices) == firstKnownBrokenRange) + // During this process, `remainingInput` contains the remaining bytes to process. It's split into three + // non-overlapping sub-regions: + // + // 1. `goodChunk` (may be empty) containing bytes that are known good UTF-8 and can be copied into the output String + // 2. `brokenRange` (never empty) the next range of broken bytes, + // 3. the remainder (implicit, will become the next `remainingInput`) + // + // At the beginning of the process, the `goodChunk` starts at the beginning and extends to just before the first + // known broken byte. The known broken bytes are covered in the `brokenRange` and everything following that is + // the remainder. + // We then copy the `goodChunk` into the target buffer and append a UTF8 replacement character. `brokenRange` is + // skipped (replaced by the replacement character) and we restart the same process. This time, `goodChunk` extends + // from the byte after the previous `brokenRange` to the next `brokenRange`. + + // NOTE: was using _StringGuts directly, now we're using an intermediary Array and will re-validate + // This is only relevant if we actually need to repair the contents, in which case perf is very different. 
+ // var result = _StringGuts() + + var result = Array() + + // NOTE: was withUTF8CodeUnits { $0.count } + let replacementCharacterCount: Int + if #available(macOS 10.15, *) { + replacementCharacterCount = Unicode.Scalar._replacementCharacter.utf8.count + } else { + // Fallback on earlier versions + fatalError("Mac pre-10.15") + } + + result.reserveCapacity(input.count + 5 * replacementCharacterCount) // extra space for some replacement characters + + var brokenRange: Range = firstKnownBrokenRange + var remainingInput = input + repeat { + _internalInvariant(!brokenRange.isEmpty, "broken range empty") + _internalInvariant(!remainingInput.isEmpty, "empty remaining input doesn't need to be repaired") + let goodChunk = remainingInput[.. + ) -> Unicode.Scalar? { + fatalError() + } + + + /// Returns the next normalized scalar, + /// iteratively invoking the scalar producer if necessary + /// + public mutating func resume( + scalarProducer: () -> Unicode.Scalar? + ) -> Unicode.Scalar? { + fatalError() + } + + /// Marks the end of the logical text stream + /// and returns remaining data from the normalizer's buffers. + /// + public mutating func flush() -> Unicode.Scalar? { + fatalError() + } + + /// Resets the normalizer to its initial state. + /// + /// Any allocated buffer capacity will be kept and reused + /// unless it exceeds the given maximum capacity, + /// in which case it will be discarded. + /// + public mutating func reset(maximumCapacity: Int = Int.max) { + fatalError() + } +} + +import Future + +extension UTF8Span { + func nthNormalizedScalar( + _ n: Int + ) -> Unicode.Scalar? 
{ + var normalizer = NFCNormalizer() + var pos = 0 + var count = 0 + + while true { + guard let s = normalizer.resume(scalarProducer: { + guard pos < count else { + return nil + } + let (scalar, next) = self.decodeNextScalar( + uncheckedAssumingAligned: pos) + pos = next + return scalar + }) else { + return nil + } + + if count == n { return s } + count += 1 + } + } +} diff --git a/Tests/FutureTests/UTF8Span/UTF8SpanTests.swift b/Tests/FutureTests/UTF8Span/UTF8SpanTests.swift new file mode 100644 index 000000000..407c92e29 --- /dev/null +++ b/Tests/FutureTests/UTF8Span/UTF8SpanTests.swift @@ -0,0 +1,288 @@ +@testable +import Future + +import XCTest + + +func bitEqual( + _ span1: UTF8Span, + _ span2: UTF8Span +) -> Bool { + span1.unsafeBaseAddress == span2.unsafeBaseAddress && + span1._countAndFlags == span2._countAndFlags +} + +class UTF8SpanTests: XCTestCase { + // TODO: basic operations tests + + func testFoo() { + let str = "abcdefghijklnmo" + let span = str.utf8Span + print(span[0]) // prints 0 + + print(stableSince("abc")) // 4.1 + print(stableSince("abc🤯")) // 10.0 + print(stableSince("abc🤯🫥")) // 14.0 + + + } + + // Returns the latest Unicode version from which normalization + // is stable, or `nil` if `s` contains any unassigned code points. + func stableSince(_ s: String) -> Unicode.Version? { + var base = (major: 4, minor: 1) + for s in s.unicodeScalars { + guard let age = s.properties.age else { + return nil + } + if age.major > base.major || + (age.major == base.major && age.minor > base.minor) + { + base = age + } + } + return base + } + + + func testInitForwarding() throws { + // TODO: test we get same bits from various init pathways + // include null-terminated ones (stripping the isNULL bit of course) + } + + func testNullTermination() throws { + func runTest(_ input: String) throws { + let utf8 = input.utf8 + let nullIdx = utf8.firstIndex(of: 0) ?? 
utf8.endIndex + let prefixCount = utf8.distance( + from: utf8.startIndex, to: nullIdx) + + try Array(utf8).withUnsafeBytes { + let nullContent = try UTF8Span( + validatingUnsafeRaw: $0, owner: $0) + let nullTerminated = try UTF8Span( + validatingUnsafeRawCString: $0.baseAddress!, owner: $0) + + XCTAssertFalse(nullContent.isNullTerminatedCString) + XCTAssertTrue(nullTerminated.isNullTerminatedCString) + XCTAssertEqual(nullContent.count, utf8.count) + XCTAssertEqual(nullTerminated.count, prefixCount) + } + } + try runTest("abcdefg\0") + try runTest("abc\0defg\0") + try runTest("a🧟‍♀️bc\0defg\0") + try runTest("a🧟‍♀️bc\0\u{301}defg") + try runTest("abc\0\u{301}defg\0") + } + + func testContentViews() throws { + func runTest(_ input: String) throws { + // For convenience, we use the API defined in + // UTF8SpanViews.swift + + // TODO: also try input.utf8Span after compiler bug fixes + + let array = Array(input.utf8) + let span = try UTF8Span(validating: array.storage) + + do { + var strIter = input.makeIterator() + var spanIter = span.characters.makeIterator() + + while let c = strIter.next() { + guard let spanC = spanIter.next() else { + XCTFail("Span ran out of content") + return + } + XCTAssertEqual(c, spanC) + } + XCTAssertNil(spanIter.next()) + } + + do { + var strIter = input.unicodeScalars.makeIterator() + var spanIter = span.unicodeScalars.makeIterator() + + while let c = strIter.next() { + guard let spanC = spanIter.next() else { + XCTFail("Span ran out of content") + return + } + XCTAssertEqual(c, spanC) + } + XCTAssertNil(spanIter.next()) + } + + // TODO: uncomment the collection style bidi tests + // when compiler bug is fixes + + // Scalars + do { + + var strIdx = input.unicodeScalars.startIndex + var spanIdx = span.unicodeScalars.startIndex + while strIdx != input.unicodeScalars.endIndex { + XCTAssertEqual( + input.utf8.distance(from: input.startIndex, to: strIdx), + spanIdx.position) + XCTAssertEqual(input.unicodeScalars[strIdx], 
span.unicodeScalars[spanIdx]) + input.unicodeScalars.formIndex(after: &strIdx) + span.unicodeScalars.formIndex(after: &spanIdx) + } + XCTAssertEqual(spanIdx, span.unicodeScalars.endIndex) + + strIdx = input.unicodeScalars.endIndex + spanIdx = span.unicodeScalars.endIndex + while strIdx != input.startIndex { + XCTAssertEqual( + input.utf8.distance(from: input.startIndex, to: strIdx), + spanIdx.position) + input.unicodeScalars.formIndex(before: &strIdx) + span.unicodeScalars.formIndex(before: &spanIdx) + XCTAssertEqual(input.unicodeScalars[strIdx], span.unicodeScalars[spanIdx]) + } + } + + // Characters + do { + var strIdx = input.startIndex + var spanIdx = span.characters.startIndex + while strIdx != input.endIndex { + XCTAssertEqual( + input.utf8.distance(from: input.startIndex, to: strIdx), + spanIdx.position) + XCTAssertEqual(input[strIdx], span.characters[spanIdx]) + input.formIndex(after: &strIdx) + span.characters.formIndex(after: &spanIdx) + } + XCTAssertEqual(spanIdx, span.characters.endIndex) + + strIdx = input.endIndex + spanIdx = span.characters.endIndex + while strIdx != input.startIndex { + XCTAssertEqual( + input.utf8.distance(from: input.startIndex, to: strIdx), + spanIdx.position) + input.formIndex(before: &strIdx) + span.characters.formIndex(before: &spanIdx) + XCTAssertEqual(input[strIdx], span.characters[spanIdx]) + } + } + + } + + try runTest("abc") + try runTest("abcdefghiljkmnop") + try runTest("abcde\0fghiljkmnop") + try runTest("a🧟‍♀️bc\0\u{301}defg") + try runTest("a🧟‍♀️bce\u{301}defg") + try runTest("a🧟‍♀️bce\u{301}defg\r\n 🇺🇸") + + } + + func testCanonicalEquivalence() throws { + // TODO: equivalence checks + // TODO: canonically less than checks + } + + func testMisc() throws { + // TODO: test withUnsafeBufferPointer + + } + + func testQueries() throws { + // TODO: test isASCII + // TODO: test knownNFC and checks for NFC + // TODO: test single scalar character and checks + + /// + enum CheckLevelToPass { + case always // Passes upon bit 
inspection + case quick // Passes under quick checking + case full // Passes under full checking + case never // Doesn't succeed under full checking + + func check( + query: () -> Bool, + transform: (Bool) -> Bool + ) { + switch self { + case .always: + XCTAssert(query()) + + case .quick: + XCTAssertFalse(query()) + let b = transform(true) + XCTAssert(b && query()) + + case .full: + XCTAssertFalse(query()) + var b = transform(true) + XCTAssertFalse(b || query()) + + b = transform(false) + XCTAssert(b && query()) + + case .never: + XCTAssertFalse(query()) + var b = transform(true) + XCTAssertFalse(b || query()) + + b = transform(false) + XCTAssertFalse(b || query()) + } + } + } + + func runTest( + _ input: String, + isASCII: Bool, + isNFC: CheckLevelToPass, + isSSC: CheckLevelToPass + ) throws { + let array = Array(input.utf8) + var span = try UTF8Span(validating: array.storage) + + XCTAssertEqual(isASCII, span.isASCII) + + isNFC.check( + query: { span.isKnownNFC }, + transform: { span.checkForNFC(quickCheck: $0) } + ) + + isSSC.check( + query: { span.isKnownSingleScalarCharacters }, + transform: { span.checkForSingleScalarCharacters(quickCheck: $0) } + ) + + } + + // FIXME: shouldn't be .full for SSC + try runTest("abc", isASCII: true, isNFC: .always, isSSC: .quick) + try runTest("abcde\u{301}", isASCII: false, isNFC: .never, isSSC: .never) + try runTest("abcdè", isASCII: false, isNFC: .quick, isSSC: .quick) + + try runTest( + "abcd日", + isASCII: false, + isNFC: .full, // FIXME: change to quick when we query QC properties + isSSC: .quick) + + try runTest( + "a강c", // NOTE: Precomposed Gang U+AC15 + isASCII: false, + isNFC: .full, // FIXME: change to quick when we query QC properties + isSSC: .quick) + + try runTest( + "a강c", // NOTE: Decomposed Gang U+1100 U+1161 U+11BC + isASCII: false, + isNFC: .never, + isSSC: .never) + + + // TODO(perf): speed up grapheme breaking based on single scalar + // character, speed up nextScalarStart via isASCII, ... 
+ } +} diff --git a/Tests/FutureTests/UTF8Span/UTF8SpanValidationTests.swift b/Tests/FutureTests/UTF8Span/UTF8SpanValidationTests.swift new file mode 100644 index 000000000..dd71cf385 --- /dev/null +++ b/Tests/FutureTests/UTF8Span/UTF8SpanValidationTests.swift @@ -0,0 +1,245 @@ +import Future +import XCTest + +extension Range { + func _offset(by start: Int) -> Range { + start + lowerBound ..< start + upperBound + } +} + +private func utf8Validate( + _ bytes: Array, + expectedErrors errors: [UTF8.EncodingError] +) throws { + let span = bytes.storage + if errors.isEmpty { + // No errors expected + _ = try UTF8Span(validating: span) + return + } + + // Check for each error. + // NOTE: We currently do it by slicing, which will change the + // error classification. + for expectedError in errors { + let start = expectedError.range.lowerBound + do throws(UTF8.EncodingError) { + _ = try UTF8Span(validating: span.extracting(start...)) + XCTAssert(false) + } catch { + let adjustedErr = UTF8.EncodingError( + error.kind, + error.range._offset(by: start) + ) + XCTAssertEqual(expectedError, adjustedErr) + } + } + + // Rest of input should be error-free + if let start = errors.last?.range.upperBound, start < bytes.count { + _ = try UTF8Span(validating: span.extracting(start...)) + } +} + +private struct ValidationError { + var error: UTF8.EncodingError + + // When fetching all errors, we'll get the error kind given. When + // slicing in order to get the next error (e.g. + // `UTF8Span.init(validating:))`, we'll get `.unexpectedContinuation`. 
+ var errorStart: Bool + + init(_ error: UTF8.EncodingError, errorStart: Bool) { + self.error = error + self.errorStart = errorStart + } + + public static func unexpectedContinuationByte( + at i: Int, errorStart: Bool = true + ) -> Self { + Self(UTF8.EncodingError(.unexpectedContinuationByte, at: i), errorStart: errorStart) + } + + public static func surrogateCodePointByte( + at i: Int, errorStart: Bool = true + ) -> Self { + Self(UTF8.EncodingError(.surrogateCodePointByte, at: i), errorStart: errorStart) + } + + public static func invalidNonSurrogateCodePointByte( + at i: Int, errorStart: Bool = true + ) -> Self { + Self(UTF8.EncodingError(.invalidNonSurrogateCodePointByte, at: i), errorStart: errorStart) + } + + public static func overlongEncodingByte( + at i: Int, errorStart: Bool = true + ) -> Self { + Self(UTF8.EncodingError(.overlongEncodingByte, at: i), errorStart: errorStart) + } + + public static func truncatedScalar( + _ range: some RangeExpression, errorStart: Bool = true + ) -> Self { + Self(UTF8.EncodingError(.truncatedScalar, range), errorStart: errorStart) + } +} + +private struct ValidationTestCase { + var bytes: [UInt8] + + // When fetching all errors, we'll get the error kind given. When + // slicing in order to get the next error (e.g. + // `UTF8Span.init(validating:))`, we'll get `.unexpectedContinuation`. + var errors: [ValidationError] + + init( + _ bytes: [UInt8], + _ errors: [ValidationError] + ) { + self.bytes = bytes + self.errors = errors + } + + func fetchError( + at i: Int, wasSliced: Bool + ) -> UTF8.EncodingError { + let err = errors[i] + if wasSliced && !err.errorStart { + return .init(.unexpectedContinuationByte, err.error.range) + } + return err.error + } + + func testAllErrors() { + let caughtErrors = Array(UTF8._checkAllErrors(bytes)) + for i in 0.. 
+ + @inlinable @inline(__always) + public var codeUnits: CodeUnits { + .init( + unsafeStart: unsafeBaseAddress, + byteCount: count, + owner: self) + } + + @frozen + public struct UnicodeScalarView: ~Escapable { + @inline(__always) + public let span: UTF8Span + + @inlinable @inline(__always) + public init(_ span: UTF8Span) { + self.span = span + } + } + + @inlinable @inline(__always) + public var unicodeScalars: UnicodeScalarView { + _read { yield .init(self) } + } + + @frozen + public struct CharacterView: ~Escapable { + @inline(__always) + public let span: UTF8Span + + @inlinable @inline(__always) + public init(_ span: UTF8Span) { + self.span = span + } + } + + @inlinable @inline(__always) + public var characters: CharacterView { + _read { yield .init(self) } + } + + @frozen + public struct UTF16View: ~Escapable { + @inline(__always) + public let span: UTF8Span + + @inlinable @inline(__always) + public init(_ span: UTF8Span) { + self.span = span + } + } + + @inlinable @inline(__always) + public var utf16: UTF16View { + _read { yield .init(self) } + } + +} + + +extension UTF8Span.UnicodeScalarView { + // NOTE: I decided not to store the length, or the scalar value itself. + // Storing the length wouldn't speed up subscript that much and would + // require that index(after:) load the subsequent byte. We'll still have a + // custom iterator. I decided not to store the scalar value as that would + // slow down index-only operations + // + // NOTE: `indices` returns `Range` which means that `indices.contains + // (i)` will return true for `i >= indices.lowerBounds && i <= + // indices.upperBound`, whether aligned or not + // + // TODO: Should it be RawSpan.Index, so that we can do allocation checking? 
+ // + // Note: Wrapper struct, but with public access, so that it's not used + // across views accidentally + @frozen + public struct Index: Comparable, Hashable { + public var position: Int + + // TODO: Do we want the public init to take the span so we can + // precondition that it's aligned? + @inlinable @inline(__always) + public init(_ position: Int) { + self.position = position + } + + @inlinable @inline(__always) + public static func < ( + lhs: UTF8Span.UnicodeScalarView.Index, + rhs: UTF8Span.UnicodeScalarView.Index + ) -> Bool { + lhs.position < rhs.position + } + } + + public typealias Element = Unicode.Scalar + + @frozen + public struct Iterator: ~Escapable { + public typealias Element = Unicode.Scalar + + @inline(__always) + public let span: UTF8Span + + @inline(__always) + public var position: Int + + @inlinable @inline(__always) + init(_ span: UTF8Span) { + self.span = span + self.position = 0 + } + + @inlinable + public mutating func next() -> Unicode.Scalar? { + guard position < span.count else { + return nil + } + let (res, pos) = span.decodeNextScalar( + uncheckedAssumingAligned: position) + position = pos + return res + } + } + + @inlinable @inline(__always) + public borrowing func makeIterator() -> Iterator { + .init(span) + } + + @inlinable @inline(__always) + public var startIndex: Index { .init(0) } + + @inlinable @inline(__always) + public var endIndex: Index { .init(span.count) } + +// @inlinable @inline(__always) +// public var count: Int { fatalError() } + +// @inlinable @inline(__always) +// public var isEmpty: Bool { startIndex == endIndex } + +// @inlinable @inline(__always) +// public var indices: Range { +// startIndex.. 
Index { + .init(span.nextScalarStart(i.position)) + } + + @inlinable + public func index(before i: Index) -> Index { + .init(span.previousScalarStart(uncheckedAssumingAligned: i.position)) + } + + @inlinable + public func formIndex(after i: inout Index) { + i = index(after: i) + } + + @inlinable + public func formIndex(before i: inout Index) { + i = index(before: i) + } + + @inlinable + public subscript(position: Index) -> Element { + borrowing _read { + yield span.decodeNextScalar(position.position).0 + } + } + +#if false + @inlinable + public subscript(unchecked position: Index) -> Element { + borrowing _read { + fatalError() + } + } + + @inlinable + public subscript(bounds: Range) -> Self { + get { + fatalError() + } + } + + @inlinable + public subscript(unchecked bounds: Range) -> Self { + borrowing get { + fatalError() + } + } + + @_alwaysEmitIntoClient + public subscript(bounds: some RangeExpression) -> Self { + borrowing get { + fatalError() + } + } + + @_alwaysEmitIntoClient + public subscript(unchecked bounds: some RangeExpression) -> Self { + borrowing get { + fatalError() + } + } + + @_alwaysEmitIntoClient + public subscript(x: UnboundedRange) -> Self { + borrowing get { + fatalError() + } + } + + @inlinable + public func elementsEqual(_ other: Self) -> Bool { + + } + + // NOTE: No Collection overload, since it's the same code as + // the sequence one. + + @inlinable + public func elementsEqual(_ other: some Sequence) -> Bool { + var iter = self.makeIterator() + for elt in other { + guard elt == iter.next() else { return false } + } + return iter.next() == nil + } +#endif + + +} + +extension UTF8Span.CharacterView { + // NOTE: I decided not to store the length, so that + // index after doesn't need to measure the next grapheme cluster. + // We define a custom iterator to make iteration faster. 
+ // + // Because the next-after-next grapheme cluster may be arbitrarily large, I + // think it's better not to measure it pre-emptively as part of index + // (after:). Instead we'll do exactly the operation the programmer asked + // for. + // + // Note: Wrapper struct, but with public access, so that + // it's not used across views accidentally + @frozen + public struct Index: Comparable, Hashable { + public var position: Int + + @inlinable @inline(__always) + public init(_ position: Int) { + self.position = position + } + + @inlinable @inline(__always) + public static func < ( + lhs: UTF8Span.CharacterView.Index, + rhs: UTF8Span.CharacterView.Index + ) -> Bool { + lhs.position < rhs.position + } + } + + public typealias Element = Character + + @frozen + public struct Iterator: ~Escapable { + public typealias Element = Character + + @inline(__always) + public let span: UTF8Span + + @inline(__always) + public var position: Int + + @inlinable @inline(__always) + init(_ span: UTF8Span) { + self.span = span + self.position = 0 + } + + @inlinable + public mutating func next() -> Character? 
{ + guard position < span.count else { + return nil + } + let (res, pos) = span.decodeNextCharacter( + uncheckedAssumingAligned: position) + position = pos + return res + } + } + + @inlinable @inline(__always) + public borrowing func makeIterator() -> Iterator { + .init(span) + } + + @inlinable @inline(__always) + public var startIndex: Index { .init(0) } + + @inlinable @inline(__always) + public var endIndex: Index { .init(span.count) } + + @inlinable + public func index(after i: Index) -> Index { + .init(span.nextCharacterStart(i.position)) + } + + @inlinable + public func index(before i: Index) -> Index { + .init(span.previousCharacterStart(i.position)) + } + + @inlinable + public func formIndex(after i: inout Index) { + i = index(after: i) + } + + @inlinable + public func formIndex(before i: inout Index) { + i = index(before: i) + } + + + +#if false + @inlinable + public func index( + _ i: Index, offsetBy distance: Int, limitedBy limit: Index + ) -> Index? { + fatalError() + } + + + @inlinable + public func index(_ i: Index, offsetBy distance: Int) -> Index { + fatalError() + } + + @inlinable + public func formIndex(_ i: inout Index, offsetBy distance: Int) { + fatalError() + } + + @inlinable + public func formIndex( + _ i: inout Index, offsetBy distance: Int, limitedBy limit: Index + ) -> Bool { + fatalError() + } + +#endif + + @inlinable + public subscript(position: Index) -> Element { + borrowing _read { + yield span.decodeNextCharacter(position.position).0 + } + } + +#if false + + @inlinable + public subscript(unchecked position: Index) -> Element { + borrowing _read { + fatalError() + } + } + + @inlinable + public subscript(bounds: Range) -> Self { + get { + fatalError() + } + } + + @inlinable + public subscript(unchecked bounds: Range) -> Self { + borrowing get { + fatalError() + } + } + + @_alwaysEmitIntoClient + public subscript(bounds: some RangeExpression) -> Self { + borrowing get { + fatalError() + } + } + + @_alwaysEmitIntoClient + public 
subscript(unchecked bounds: some RangeExpression) -> Self { + borrowing get { + fatalError() + } + } + + @_alwaysEmitIntoClient + public subscript(x: UnboundedRange) -> Self { + borrowing get { + fatalError() + } + } + + @inlinable + public func distance(from start: Index, to end: Index) -> Int { + fatalError() + } + + @inlinable + public func elementsEqual(_ other: Self) -> Bool { + self.span.isCanonicallyEquivalent(to: other.span) + } + + // NOTE: No Collection overload, since it's the same code as + // the sequence one. + + @inlinable + public func elementsEqual(_ other: some Sequence) -> Bool { + var iter = self.makeIterator() + for elt in other { + guard elt == iter.next() else { return false } + } + return iter.next() == nil + } + +#endif + +} + +#if false +extension UTF8Span.UTF16View { + @frozen + public struct Index: Comparable, Hashable { + // TODO: top bit or low bit + @usableFromInline @inline(__always) + internal var _rawValue: UInt64 + + @inlinable @inline(__always) + public var position: Int { + fatalError() + } + + /// Whether this index is referring to the second code unit of a non-BMP + /// Unicode Scalar value. 
+ @inlinable @inline(__always) + public var secondCodeUnit: Bool { + fatalError() + } + + @inlinable @inline(__always) + public init( + _ position: Int, + secondCodeUnit: Bool + ) { + fatalError() + } + + @inlinable @inline(__always) + public static func < ( + lhs: UTF8Span.UTF16View.Index, + rhs: UTF8Span.UTF16View.Index + ) -> Bool { + if lhs.position == rhs.position { + return !lhs.secondCodeUnit && rhs.secondCodeUnit + } + return lhs.position < rhs.position + } + } + + public typealias Element = UInt16 + + @frozen + public struct Iterator: ~Escapable { + public typealias Element = UInt16 + + @inline(__always) + public let span: UTF8Span + + @inline(__always) + public var index: UTF8Span.UTF16View.Index + + @inlinable @inline(__always) + init(_ span: UTF8Span) { + self.span = span + fatalError() + } + + @inlinable + public mutating func next() -> UInt16? { + guard index.position < span.count else { + return nil + } + fatalError() + } + } + + @inlinable @inline(__always) + public borrowing func makeIterator() -> Iterator { + .init(span) + } + + @inlinable @inline(__always) + public var startIndex: Index { fatalError() } + + @inlinable @inline(__always) + public var endIndex: Index { fatalError() } + + @inlinable @inline(__always) + public var count: Int { fatalError() } + + @inlinable @inline(__always) + public var isEmpty: Bool { startIndex == endIndex } + + @inlinable @inline(__always) + public var indices: Range { + startIndex.. Index { + fatalError() + } + + @inlinable + public func index(before i: Index) -> Index { + fatalError() + } + + @inlinable + public func index( + _ i: Index, offsetBy distance: Int, limitedBy limit: Index + ) -> Index? 
{ + fatalError() + } + + @inlinable + public func formIndex(after i: inout Index) { + fatalError() + } + + @inlinable + public func formIndex(before i: inout Index) { + fatalError() + } + + @inlinable + public func index(_ i: Index, offsetBy distance: Int) -> Index { + fatalError() + } + + @inlinable + public func formIndex(_ i: inout Index, offsetBy distance: Int) { + fatalError() + } + + @inlinable + public func formIndex( + _ i: inout Index, offsetBy distance: Int, limitedBy limit: Index + ) -> Bool { + fatalError() + } + + @inlinable + public subscript(position: Index) -> Element { + borrowing _read { + fatalError() + } + } + + @inlinable + public subscript(unchecked position: Index) -> Element { + borrowing _read { + fatalError() + } + } + + @inlinable + public subscript(bounds: Range) -> Self { + get { + fatalError() + } + } + + @inlinable + public subscript(unchecked bounds: Range) -> Self { + borrowing get { + fatalError() + } + } + + @_alwaysEmitIntoClient + public subscript(bounds: some RangeExpression) -> Self { + borrowing get { + fatalError() + } + } + + @_alwaysEmitIntoClient + public subscript(unchecked bounds: some RangeExpression) -> Self { + borrowing get { + fatalError() + } + } + + @_alwaysEmitIntoClient + public subscript(x: UnboundedRange) -> Self { + borrowing get { + fatalError() + } + } + + @inlinable + public func distance(from start: Index, to end: Index) -> Int { + fatalError() + } + + @inlinable + public func elementsEqual(_ other: Self) -> Bool { + span.codeUnits.elementsEqual(other.span.codeUnits) + } + + // NOTE: No Collection overload, since it's the same code as + // the sequence one. + + @inlinable + public func elementsEqual(_ other: some Sequence) -> Bool { + var iter = self.makeIterator() + for elt in other { + guard elt == iter.next() else { return false } + } + return iter.next() == nil + } + +} + +#endif