swiftlang · rxwei · Jun 20, 2022 · milseman · Jun 23, 2022 · milseman
diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -1,5 +1,6 @@
 @_implementationOnly import _RegexParser
 
+@available(SwiftStdlib 5.7, *)
 extension Compiler {
   struct ByteCodeGen {
     var options: MatchingOptions
@@ -15,8 +16,15 @@ extension Compiler {
   }
 }
 
+@available(SwiftStdlib 5.7, *)
 extension Compiler.ByteCodeGen {
   mutating func emitRoot(_ root: DSLTree.Node) throws -> MEProgram {
+    // FIXME: Remove once output type erasure is represented in the matching
+    // engine. This workaround is to prevent a top-level `Regex<AnyRegexOutput>`
+    // from being emitted as a matcher, which would cause infinite recursion.
+    if case let .typeErase(child) = root {
+      return try emitRoot(child)
+    }
     // The whole match (`.0` element of output) is equivalent to an implicit
     // capture over the entire regex.
     try emitNode(.capture(name: nil, reference: nil, root))
@@ -25,6 +33,7 @@ extension Compiler.ByteCodeGen {
   }
 }
 
+@available(SwiftStdlib 5.7, *)
 fileprivate extension Compiler.ByteCodeGen {
   mutating func emitAtom(_ a: DSLTree.Atom) throws {
     defer {
@@ -765,6 +774,28 @@ fileprivate extension Compiler.ByteCodeGen {
     case .characterPredicate:
       throw Unsupported("character predicates")
 
+    case .typeErase(let child):
+      // FIXME: This is a workaround for `Regex<AnyRegexOutput>` not working in
+      // the DSL. This separates any `Regex<AnyRegexOutput>` into its own
+      // compilation unit, but is less efficient. We should instead represent
+      // output type erasure in the matching engine (`beginTypeErase`,
+      // `endTypeErase`).
+      //
+      // Long-term design:
+      //   beginTypeErase
+      //   <code for child>
+      //   endTypeErase
+      let program = try Compiler(tree: DSLTree(child)).emit()
+      let executor = Executor(program: program)
+      return emitMatcher { input, startIndex, range in
+        guard let match: Regex<AnyRegexOutput>.Match = try executor.match(
+          input, in: startIndex..<range.upperBound, .partialFromFront
+        ) else {
+          return nil
+        }
+        return (match.range.upperBound, match.output)
+      }
+
     case .trivia, .empty:
       return nil
     }

diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift
@@ -11,6 +11,7 @@
 
 @_implementationOnly import _RegexParser
 
+@available(SwiftStdlib 5.7, *)
 class Compiler {
   let tree: DSLTree
 
@@ -34,6 +35,7 @@ class Compiler {
   }
 }
 
+@available(SwiftStdlib 5.7, *)
 func _compileRegex(
   _ regex: String, _ syntax: SyntaxOptions = .traditional
 ) throws -> Executor {

diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -39,7 +39,7 @@ extension DSLTree.Node {
     case .orderedChoice, .conditional, .concatenation,
         .capture, .nonCapturingGroup,
         .quantification, .trivia, .empty,
-        .absentFunction: return nil
+        .absentFunction, .typeErase: return nil
 
     case .consumer:
       fatalError("FIXME: Is this where we handle them?")

diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift
@@ -193,6 +193,15 @@ extension Instruction {
     ///
     case backreference
 
+    /// Push a new type erasure scope into the capture stack.
+    case beginTypeErase
+
+    /// Pop the last type erasure scope, create an `AnyRegexOutput` from that
+    /// scope, and store it in a value register.
+    ///
+    ///     endTypeErase(_: ValReg)
+    case endTypeErase
+
     // MARK: Matching: State transitions
 
     // TODO: State transitions need more work. We want

diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift
@@ -187,6 +187,14 @@ extension MEProgram.Builder {
       .init(capture: cap, transform: trans)))
   }
 
+  mutating func buildBeginTypeErase() {
+    instructions.append(.init(.beginTypeErase))
+  }
+
+  mutating func buildEndTypeErase() {
+    instructions.append(.init(.endTypeErase))
+  }
+
   mutating func buildMatcher(
     _ fun: MatcherRegister, into reg: ValueRegister
   ) {

diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift
@@ -442,6 +442,12 @@ extension Processor {
         value, overwriteInitial: sp)
       controller.step()
 
+    case .beginTypeErase:
+      fatalError("Unimplemented")
+
+    case .endTypeErase:
+      fatalError("Unimplemented")
+
     case .builtinAssertion:
       builtinAssertion()
 

diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift
@@ -11,6 +11,7 @@
 
 @_implementationOnly import _RegexParser
 
+@available(SwiftStdlib 5.7, *)
 struct Executor {
   // TODO: consider let, for now lets us toggle tracing
   var engine: Engine

diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift
@@ -280,6 +280,9 @@ extension PrettyPrinter {
 
     case .absentFunction:
       print("/* TODO: absent function */")
+
+    case .typeErase:
+      print("/* TODO: type erasure */")
     }
   }
 

diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift
@@ -56,7 +56,13 @@ public struct Regex<Output>: RegexComponent {
   }
 
   public var regex: Regex<Output> {
-    self
+    if Output.self == AnyRegexOutput.self {
+      if case .typeErase = root {
+        return self
+      }
+      return .init(node: .typeErase(root))
+    }
+    return self
   }
 }
 

diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift
@@ -93,6 +93,10 @@ extension DSLTree {
 
     case matcher(Any.Type, _MatcherInterface)
 
+    // MARK: - Type erasure
+
+    case typeErase(Node)
+
     // TODO: Would this just boil down to a consumer?
     case characterPredicate(_CharacterPredicateInterface)
   }
@@ -265,6 +269,7 @@ extension DSLTree.Node {
     case let .capture(_, _, n, _):        return [n]
     case let .nonCapturingGroup(_, n):    return [n]
     case let .quantification(_, _, n):    return [n]
+    case let .typeErase(n): return [n]
 
     case let .conditional(_, t, f): return [t,f]
 
@@ -486,6 +491,7 @@ public struct CaptureTransform: Hashable, CustomStringConvertible {
 // These wrapper types are required because even @_spi-marked public APIs can't
 // include symbols from implementation-only dependencies.
 
+@available(SwiftStdlib 5.7, *)
 extension DSLTree.Node {
   func _addCaptures(
     to list: inout CaptureList,
@@ -551,7 +557,7 @@ extension DSLTree.Node {
       break
 
     case .customCharacterClass, .atom, .trivia, .empty,
-        .quotedLiteral, .consumer, .characterPredicate:
+        .quotedLiteral, .consumer, .characterPredicate, .typeErase:
       break
     }
   }
@@ -566,7 +572,7 @@ extension DSLTree.Node {
          .conditional, .quantification, .customCharacterClass, .atom,
          .trivia, .empty, .quotedLiteral, .regexLiteral, .absentFunction,
          .convertedRegexLiteral, .consumer,
-         .characterPredicate, .matcher:
+         .characterPredicate, .matcher, .typeErase:
       return false
     }
   }
@@ -583,16 +589,28 @@ extension DSLTree.Node {
 
   /// Returns the type of the whole match, i.e. `.0` element type of the output.
   var wholeMatchType: Any.Type {
-    if case .matcher(let type, _) = outputDefiningNode {
+    switch outputDefiningNode {
+    case .matcher(let type, _):
       return type
+    case .typeErase:
+      return AnyRegexOutput.self
+    default:
+      return Substring.self
     }
-    return Substring.self
   }
 }
 
 extension DSLTree {
+  @available(SwiftStdlib 5.7, *)
   var captureList: CaptureList {
     var list = CaptureList()
+    // FIXME: This peers through any top-level `.typeErase`. Once type erasure
+    // is handled in the engine, this can be simplified to use `root`
+    // directly.
+    var root = root
+    while case let .typeErase(child) = root {
+      root = child
+    }
     list.append(.init(type: root.wholeMatchType, optionalDepth: 0, .fake))
     root._addCaptures(to: &list, optionalNesting: 0)
     return list
@@ -620,6 +638,7 @@ extension DSLTree {
       case let .capture(_, _, n, _):        return [_Tree(n)]
       case let .nonCapturingGroup(_, n):    return [_Tree(n)]
       case let .quantification(_, _, n):    return [_Tree(n)]
+      case let .typeErase(n):               return [_Tree(n)]
 
       case let .conditional(_, t, f): return [_Tree(t), _Tree(f)]
 

diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift
@@ -1071,6 +1071,86 @@ class RegexDSLTests: XCTestCase {
       }
     }
   }
+
+  /// Checks that a type-erased regex (a `Regex` built from a pattern string,
+  /// whose output is `AnyRegexOutput`) can be used as a component of a regex
+  /// builder block, both directly and wrapped in a `Capture`.
+  func testTypeErasedRegexInDSL() throws {
+    do {
+      // Embedded directly: the whole match and the builder's own capture are
+      // still available as `.0` and `.1`.
+      let input = "johnappleseed: 12."
+      let numberRegex = try Regex(#"(\d+)\.?"#)
+      let regex = Regex {
+        Capture {
+          OneOrMore(.word)
+        }
+        ZeroOrMore(.whitespace)
+        ":"
+        ZeroOrMore(.whitespace)
+        numberRegex
+      }
+      let match = try XCTUnwrap(input.wholeMatch(of: regex))
+      XCTAssertEqual(match.0, input[...])
+      XCTAssertEqual(match.1, "johnappleseed")
+    }
+    do {
+      // Wrapped in `Capture`: the type-erased output surfaces as `.2`, where
+      // element 0 is the nested regex's whole match and element 1 is its own
+      // capture group.
+      let input = "johnappleseed: 12."
+      let numberRegex = try Regex(#"(\d+)\.?"#)
+      let regex = Regex {
+        Capture {
+          OneOrMore(.word)
+        }
+        ZeroOrMore(.whitespace)
+        ":"
+        ZeroOrMore(.whitespace)
+        Capture { numberRegex }
+      }
+      let match = try XCTUnwrap(input.wholeMatch(of: regex))
+      XCTAssertEqual(match.0, input[...])
+      XCTAssertEqual(match.1, "johnappleseed")
+      XCTAssertEqual(match.2[0].value as? Substring, "12.")
+      XCTAssertEqual(match.2[1].value as? Substring, "12")
+    }
+    do {
+      let input = "johnappleseed: 12."
+      // Anchors should be with respect to the entire input.
+      let numberRegex = try Regex(#"^(\d+)\.?"#)
+      let regex = Regex {
+        Capture {
+          OneOrMore(.word)
+        }
+        ZeroOrMore(.whitespace)
+        ":"
+        ZeroOrMore(.whitespace)
+        Capture { numberRegex }
+      }
+      XCTAssertNil(input.wholeMatch(of: regex))
+    }
+    do {
+      let input = "johnappleseed: 12.[12]"
+      // Backreferences in a type-erased regex are scoped to the type-erased
+      // regex itself. `\1` here should refer to "12", not "johnappleseed"
+      let numberRegex = try Regex(#"(\d+)\.?\[\1\]"#)
+      let regex = Regex {
+        Capture {
+          OneOrMore(.word)
+        }
+        ZeroOrMore(.whitespace)
+        ":"
+        ZeroOrMore(.whitespace)
+        Capture { numberRegex }
+      }
+      let match = try XCTUnwrap(input.wholeMatch(of: regex))
+      XCTAssertEqual(match.0, input[...])
+      XCTAssertEqual(match.1, "johnappleseed")
+      XCTAssertEqual(match.2[0].value as? Substring, "12.[12]")
+      XCTAssertEqual(match.2[1].value as? Substring, "12")
+    }
+  }
 }
 
 extension Unicode.Scalar {