-
Notifications
You must be signed in to change notification settings - Fork 46
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Allow Regex<AnyRegexOutput>
to be used in the DSL.
#504
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -193,6 +193,15 @@ extension Instruction { | |
/// | ||
case backreference | ||
|
||
/// Push a new type erasure scope into the capture stack. | ||
case beginTypeErase | ||
|
||
/// Pop the last type erasure scope, create an `AnyRegexOutput` from that | ||
/// scope, and store it in a value register. | ||
/// | ||
/// endTypeErase(_: ValReg) | ||
case endTypeErase | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What do these instructions do? What's their semantics? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added comments. The processor will maintain a stack of capture lists. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not fully parsing this. Could you write some XFAIL tests that illustrate the work that remains to be done after this PR? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's no XFAIL-able tests. This PR fixes the bug, but it's not as efficient as handling this in the bytecode directly. I'd be happy to chat in person also. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How would There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think backtracking into ARO will be supported, since |
||
|
||
// MARK: Matching: State transitions | ||
|
||
// TODO: State transitions need more work. We want | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -56,7 +56,13 @@ public struct Regex<Output>: RegexComponent { | |
} | ||
|
||
public var regex: Regex<Output> { | ||
self | ||
if Output.self == AnyRegexOutput.self { | ||
if case .typeErase = root { | ||
return self | ||
} | ||
return .init(node: .typeErase(root)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we do this on creation instead, or somewhere else? |
||
} | ||
return self | ||
} | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -93,6 +93,10 @@ extension DSLTree { | |
|
||
case matcher(Any.Type, _MatcherInterface) | ||
|
||
// MARK: - Type erasure | ||
|
||
case typeErase(Node) | ||
|
||
// TODO: Would this just boil down to a consumer? | ||
case characterPredicate(_CharacterPredicateInterface) | ||
} | ||
|
@@ -265,6 +269,7 @@ extension DSLTree.Node { | |
case let .capture(_, _, n, _): return [n] | ||
case let .nonCapturingGroup(_, n): return [n] | ||
case let .quantification(_, _, n): return [n] | ||
case let .typeErase(n): return [n] | ||
|
||
case let .conditional(_, t, f): return [t,f] | ||
|
||
|
@@ -486,6 +491,7 @@ public struct CaptureTransform: Hashable, CustomStringConvertible { | |
// These wrapper types are required because even @_spi-marked public APIs can't | ||
// include symbols from implementation-only dependencies. | ||
|
||
@available(SwiftStdlib 5.7, *) | ||
extension DSLTree.Node { | ||
func _addCaptures( | ||
to list: inout CaptureList, | ||
|
@@ -551,7 +557,7 @@ extension DSLTree.Node { | |
break | ||
|
||
case .customCharacterClass, .atom, .trivia, .empty, | ||
.quotedLiteral, .consumer, .characterPredicate: | ||
.quotedLiteral, .consumer, .characterPredicate, .typeErase: | ||
break | ||
} | ||
} | ||
|
@@ -566,7 +572,7 @@ extension DSLTree.Node { | |
.conditional, .quantification, .customCharacterClass, .atom, | ||
.trivia, .empty, .quotedLiteral, .regexLiteral, .absentFunction, | ||
.convertedRegexLiteral, .consumer, | ||
.characterPredicate, .matcher: | ||
.characterPredicate, .matcher, .typeErase: | ||
return false | ||
} | ||
} | ||
|
@@ -583,16 +589,28 @@ extension DSLTree.Node { | |
|
||
/// Returns the type of the whole match, i.e. `.0` element type of the output. | ||
var wholeMatchType: Any.Type { | ||
if case .matcher(let type, _) = outputDefiningNode { | ||
switch outputDefiningNode { | ||
case .matcher(let type, _): | ||
return type | ||
case .typeErase: | ||
return AnyRegexOutput.self | ||
default: | ||
return Substring.self | ||
} | ||
return Substring.self | ||
} | ||
} | ||
|
||
extension DSLTree { | ||
@available(SwiftStdlib 5.7, *) | ||
var captureList: CaptureList { | ||
var list = CaptureList() | ||
// FIXME: This is peering through any top-level `.typeErase`. Once type | ||
// erasure is handled in the engine, this can be simplified to using `root` | ||
// directly. | ||
var root = root | ||
while case let .typeErase(child) = root { | ||
root = child | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this scan in here? It seems like you'd want to capture an |
||
list.append(.init(type: root.wholeMatchType, optionalDepth: 0, .fake)) | ||
root._addCaptures(to: &list, optionalNesting: 0) | ||
return list | ||
|
@@ -620,6 +638,7 @@ extension DSLTree { | |
case let .capture(_, _, n, _): return [_Tree(n)] | ||
case let .nonCapturingGroup(_, n): return [_Tree(n)] | ||
case let .quantification(_, _, n): return [_Tree(n)] | ||
case let .typeErase(n): return [_Tree(n)] | ||
|
||
case let .conditional(_, t, f): return [_Tree(t), _Tree(f)] | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1071,6 +1071,78 @@ class RegexDSLTests: XCTestCase { | |
} | ||
} | ||
} | ||
|
||
func testTypeErasedRegexInDSL() throws { | ||
do { | ||
let input = "johnappleseed: 12." | ||
let numberRegex = try! Regex(#"(\d+)\.?"#) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add a test case where the dynamic regex begins with a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added one. The idea is to fix up the search bounds so that It happens to work today because anchors currently use the base string's bounds, not the input slice's bounds. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, there is a bug here that another bug is masking, but when that other bug is fixed, how do we fix this bug? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can illustrate the difference with a substring input, where the subject bounds are the substring's bounds and the search bounds are contained within. |
||
let regex = Regex { | ||
Capture { | ||
OneOrMore(.word) | ||
} | ||
ZeroOrMore(.whitespace) | ||
":" | ||
ZeroOrMore(.whitespace) | ||
numberRegex | ||
} | ||
let match = try XCTUnwrap(input.wholeMatch(of: regex)) | ||
XCTAssertEqual(match.0, input[...]) | ||
XCTAssertEqual(match.1, "johnappleseed") | ||
} | ||
do { | ||
let input = "johnappleseed: 12." | ||
let numberRegex = try! Regex(#"(\d+)\.?"#) | ||
let regex = Regex { | ||
Capture { | ||
OneOrMore(.word) | ||
} | ||
ZeroOrMore(.whitespace) | ||
":" | ||
ZeroOrMore(.whitespace) | ||
Capture { numberRegex } | ||
} | ||
let match = try XCTUnwrap(input.wholeMatch(of: regex)) | ||
XCTAssertEqual(match.0, input[...]) | ||
XCTAssertEqual(match.1, "johnappleseed") | ||
XCTAssertEqual(match.2[0].value as? Substring, "12.") | ||
XCTAssertEqual(match.2[1].value as? Substring, "12") | ||
} | ||
do { | ||
let input = "johnappleseed: 12." | ||
// Anchors should be with respect to the entire input. | ||
let numberRegex = try! Regex(#"^(\d+)\.?"#) | ||
let regex = Regex { | ||
Capture { | ||
OneOrMore(.word) | ||
} | ||
ZeroOrMore(.whitespace) | ||
":" | ||
ZeroOrMore(.whitespace) | ||
Capture { numberRegex } | ||
} | ||
XCTAssertNil(input.wholeMatch(of: regex)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Positive match tests with anchors? |
||
} | ||
do { | ||
let input = "johnappleseed: 12.[12]" | ||
// Backreferences in a type-erased regex are scoped to the type-erased | ||
// regex itself. `\1` here should refer to "12", not "johnappleseed". | ||
let numberRegex = try! Regex(#"(\d+)\.?\[\1\]"#) | ||
let regex = Regex { | ||
Capture { | ||
OneOrMore(.word) | ||
} | ||
ZeroOrMore(.whitespace) | ||
":" | ||
ZeroOrMore(.whitespace) | ||
Capture { numberRegex } | ||
} | ||
let match = try XCTUnwrap(input.wholeMatch(of: regex)) | ||
XCTAssertEqual(match.0, input[...]) | ||
XCTAssertEqual(match.1, "johnappleseed") | ||
XCTAssertEqual(match.2[0].value as? Substring, "12.[12]") | ||
XCTAssertEqual(match.2[1].value as? Substring, "12") | ||
} | ||
} | ||
} | ||
|
||
extension Unicode.Scalar { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Matchers don't get the subject bounds, so I don't see how this would support anchors that refer to the subject bounds. We could have a different matcher interface or else add subject bounds as an additional parameter for the internal code (probably not API though).