-
-
Notifications
You must be signed in to change notification settings - Fork 131
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update dependencies #329
Update dependencies #329
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
use std::convert::TryFrom; | ||
|
||
use lazy_static::lazy_static; | ||
use regex_syntax::hir::{Hir, HirKind, RepetitionKind, RepetitionRange}; | ||
use regex_syntax::hir::{Dot, Hir, HirKind, Repetition}; | ||
use regex_syntax::ParserBuilder; | ||
|
||
pub use regex_syntax::hir::{Class, ClassUnicode, Literal}; | ||
|
@@ -10,10 +10,10 @@ use crate::error::{Error, Result}; | |
|
||
lazy_static! { | ||
/// DOT regex that matches utf8 only. | ||
static ref DOT_UTF8: Hir = Hir::dot(false); | ||
static ref DOT_UTF8: Hir = Hir::dot(Dot::AnyChar); | ||
|
||
/// DOT regex that matches any byte. | ||
static ref DOT_BYTES: Hir = Hir::dot(true); | ||
static ref DOT_BYTES: Hir = Hir::dot(Dot::AnyByte); | ||
} | ||
|
||
/// Middle Intermediate Representation of the regex, built from | ||
|
@@ -28,7 +28,7 @@ pub enum Mir { | |
Concat(Vec<Mir>), | ||
Alternation(Vec<Mir>), | ||
Class(Class), | ||
Literal(Literal), | ||
Literal(char), | ||
} | ||
|
||
impl Mir { | ||
|
@@ -48,7 +48,7 @@ impl Mir { | |
pub fn binary(source: &str) -> Result<Mir> { | ||
Mir::try_from( | ||
ParserBuilder::new() | ||
.allow_invalid_utf8(true) | ||
.utf8(false) | ||
.unicode(false) | ||
.build() | ||
.parse(source)?, | ||
|
@@ -58,7 +58,7 @@ impl Mir { | |
pub fn binary_ignore_case(source: &str) -> Result<Mir> { | ||
Mir::try_from( | ||
ParserBuilder::new() | ||
.allow_invalid_utf8(true) | ||
.utf8(false) | ||
.unicode(false) | ||
.case_insensitive(true) | ||
.build() | ||
|
@@ -111,23 +111,38 @@ impl TryFrom<Hir> for Mir { | |
|
||
Ok(Mir::Alternation(alternation)) | ||
} | ||
HirKind::Literal(literal) => Ok(Mir::Literal(literal)), | ||
HirKind::Literal(literal) => { | ||
let s = std::str::from_utf8(&*literal.0).unwrap(); | ||
let mut chars = s.chars().map(Mir::Literal).peekable(); | ||
let c = chars.next().expect("a literal cannot be empty"); | ||
if chars.peek().is_some() { | ||
Ok(Mir::Concat(std::iter::once(c).chain(chars).collect())) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I suspect this is why my PR differs from #320 and doesn't require changing the class priority: I'm producing multiple [inline code lost in extraction; presumably `Mir::Literal` nodes, one per character]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That could be an alternative solution; maybe you should refer to it in #320 directly? |
||
} else { | ||
Ok(c) | ||
} | ||
} | ||
HirKind::Class(class) => Ok(Mir::Class(class)), | ||
HirKind::Repetition(repetition) => { | ||
if !repetition.greedy { | ||
let Repetition { | ||
min, | ||
max, | ||
sub, | ||
greedy, | ||
} = repetition; | ||
|
||
if !greedy { | ||
return Err("#[regex]: non-greedy parsing is currently unsupported.".into()); | ||
} | ||
|
||
let kind = repetition.kind; | ||
let is_dot = if repetition.hir.is_always_utf8() { | ||
*repetition.hir == *DOT_UTF8 | ||
let is_dot = if sub.properties().is_utf8() { | ||
*sub == *DOT_UTF8 | ||
} else { | ||
*repetition.hir == *DOT_BYTES | ||
*sub == *DOT_BYTES | ||
}; | ||
let mir = Mir::try_from(*repetition.hir)?; | ||
|
||
match kind { | ||
RepetitionKind::ZeroOrMore | RepetitionKind::OneOrMore if is_dot => { | ||
let sub: Mir = Mir::try_from(*sub)?; | ||
match (min, max) { | ||
(0 | 1, None) if is_dot => { | ||
Err( | ||
"#[regex]: \".+\" and \".*\" patterns will greedily consume \ | ||
the entire source till the end as Logos does not allow \ | ||
|
@@ -139,47 +154,34 @@ impl TryFrom<Hir> for Mir { | |
.into() | ||
) | ||
} | ||
RepetitionKind::ZeroOrOne => Ok(Mir::Maybe(Box::new(mir))), | ||
RepetitionKind::ZeroOrMore => Ok(Mir::Loop(Box::new(mir))), | ||
RepetitionKind::OneOrMore => { | ||
Ok(Mir::Concat(vec![mir.clone(), Mir::Loop(Box::new(mir))])) | ||
} | ||
RepetitionKind::Range(range) => match range { | ||
RepetitionRange::Exactly(n) => { | ||
let mut out = Vec::with_capacity(n as usize); | ||
for _ in 0..n { | ||
out.push(mir.clone()); | ||
} | ||
Ok(Mir::Concat(out)) | ||
} | ||
RepetitionRange::AtLeast(n) => { | ||
let mut out = Vec::with_capacity(n as usize); | ||
for _ in 0..n { | ||
out.push(mir.clone()); | ||
} | ||
out.push(Mir::Loop(Box::new(mir))); | ||
Ok(Mir::Concat(out)) | ||
} | ||
RepetitionRange::Bounded(n, m) => { | ||
let mut out = Vec::with_capacity(m as usize); | ||
for _ in 0..n { | ||
out.push(mir.clone()); | ||
} | ||
for _ in n..m { | ||
out.push(Mir::Maybe(Box::new(mir.clone()))); | ||
} | ||
Ok(Mir::Concat(out)) | ||
} | ||
}, | ||
// ZeroOrOne | ||
(0, Some(1)) => Ok(Mir::Maybe(Box::new(sub))), | ||
// ZeroOrMore | ||
(0, None) => Ok(Mir::Loop(Box::new(sub))), | ||
// OneOrMore | ||
(1, None) => Ok(Mir::Concat(vec![sub.clone(), Mir::Loop(Box::new(sub))])), | ||
// Exactly | ||
(n, Some(m)) if n == m => Ok(Mir::Concat( | ||
std::iter::repeat(sub).take(n as usize).collect(), | ||
)), | ||
// AtLeast | ||
(n, None) => Ok(Mir::Concat( | ||
(std::iter::repeat(sub.clone()).take(n as usize)) | ||
.chain([Mir::Loop(Box::new(sub))]) | ||
.collect(), | ||
)), | ||
// Bounded | ||
(n, Some(m)) => Ok(Mir::Concat( | ||
(std::iter::repeat(sub.clone()).take(n as usize)) | ||
.chain(std::iter::repeat(Mir::Maybe(Box::new(sub))).take((n..m).len())) | ||
.collect(), | ||
)), | ||
} | ||
} | ||
HirKind::Group(group) => Mir::try_from(*group.hir), | ||
HirKind::WordBoundary(_) => { | ||
Err("#[regex]: word boundaries are currently unsupported.".into()) | ||
} | ||
HirKind::Anchor(_) => { | ||
Err("#[regex]: anchors in #[regex] are currently unsupported.".into()) | ||
HirKind::Look(_) => { | ||
Err("#[regex]: lookahead and lookbehind are currently unsupported.".into()) | ||
} | ||
HirKind::Capture(capture) => Mir::try_from(*capture.sub), | ||
} | ||
} | ||
} | ||
|
@@ -192,7 +194,7 @@ mod tests { | |
fn priorities() { | ||
let regexes = [ | ||
("[a-z]+", 1), | ||
("a|b", 2), | ||
("a|b", 1), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note, the HIR representation here changes from [inline code lost in extraction; the expected priority of `a|b` drops from 2 to 1]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As discussed in #320, I don't think we should just adapt our code to possible optimizations. Otherwise, this can be a never-ending game where each optimizer in [remainder of sentence lost in extraction]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I see your point, but it seems like the only sensible way forward from that is to fork regex-syntax. Some optimisations will not be invertible. Alternatively, the priority function could be based on the number of nodes/edges in a regex-syntax-derived canonical DFA, which I assume will be stable across optimisations? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hmm, I don’t think optimisations need to be lossy. To me, an optimisation results in the same behavior, which is what the priority should compute. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I clarified that by “lossy” I meant “non-invertible”. I mean that regex-syntax may optimise in a way that leaves us without enough information to get back to the form we expect. However, we should always be able to get to the same canonical DFA. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
("a|[b-z]", 1), | ||
("(foo)+", 6), | ||
("foobar", 12), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note: `HirKind::Literal` can now hold multiple characters. I'm not sure whether it is required to be valid UTF-8; if it is allowed to contain arbitrary bytes, this may need a different approach.