Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update dependencies #329

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions logos-codegen/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ fnv = "1.0.6"
syn = { version = "2.0.13", features = ["full"] }
quote = "1.0.3"
proc-macro2 = "1.0.9"
regex-syntax = "0.6"
regex-syntax = "0.7"
lazy_static = "1.4.0"

[dev-dependencies]
pretty_assertions = "0.6.1"
pretty_assertions = "1.4"
25 changes: 7 additions & 18 deletions logos-codegen/src/graph/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::fmt::Debug;
use regex_syntax::utf8::Utf8Sequences;

use crate::graph::{Disambiguate, Fork, Graph, Node, NodeId, Range, ReservedId, Rope};
use crate::mir::{Class, ClassUnicode, Literal, Mir};
use crate::mir::{Class, ClassUnicode, Mir};

impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
pub fn regex(&mut self, mir: Mir, then: NodeId) -> NodeId {
Expand Down Expand Up @@ -51,14 +51,8 @@ impl<Leaf: Disambiguate + Debug> Graph<Leaf> {

self.insert_or_push(reserved, fork)
}
Mir::Literal(literal) => {
let pattern = match literal {
Literal::Unicode(unicode) => {
unicode.encode_utf8(&mut [0; 4]).as_bytes().to_vec()
}
Literal::Byte(byte) => [byte].to_vec(),
};

Mir::Literal(c) => {
let pattern = c.encode_utf8(&mut [0u8; 4]).as_bytes().to_vec();
self.insert_or_push(reserved, Rope::new(pattern, then).miss(miss))
}
Mir::Concat(mut concat) => {
Expand All @@ -71,18 +65,13 @@ impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
let mut then = then;

let mut handle_bytes = |graph: &mut Self, mir, then: &mut NodeId| match mir {
Mir::Literal(Literal::Unicode(u)) => {
cur -= u.len_utf8();
for (i, byte) in u.encode_utf8(&mut [0; 4]).bytes().enumerate() {
Mir::Literal(c) => {
cur -= c.len_utf8();
for (i, byte) in c.encode_utf8(&mut [0u8; 4]).bytes().enumerate() {
ropebuf[cur + i] = byte.into();
}
None
}
Mir::Literal(Literal::Byte(byte)) => {
cur -= 1;
ropebuf[cur] = byte.into();
None
}
Mir::Class(Class::Unicode(class)) if is_one_ascii(&class) => {
cur -= 1;
ropebuf[cur] = class.ranges()[0].into();
Expand Down Expand Up @@ -217,7 +206,7 @@ mod tests {

let mir = Mir::utf8("a|b").unwrap();

assert_eq!(mir.priority(), 2);
assert_eq!(mir.priority(), 1);

let leaf = graph.push(Node::Leaf("LEAF"));
let id = graph.regex(mir, leaf);
Expand Down
110 changes: 56 additions & 54 deletions logos-codegen/src/mir.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use std::convert::TryFrom;

use lazy_static::lazy_static;
use regex_syntax::hir::{Hir, HirKind, RepetitionKind, RepetitionRange};
use regex_syntax::hir::{Dot, Hir, HirKind, Repetition};
use regex_syntax::ParserBuilder;

pub use regex_syntax::hir::{Class, ClassUnicode, Literal};
Expand All @@ -10,10 +10,10 @@ use crate::error::{Error, Result};

lazy_static! {
/// DOT regex that matches utf8 only.
static ref DOT_UTF8: Hir = Hir::dot(false);
static ref DOT_UTF8: Hir = Hir::dot(Dot::AnyChar);

/// DOT regex that matches any byte.
static ref DOT_BYTES: Hir = Hir::dot(true);
static ref DOT_BYTES: Hir = Hir::dot(Dot::AnyByte);
}

/// Middle Intermediate Representation of the regex, built from
Expand All @@ -28,7 +28,7 @@ pub enum Mir {
Concat(Vec<Mir>),
Alternation(Vec<Mir>),
Class(Class),
Literal(Literal),
Literal(char),
}

impl Mir {
Expand All @@ -48,7 +48,7 @@ impl Mir {
pub fn binary(source: &str) -> Result<Mir> {
Mir::try_from(
ParserBuilder::new()
.allow_invalid_utf8(true)
.utf8(false)
.unicode(false)
.build()
.parse(source)?,
Expand All @@ -58,7 +58,7 @@ impl Mir {
pub fn binary_ignore_case(source: &str) -> Result<Mir> {
Mir::try_from(
ParserBuilder::new()
.allow_invalid_utf8(true)
.utf8(false)
.unicode(false)
.case_insensitive(true)
.build()
Expand Down Expand Up @@ -111,23 +111,38 @@ impl TryFrom<Hir> for Mir {

Ok(Mir::Alternation(alternation))
}
HirKind::Literal(literal) => Ok(Mir::Literal(literal)),
HirKind::Literal(literal) => {
let s = std::str::from_utf8(&*literal.0).unwrap();
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: HirKind::Literal can now contain multiple characters. I'm not sure whether it's guaranteed to be valid UTF-8; if it's allowed to hold arbitrary bytes, this may need a different approach.

let mut chars = s.chars().map(Mir::Literal).peekable();
let c = chars.next().expect("a literal cannot be empty");
if chars.peek().is_some() {
Ok(Mir::Concat(std::iter::once(c).chain(chars).collect()))
Copy link
Author

@therealbnut therealbnut Jul 28, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suspect this is why my PR differs from #320 and doesn't require changing the class priority: I'm producing multiple Mir::Literal nodes for one HirKind::Literal.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That could be an alternative solution; maybe you should refer to it in #320 directly?

} else {
Ok(c)
}
}
HirKind::Class(class) => Ok(Mir::Class(class)),
HirKind::Repetition(repetition) => {
if !repetition.greedy {
let Repetition {
min,
max,
sub,
greedy,
} = repetition;

if !greedy {
return Err("#[regex]: non-greedy parsing is currently unsupported.".into());
}

let kind = repetition.kind;
let is_dot = if repetition.hir.is_always_utf8() {
*repetition.hir == *DOT_UTF8
let is_dot = if sub.properties().is_utf8() {
*sub == *DOT_UTF8
} else {
*repetition.hir == *DOT_BYTES
*sub == *DOT_BYTES
};
let mir = Mir::try_from(*repetition.hir)?;

match kind {
RepetitionKind::ZeroOrMore | RepetitionKind::OneOrMore if is_dot => {
let sub: Mir = Mir::try_from(*sub)?;
match (min, max) {
(0 | 1, None) if is_dot => {
Err(
"#[regex]: \".+\" and \".*\" patterns will greedily consume \
the entire source till the end as Logos does not allow \
Expand All @@ -139,47 +154,34 @@ impl TryFrom<Hir> for Mir {
.into()
)
}
RepetitionKind::ZeroOrOne => Ok(Mir::Maybe(Box::new(mir))),
RepetitionKind::ZeroOrMore => Ok(Mir::Loop(Box::new(mir))),
RepetitionKind::OneOrMore => {
Ok(Mir::Concat(vec![mir.clone(), Mir::Loop(Box::new(mir))]))
}
RepetitionKind::Range(range) => match range {
RepetitionRange::Exactly(n) => {
let mut out = Vec::with_capacity(n as usize);
for _ in 0..n {
out.push(mir.clone());
}
Ok(Mir::Concat(out))
}
RepetitionRange::AtLeast(n) => {
let mut out = Vec::with_capacity(n as usize);
for _ in 0..n {
out.push(mir.clone());
}
out.push(Mir::Loop(Box::new(mir)));
Ok(Mir::Concat(out))
}
RepetitionRange::Bounded(n, m) => {
let mut out = Vec::with_capacity(m as usize);
for _ in 0..n {
out.push(mir.clone());
}
for _ in n..m {
out.push(Mir::Maybe(Box::new(mir.clone())));
}
Ok(Mir::Concat(out))
}
},
// ZeroOrOne
(0, Some(1)) => Ok(Mir::Maybe(Box::new(sub))),
// ZeroOrMore
(0, None) => Ok(Mir::Loop(Box::new(sub))),
// OneOrMore
(1, None) => Ok(Mir::Concat(vec![sub.clone(), Mir::Loop(Box::new(sub))])),
// Exactly
(n, Some(m)) if n == m => Ok(Mir::Concat(
std::iter::repeat(sub).take(n as usize).collect(),
)),
// AtLeast
(n, None) => Ok(Mir::Concat(
(std::iter::repeat(sub.clone()).take(n as usize))
.chain([Mir::Loop(Box::new(sub))])
.collect(),
)),
// Bounded
(n, Some(m)) => Ok(Mir::Concat(
(std::iter::repeat(sub.clone()).take(n as usize))
.chain(std::iter::repeat(Mir::Maybe(Box::new(sub))).take((n..m).len()))
.collect(),
)),
}
}
HirKind::Group(group) => Mir::try_from(*group.hir),
HirKind::WordBoundary(_) => {
Err("#[regex]: word boundaries are currently unsupported.".into())
}
HirKind::Anchor(_) => {
Err("#[regex]: anchors in #[regex] are currently unsupported.".into())
HirKind::Look(_) => {
Err("#[regex]: lookahead and lookbehind are currently unsupported.".into())
}
HirKind::Capture(capture) => Mir::try_from(*capture.sub),
}
}
}
Expand All @@ -192,7 +194,7 @@ mod tests {
fn priorities() {
let regexes = [
("[a-z]+", 1),
("a|b", 2),
("a|b", 1),
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: the HIR representation here changes from Alternate(['a', 'b']) to Class('a'..='b'), which is probably the result of an upstream optimisation. If the test wants it to be an alternate, then it's probably better to use a|c.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As discussed in #320, I don't think we should just adapt our code to every possible optimization. Otherwise, this could become a never-ending game where each optimizer in regex-syntax leads to a breaking change in logos :'-)

Copy link
Author

@therealbnut therealbnut Aug 2, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see your point, but it seems like the only sensible way forward from that is to fork regex-syntax. Some optimisations will not be invertible.

Alternatively, the priority function could be based on the number of nodes/edges in a regex-syntax derived canonical DFA, which I assume will be stable across optimisations?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I don’t think optimisations need to be lossy. To me, an optimisation results in the same behavior, which is what the priority should compute.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To clarify, by “lossy” I meant “non-invertible”: optimisations in regex-syntax may rewrite a pattern in a way that leaves us without enough information to get back to the form we expect.

However we should always be able to get to the same canonical DFA.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

("a|[b-z]", 1),
("(foo)+", 6),
("foobar", 12),
Expand Down
23 changes: 7 additions & 16 deletions logos-codegen/src/parser/ignore_flags.rs
Original file line number Diff line number Diff line change
Expand Up @@ -210,16 +210,16 @@ pub mod ascii_case {
fn make_ascii_case_insensitive(self) -> Mir {
if self.is_ascii_lowercase() {
Mir::Alternation(vec![
Mir::Literal(hir::Literal::Byte(self - 32)),
Mir::Literal(hir::Literal::Byte(self)),
Mir::Literal((self - 32) as char),
Mir::Literal(self as char),
])
} else if self.is_ascii_uppercase() {
Mir::Alternation(vec![
Mir::Literal(hir::Literal::Byte(self)),
Mir::Literal(hir::Literal::Byte(self + 32)),
Mir::Literal(self as char),
Mir::Literal((self + 32) as char),
])
} else {
Mir::Literal(hir::Literal::Byte(self))
Mir::Literal(self as char)
}
}
}
Expand All @@ -229,16 +229,7 @@ pub mod ascii_case {
if self.is_ascii() {
(self as u8).make_ascii_case_insensitive()
} else {
Mir::Literal(hir::Literal::Unicode(self))
}
}
}

impl MakeAsciiCaseInsensitive for hir::Literal {
fn make_ascii_case_insensitive(self) -> Mir {
match self {
hir::Literal::Byte(b) => b.make_ascii_case_insensitive(),
hir::Literal::Unicode(c) => c.make_ascii_case_insensitive(),
Mir::Literal(self)
}
}
}
Expand Down Expand Up @@ -360,7 +351,7 @@ pub mod ascii_case {
.collect(),
),
Mir::Class(c) => c.make_ascii_case_insensitive(),
Mir::Literal(l) => l.make_ascii_case_insensitive(),
Mir::Literal(c) => c.make_ascii_case_insensitive(),
}
}
}
Expand Down