maciejhirsz · therealbnut · Jul 28, 2023 · therealbnut · Jul 28, 2023 · therealbnut
diff --git a/logos-codegen/Cargo.toml b/logos-codegen/Cargo.toml
@@ -17,8 +17,8 @@ fnv = "1.0.6"
 syn = { version = "2.0.13", features = ["full"] }
 quote = "1.0.3"
 proc-macro2 = "1.0.9"
-regex-syntax = "0.6"
+regex-syntax = "0.7"
 lazy_static = "1.4.0"
 
 [dev-dependencies]
-pretty_assertions = "0.6.1"
+pretty_assertions = "1.4"
diff --git a/logos-codegen/src/graph/regex.rs b/logos-codegen/src/graph/regex.rs
@@ -3,7 +3,7 @@ use std::fmt::Debug;
 use regex_syntax::utf8::Utf8Sequences;
 
 use crate::graph::{Disambiguate, Fork, Graph, Node, NodeId, Range, ReservedId, Rope};
-use crate::mir::{Class, ClassUnicode, Literal, Mir};
+use crate::mir::{Class, ClassUnicode, Mir};
 
 impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
     pub fn regex(&mut self, mir: Mir, then: NodeId) -> NodeId {
@@ -51,14 +51,8 @@ impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
 
                 self.insert_or_push(reserved, fork)
             }
-            Mir::Literal(literal) => {
-                let pattern = match literal {
-                    Literal::Unicode(unicode) => {
-                        unicode.encode_utf8(&mut [0; 4]).as_bytes().to_vec()
-                    }
-                    Literal::Byte(byte) => [byte].to_vec(),
-                };
-
+            Mir::Literal(c) => {
+                let pattern = c.encode_utf8(&mut [0u8; 4]).as_bytes().to_vec();
                 self.insert_or_push(reserved, Rope::new(pattern, then).miss(miss))
             }
             Mir::Concat(mut concat) => {
@@ -71,18 +65,13 @@ impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
                 let mut then = then;
 
                 let mut handle_bytes = |graph: &mut Self, mir, then: &mut NodeId| match mir {
-                    Mir::Literal(Literal::Unicode(u)) => {
-                        cur -= u.len_utf8();
-                        for (i, byte) in u.encode_utf8(&mut [0; 4]).bytes().enumerate() {
+                    Mir::Literal(c) => {
+                        cur -= c.len_utf8();
+                        for (i, byte) in c.encode_utf8(&mut [0u8; 4]).bytes().enumerate() {
                             ropebuf[cur + i] = byte.into();
                         }
                         None
                     }
-                    Mir::Literal(Literal::Byte(byte)) => {
-                        cur -= 1;
-                        ropebuf[cur] = byte.into();
-                        None
-                    }
                     Mir::Class(Class::Unicode(class)) if is_one_ascii(&class) => {
                         cur -= 1;
                         ropebuf[cur] = class.ranges()[0].into();
@@ -217,7 +206,7 @@ mod tests {
 
         let mir = Mir::utf8("a|b").unwrap();
 
-        assert_eq!(mir.priority(), 2);
+        assert_eq!(mir.priority(), 1);
 
         let leaf = graph.push(Node::Leaf("LEAF"));
         let id = graph.regex(mir, leaf);

diff --git a/logos-codegen/src/mir.rs b/logos-codegen/src/mir.rs
@@ -1,7 +1,7 @@
 use std::convert::TryFrom;
 
 use lazy_static::lazy_static;
-use regex_syntax::hir::{Hir, HirKind, RepetitionKind, RepetitionRange};
+use regex_syntax::hir::{Dot, Hir, HirKind, Repetition};
 use regex_syntax::ParserBuilder;
 
 pub use regex_syntax::hir::{Class, ClassUnicode, Literal};
@@ -10,10 +10,10 @@ use crate::error::{Error, Result};
 
 lazy_static! {
     /// DOT regex that matches utf8 only.
-    static ref DOT_UTF8: Hir = Hir::dot(false);
+    static ref DOT_UTF8: Hir = Hir::dot(Dot::AnyChar);
 
     /// DOT regex that matches any byte.
-    static ref DOT_BYTES: Hir = Hir::dot(true);
+    static ref DOT_BYTES: Hir = Hir::dot(Dot::AnyByte);
 }
 
 /// Middle Intermediate Representation of the regex, built from
@@ -28,7 +28,7 @@ pub enum Mir {
     Concat(Vec<Mir>),
     Alternation(Vec<Mir>),
     Class(Class),
-    Literal(Literal),
+    Literal(char),
 }
 
 impl Mir {
@@ -48,7 +48,7 @@ impl Mir {
     pub fn binary(source: &str) -> Result<Mir> {
         Mir::try_from(
             ParserBuilder::new()
-                .allow_invalid_utf8(true)
+                .utf8(false)
                 .unicode(false)
                 .build()
                 .parse(source)?,
@@ -58,7 +58,7 @@ impl Mir {
     pub fn binary_ignore_case(source: &str) -> Result<Mir> {
         Mir::try_from(
             ParserBuilder::new()
-                .allow_invalid_utf8(true)
+                .utf8(false)
                 .unicode(false)
                 .case_insensitive(true)
                 .build()
@@ -111,23 +111,38 @@ impl TryFrom<Hir> for Mir {
 
                 Ok(Mir::Alternation(alternation))
             }
-            HirKind::Literal(literal) => Ok(Mir::Literal(literal)),
+            HirKind::Literal(literal) => {
+                let s = std::str::from_utf8(&*literal.0).unwrap();
+                let mut chars = s.chars().map(Mir::Literal).peekable();
+                let c = chars.next().expect("a literal cannot be empty");
+                if chars.peek().is_some() {
+                    Ok(Mir::Concat(std::iter::once(c).chain(chars).collect()))
+                } else {
+                    Ok(c)
+                }
+            }
             HirKind::Class(class) => Ok(Mir::Class(class)),
             HirKind::Repetition(repetition) => {
-                if !repetition.greedy {
+                let Repetition {
+                    min,
+                    max,
+                    sub,
+                    greedy,
+                } = repetition;
+
+                if !greedy {
                     return Err("#[regex]: non-greedy parsing is currently unsupported.".into());
                 }
 
-                let kind = repetition.kind;
-                let is_dot = if repetition.hir.is_always_utf8() {
-                    *repetition.hir == *DOT_UTF8
+                let is_dot = if sub.properties().is_utf8() {
+                    *sub == *DOT_UTF8
                 } else {
-                    *repetition.hir == *DOT_BYTES
+                    *sub == *DOT_BYTES
                 };
-                let mir = Mir::try_from(*repetition.hir)?;
 
-                match kind {
-                    RepetitionKind::ZeroOrMore | RepetitionKind::OneOrMore if is_dot => {
+                let sub: Mir = Mir::try_from(*sub)?;
+                match (min, max) {
+                    (0 | 1, None) if is_dot => {
                         Err(
                             "#[regex]: \".+\" and \".*\" patterns will greedily consume \
                             the entire source till the end as Logos does not allow \
@@ -139,47 +154,34 @@ impl TryFrom<Hir> for Mir {
                             .into()
                         )
                     }
-                    RepetitionKind::ZeroOrOne => Ok(Mir::Maybe(Box::new(mir))),
-                    RepetitionKind::ZeroOrMore => Ok(Mir::Loop(Box::new(mir))),
-                    RepetitionKind::OneOrMore => {
-                        Ok(Mir::Concat(vec![mir.clone(), Mir::Loop(Box::new(mir))]))
-                    }
-                    RepetitionKind::Range(range) => match range {
-                        RepetitionRange::Exactly(n) => {
-                            let mut out = Vec::with_capacity(n as usize);
-                            for _ in 0..n {
-                                out.push(mir.clone());
-                            }
-                            Ok(Mir::Concat(out))
-                        }
-                        RepetitionRange::AtLeast(n) => {
-                            let mut out = Vec::with_capacity(n as usize);
-                            for _ in 0..n {
-                                out.push(mir.clone());
-                            }
-                            out.push(Mir::Loop(Box::new(mir)));
-                            Ok(Mir::Concat(out))
-                        }
-                        RepetitionRange::Bounded(n, m) => {
-                            let mut out = Vec::with_capacity(m as usize);
-                            for _ in 0..n {
-                                out.push(mir.clone());
-                            }
-                            for _ in n..m {
-                                out.push(Mir::Maybe(Box::new(mir.clone())));
-                            }
-                            Ok(Mir::Concat(out))
-                        }
-                    },
+                    // ZeroOrOne
+                    (0, Some(1)) => Ok(Mir::Maybe(Box::new(sub))),
+                    // ZeroOrMore
+                    (0, None) => Ok(Mir::Loop(Box::new(sub))),
+                    // OneOrMore
+                    (1, None) => Ok(Mir::Concat(vec![sub.clone(), Mir::Loop(Box::new(sub))])),
+                    // Exactly
+                    (n, Some(m)) if n == m => Ok(Mir::Concat(
+                        std::iter::repeat(sub).take(n as usize).collect(),
+                    )),
+                    // AtLeast
+                    (n, None) => Ok(Mir::Concat(
+                        (std::iter::repeat(sub.clone()).take(n as usize))
+                            .chain([Mir::Loop(Box::new(sub))])
+                            .collect(),
+                    )),
+                    // Bounded
+                    (n, Some(m)) => Ok(Mir::Concat(
+                        (std::iter::repeat(sub.clone()).take(n as usize))
+                            .chain(std::iter::repeat(Mir::Maybe(Box::new(sub))).take((n..m).len()))
+                            .collect(),
+                    )),
                 }
             }
-            HirKind::Group(group) => Mir::try_from(*group.hir),
-            HirKind::WordBoundary(_) => {
-                Err("#[regex]: word boundaries are currently unsupported.".into())
-            }
-            HirKind::Anchor(_) => {
-                Err("#[regex]: anchors in #[regex] are currently unsupported.".into())
+            HirKind::Look(_) => {
+                Err("#[regex]: lookahead and lookbehind are currently unsupported.".into())
             }
+            HirKind::Capture(capture) => Mir::try_from(*capture.sub),
         }
     }
 }
@@ -192,7 +194,7 @@ mod tests {
     fn priorities() {
         let regexes = [
             ("[a-z]+", 1),
-            ("a|b", 2),
+            ("a|b", 1),
             ("a|[b-z]", 1),
             ("(foo)+", 6),
             ("foobar", 12),

diff --git a/logos-codegen/src/parser/ignore_flags.rs b/logos-codegen/src/parser/ignore_flags.rs
@@ -210,16 +210,16 @@ pub mod ascii_case {
         fn make_ascii_case_insensitive(self) -> Mir {
             if self.is_ascii_lowercase() {
                 Mir::Alternation(vec![
-                    Mir::Literal(hir::Literal::Byte(self - 32)),
-                    Mir::Literal(hir::Literal::Byte(self)),
+                    Mir::Literal((self - 32) as char),
+                    Mir::Literal(self as char),
                 ])
             } else if self.is_ascii_uppercase() {
                 Mir::Alternation(vec![
-                    Mir::Literal(hir::Literal::Byte(self)),
-                    Mir::Literal(hir::Literal::Byte(self + 32)),
+                    Mir::Literal(self as char),
+                    Mir::Literal((self + 32) as char),
                 ])
             } else {
-                Mir::Literal(hir::Literal::Byte(self))
+                Mir::Literal(self as char)
             }
         }
     }
@@ -229,16 +229,7 @@ pub mod ascii_case {
             if self.is_ascii() {
                 (self as u8).make_ascii_case_insensitive()
             } else {
-                Mir::Literal(hir::Literal::Unicode(self))
-            }
-        }
-    }
-
-    impl MakeAsciiCaseInsensitive for hir::Literal {
-        fn make_ascii_case_insensitive(self) -> Mir {
-            match self {
-                hir::Literal::Byte(b) => b.make_ascii_case_insensitive(),
-                hir::Literal::Unicode(c) => c.make_ascii_case_insensitive(),
+                Mir::Literal(self)
             }
         }
     }
@@ -360,7 +351,7 @@ pub mod ascii_case {
                         .collect(),
                 ),
                 Mir::Class(c) => c.make_ascii_case_insensitive(),
-                Mir::Literal(l) => l.make_ascii_case_insensitive(),
+                Mir::Literal(c) => c.make_ascii_case_insensitive(),
             }
         }
     }