Skip to content

Commit

Permalink
feat: lalr parsers
Browse files Browse the repository at this point in the history
  • Loading branch information
umut-sahin committed Sep 9, 2024
1 parent d322688 commit 76ade26
Show file tree
Hide file tree
Showing 11 changed files with 1,305 additions and 94 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ authors = [
]

[dependencies]
clap = { version = "4.5", optional = true, features = ["derive"] }
colored = { version = "2.1" }
dirs = { version = "5.0", optional = true }
indexmap = { version = "2.3" }
Expand All @@ -29,7 +30,7 @@ criterion = { version = "0.5", features = ["html_reports"] }

[features]
default = ["repl"]
repl = ["dirs", "rustyline"]
repl = ["clap", "dirs", "rustyline"]

[profile.release]
lto = "fat"
Expand Down
87 changes: 82 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
* [5) Constructing ACTION and GOTO tables](#5-constructing-action-and-goto-tables)
* [6) Tokenizing the input](#6-tokenizing-the-input)
* [7) Parsing the tokens](#7-parsing-the-tokens)
* [Can I have an LALR(1) parser instead of an LR(1) parser?](#can-i-have-an-lalr1-parser-instead-of-an-lr1-parser)
* [Any benchmarks?](#any-benchmarks)
* [Can I modify it?](#can-i-modify-it)
* [Which resources did you use when creating this?](#which-resources-did-you-use-when-creating-this)
Expand Down Expand Up @@ -307,7 +308,7 @@ fn main() {
return;
}
};
let parser = match Parser::new(grammar) {
let parser = match Parser::lr(grammar) {
Ok(parser) => parser,
Err(error) => {
eprintln!("parser error: {}", error);
Expand Down Expand Up @@ -966,6 +967,68 @@ E
└─ 1
```

## Can I have an LALR(1) parser instead of an LR(1) parser?

Yes, `dotlr` supports both LR(1) and LALR(1) parsers!

In the CLI, you can simply use the `--lalr` option:

```shell
dotlr --lalr grammar.lr "1 + 2 * 3"
```

And in the API, you can use `Parser::lalr` instead of `Parser::lr`:

```rust
Parser::lalr(grammar)
```

LALR(1) parser construction is very similar to LR(1) parser construction. The only difference is
that after [4) Constructing the LR(1) automaton](#4-constructing-the-lr1-automaton), there is
another step to reduce the LR(1) automaton to an LALR(1) automaton, using the following algorithm:

```python
# Iterate over the state pairs of the automaton.
for state1 in automaton.states:
for state2 in automaton.states:
# Check if the states share the same core.
# Which means their items are the same ignoring the lookaheads.

# Here is an example:
# ...
# +-------+------------------------+--------------+---------------+
# | 3 | T -> %id . '(' E ')' | { $, '+' } | '(' -> 4 |
# | | T -> %id . | { $, '+' } | |
# +-------+------------------------+--------------+---------------+
# ...
# +-------+------------------------+--------------+---------------+
# | 6 | T -> %id . '(' E ')' | { ')', '+' } | '(' -> 7 |
# | | T -> %id . | { ')', '+' } | |
# +-------+------------------------+--------------+---------------+
# ...

if state1.core == state2.core:
# Merge the states.
# Which is combining lookaheads of the same items.
# Transitions should be mapped to the new states as well.

# Here is the merge of the two states in the example above:
# ...
# +-------+------------------------+-----------------+--------------+
# | 3 | T -> %id . '(' E ')' | { $, '+', ')' } | '(' -> 4 |
# | | T -> %id . | { $, '+', ')' } | |
# +-------+------------------------+-----------------+--------------+
# ...

automaton.merge_states(state1, state2)
```

The actual implementation is a bit more involved, but the idea is exactly this.
Luckily, it's documented extensively at
[automaton.rs](https://github.com/umut-sahin/dotlr/blob/main/src/automaton.rs)
(search for `to_lalr`). I highly recommend reading the comments in the
source to understand the nuances of the implementation.

## Any benchmarks?

Yes, even though `dotlr` isn't a performance focused project, I thought it'd be interesting to have
Expand All @@ -983,13 +1046,27 @@ This command prints the following on my computer with an `Intel i7-12700K` C
```
...
Parsing JSON/Simple time: [262.04 ms 263.31 ms 264.60 ms]
thrpt: [94.218 MiB/s 94.680 MiB/s 95.138 MiB/s]
Parsing JSON/Simple LR(1)
time: [260.33 ms 265.74 ms 269.29 ms]
thrpt: [92.578 MiB/s 93.815 MiB/s 95.765 MiB/s]
...
Parsing JSON/Simple LALR(1)
time: [287.93 ms 288.71 ms 289.49 ms]
thrpt: [86.119 MiB/s 86.350 MiB/s 86.583 MiB/s]
...
Parsing JSON/Optimized LR(1)
time: [211.55 ms 211.71 ms 211.90 ms]
thrpt: [117.65 MiB/s 117.76 MiB/s 117.85 MiB/s]
...
Parsing JSON/Optimized time: [181.44 ms 181.63 ms 181.82 ms]
thrpt: [137.11 MiB/s 137.26 MiB/s 137.40 MiB/s]
Parsing JSON/Optimized LALR(1)
time: [192.66 ms 193.53 ms 194.39 ms]
thrpt: [128.25 MiB/s 128.82 MiB/s 129.40 MiB/s]
...
```
Expand Down
27 changes: 18 additions & 9 deletions benches/parsing_json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,26 @@ fn benchmark_parsing_json(criterion: &mut Criterion) {

for (name, definition) in grammars {
let grammar = Grammar::parse(definition).unwrap();
let parser = Parser::new(grammar).unwrap();
let tokens = parser.tokenize(input).unwrap();
group.bench_function(name, |b| {
b.iter_batched(
|| tokens.clone(),
|tokens| {
criterion::black_box(parser.parse(tokens).unwrap());
for lalr in [false, true] {
let parser = if lalr {
Parser::lalr(grammar.clone()).unwrap()
} else {
Parser::lr(grammar.clone()).unwrap()
};
let tokens = parser.tokenize(input).unwrap();
group.bench_function(
format!("{} {}(1)", name, if lalr { "LALR" } else { "LR" }),
|b| {
b.iter_batched(
|| tokens.clone(),
|tokens| {
criterion::black_box(parser.parse(tokens)).unwrap();
},
BatchSize::PerIteration,
);
},
BatchSize::PerIteration,
);
});
}
}
}

Expand Down
2 changes: 1 addition & 1 deletion examples/calculator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ fn main() -> ExitCode {

let grammar_string = include_str!("../assets/grammars/correct/calculator.lr");
let grammar = Grammar::parse(grammar_string).expect("invalid grammar");
let parser = Parser::new(grammar).expect("unsupported grammar");
let parser = Parser::lr(grammar).expect("unsupported grammar");

match args.next() {
Some(input) => calculate(&parser, &input),
Expand Down
2 changes: 1 addition & 1 deletion examples/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ fn main() {
let grammar_string = include_str!("../assets/grammars/correct/json.lr");
let grammar = Grammar::parse(grammar_string).expect("invalid grammar");

let parser = Parser::new(grammar).expect("unsupported grammar");
let parser = Parser::lalr(grammar).expect("unsupported grammar");

let input = include_str!("../assets/data/sample.json");
let tokens = parser.tokenize(input).expect("tokenization failed");
Expand Down
145 changes: 144 additions & 1 deletion src/automaton.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ impl Display for Item {


/// State of an LR(1) automaton.
#[derive(Debug, Default, Eq)]
#[derive(Clone, Debug, Default, Eq)]
pub struct State {
id: usize,
items: SmallVec<[Item; 2]>,
Expand Down Expand Up @@ -254,6 +254,149 @@ impl Automaton {
}
}

impl Automaton {
    /// Converts the LR(1) automaton into an LALR(1) automaton.
    ///
    /// LALR(1) states are obtained by merging the LR(1) states that share the same
    /// core (i.e., the same items when lookaheads are ignored). Merging a group of
    /// states unions the lookahead sets of the corresponding items, and all
    /// transitions are remapped to point at the merged states.
    pub fn to_lalr(self) -> Automaton {
        // We'll start by computing the states that share the same core.
        // Core of a state is its items without the lookahead.
        // In the end we want `state_groups` to be something like:
        // [
        //     { 0 },      -> New state 0 will be the copy of the original state 0
        //     { 1 },      -> New state 1 will be the copy of the original state 1
        //     { 2, 9 },   -> New state 2 will be the merge of the original states 2 and 9
        //     { 3, 6 },   -> New state 3 will be the merge of the original states 3 and 6
        //     { 4, 7 },   -> New state 4 will be the merge of the original states 4 and 7
        //     { 5, 8 },   -> New state 5 will be the merge of the original states 5 and 8
        //     { 10, 13 }, -> New state 6 will be the merge of the original states 10 and 13
        //     { 11, 14 }, -> New state 7 will be the merge of the original states 11 and 14
        //     { 12, 15 }, -> New state 8 will be the merge of the original states 12 and 15
        // ]
        let mut state_groups = Vec::<IndexSet<usize>>::new();
        for (state_index, state) in self.states.iter().enumerate() {
            let mut group = None;
            for state_group in state_groups.iter_mut() {
                assert!(!state_group.is_empty());

                // Every state in a group has the same core, so comparing against
                // the first member of the group is enough.
                let candidate_index = state_group.iter().next().unwrap();
                let candidate_state = &self.states[*candidate_index];

                if state.items.len() == candidate_state.items.len() {
                    let mut can_be_merged = true;
                    for item in state.items.iter() {
                        let mut candidate_state_has_same_item_without_lookahead = false;
                        for candidate_item in candidate_state.items.iter() {
                            if item.dot == candidate_item.dot && item.rule == candidate_item.rule {
                                candidate_state_has_same_item_without_lookahead = true;
                                break;
                            }
                        }
                        if !candidate_state_has_same_item_without_lookahead {
                            can_be_merged = false;
                            break;
                        }
                    }
                    if can_be_merged {
                        group = Some(state_group);
                        // Cores partition the states, so at most one group can
                        // match. Stop scanning the remaining groups.
                        break;
                    }
                }
            }
            match group {
                Some(group) => {
                    group.insert(state_index);
                },
                None => {
                    state_groups.push(IndexSet::from([state_index]));
                },
            }
        }

        // Now we'll compute the mapping from the old states to the new states.
        // In the end we want `state_map` to be something like:
        // {
        //     0: 0,  -> Original state 0 will become the new state 0
        //     1: 1,  -> Original state 1 will become the new state 1
        //     2: 2,  -> Original state 2 will become the new state 2
        //     3: 3,  -> Original state 3 will become the new state 3
        //     4: 4,  -> Original state 4 will become the new state 4
        //     5: 5,  -> Original state 5 will become the new state 5
        //     6: 3,  -> Original state 6 will become the new state 3
        //     7: 4,  -> Original state 7 will become the new state 4
        //     8: 5,  -> Original state 8 will become the new state 5
        //     9: 2,  -> Original state 9 will become the new state 2
        //     10: 6, -> Original state 10 will become the new state 6
        //     11: 7, -> Original state 11 will become the new state 7
        //     12: 8, -> Original state 12 will become the new state 8
        //     13: 6, -> Original state 13 will become the new state 6
        //     14: 7, -> Original state 14 will become the new state 7
        //     15: 8, -> Original state 15 will become the new state 8
        // }
        let mut state_map = BTreeMap::<usize, usize>::new();
        for (new_state_index, state_group) in state_groups.iter().enumerate() {
            for old_state_index in state_group.iter().copied() {
                state_map.insert(old_state_index, new_state_index);
            }
        }

        // Finally, we compute the new states.
        let mut new_states = Vec::<State>::with_capacity(state_groups.len());
        for (id, state_group) in state_groups.into_iter().enumerate() {
            // We'll create a new state for each group in `state_groups`.

            // We make sure that the group is not empty, which shouldn't happen.
            assert!(!state_group.is_empty());

            // Get an iterator of the indices of the states to merge.
            let mut state_indices = state_group.into_iter();

            // Create the new state from the first original state.
            let mut new_state = self.states[state_indices.next().unwrap()].clone();

            // Set the id of the state to the index of the group.
            new_state.id = id;

            // Update the transitions of the new state according to `state_map`.
            for next_state in new_state.transitions.values_mut() {
                *next_state = state_map[next_state];
            }

            // Merge the new state with other states in the group.
            for state_index in state_indices {
                // Get the state to merge.
                let state_to_merge = &self.states[state_index];

                // Make sure the state is merged into the correct state.
                assert_eq!(state_map[&state_to_merge.id], id);

                // Make sure the transitions of the state are the same as the new state.
                for (atomic_pattern, next_state) in state_to_merge.transitions.iter() {
                    assert!(new_state.transitions.contains_key(atomic_pattern));
                    assert_eq!(new_state.transitions[atomic_pattern], state_map[next_state])
                }

                // Extend the lookahead of the items of the new state.
                for item in state_to_merge.items.iter() {
                    let mut merged = false;
                    for new_item in new_state.items.iter_mut() {
                        if new_item.dot == item.dot && new_item.rule == item.rule {
                            new_item.lookahead.extend(item.lookahead.iter().cloned());
                            merged = true;
                            break;
                        }
                    }
                    // Make sure the item existed in both states.
                    assert!(merged);
                }
            }

            // Add the merged state to the new states.
            new_states.push(new_state);
        }

        // Create the LALR(1) automaton using the new states.
        Automaton { states: new_states }
    }
}

impl Automaton {
/// Gets the states of the automaton.
pub fn states(&self) -> &[State] {
Expand Down
2 changes: 1 addition & 1 deletion src/grammar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ impl Display for Rule {


/// Grammar of a language.
#[derive(Debug)]
#[derive(Clone, Debug)]
pub struct Grammar {
symbols: IndexSet<Symbol>,
start_symbol: Symbol,
Expand Down
Loading

0 comments on commit 76ade26

Please sign in to comment.