Skip to content

Commit

Permalink
feat: lalr parsers
Browse files Browse the repository at this point in the history
  • Loading branch information
umut-sahin committed Sep 9, 2024
1 parent d322688 commit 76ade26
Show file tree
Hide file tree
Showing 11 changed files with 1,305 additions and 94 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ authors = [
]

[dependencies]
clap = { version = "4.5", optional = true, features = ["derive"] }
colored = { version = "2.1" }
dirs = { version = "5.0", optional = true }
indexmap = { version = "2.3" }
Expand All @@ -29,7 +30,7 @@ criterion = { version = "0.5", features = ["html_reports"] }

[features]
default = ["repl"]
repl = ["dirs", "rustyline"]
repl = ["clap", "dirs", "rustyline"]

[profile.release]
lto = "fat"
Expand Down
87 changes: 82 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
* [5) Constructing ACTION and GOTO tables](#5-constructing-action-and-goto-tables)
* [6) Tokenizing the input](#6-tokenizing-the-input)
* [7) Parsing the tokens](#7-parsing-the-tokens)
* [Can I have an LALR(1) parser instead of an LR(1) parser?](#can-i-have-an-lalr1-parser-instead-of-an-lr1-parser)
* [Any benchmarks?](#any-benchmarks)
* [Can I modify it?](#can-i-modify-it)
* [Which resources did you use when creating this?](#which-resources-did-you-use-when-creating-this)
Expand Down Expand Up @@ -307,7 +308,7 @@ fn main() {
return;
}
};
let parser = match Parser::new(grammar) {
let parser = match Parser::lr(grammar) {
Ok(parser) => parser,
Err(error) => {
eprintln!("parser error: {}", error);
Expand Down Expand Up @@ -966,6 +967,68 @@ E
└─ 1
```

## Can I have an LALR(1) parser instead of an LR(1) parser?

Yes, `dotlr` supports both LR(1) and LALR(1) parsers!

In the CLI, you can simply use the `--lalr` option:

```shell
dotlr --lalr grammar.lr "1 + 2 * 3"
```

And in the API, you can use `Parser::lalr` instead of `Parser::lr`:

```rust
Parser::lalr(grammar)
```

LALR(1) parser construction is very similar to LR(1) parser construction. The only difference is
that after [4) Constructing the LR(1) automaton](#4-constructing-the-lr1-automaton), there is
another step to reduce the LR(1) automaton to an LALR(1) automaton, using the following algorithm:

```python
# Iterate over the state pairs of the automaton.
for state1 in automaton.states:
for state2 in automaton.states:
# Check if the states share the same core.
# Which means their items are the same ignoring the lookaheads.

# Here is an example:
# ...
# +-------+------------------------+--------------+---------------+
# | 3 | T -> %id . '(' E ')' | { $, '+' } | '(' -> 4 |
# | | T -> %id . | { $, '+' } | |
# +-------+------------------------+--------------+---------------+
# ...
# +-------+------------------------+--------------+---------------+
# | 6 | T -> %id . '(' E ')' | { ')', '+' } | '(' -> 7 |
# | | T -> %id . | { ')', '+' } | |
# +-------+------------------------+--------------+---------------+
# ...

if state1.core == state2.core:
# Merge the states.
# Which is combining lookaheads of the same items.
# Transitions should be mapped to the new states as well.

# Here is the merge of the two states in the example above:
# ...
# +-------+------------------------+-----------------+--------------+
# | 3 | T -> %id . '(' E ')' | { $, '+', ')' } | '(' -> 4 |
# | | T -> %id . | { $, '+', ')' } | |
# +-------+------------------------+-----------------+--------------+
# ...

automaton.merge_states(state1, state2)
```

The actual implementation is a bit more involved, but the idea is exactly this.
Luckily, it's documented extensively at
[automaton.rs](https://github.com/umut-sahin/dotlr/blob/main/src/automaton.rs)
(search for `to_lalr`). I highly recommend reading the comments in the
source to understand the nuances of the implementation.

## Any benchmarks?

Yes, even though `dotlr` isn't a performance focused project, I thought it'd be interesting to have
Expand All @@ -983,13 +1046,27 @@ This command prints the following on my computer with an `Intel i7-12700K` C
```
...
Parsing JSON/Simple time: [262.04 ms 263.31 ms 264.60 ms]
thrpt: [94.218 MiB/s 94.680 MiB/s 95.138 MiB/s]
Parsing JSON/Simple LR(1)
time: [260.33 ms 265.74 ms 269.29 ms]
thrpt: [92.578 MiB/s 93.815 MiB/s 95.765 MiB/s]
...
Parsing JSON/Simple LALR(1)
time: [287.93 ms 288.71 ms 289.49 ms]
thrpt: [86.119 MiB/s 86.350 MiB/s 86.583 MiB/s]
...
Parsing JSON/Optimized LR(1)
time: [211.55 ms 211.71 ms 211.90 ms]
thrpt: [117.65 MiB/s 117.76 MiB/s 117.85 MiB/s]
...
Parsing JSON/Optimized time: [181.44 ms 181.63 ms 181.82 ms]
thrpt: [137.11 MiB/s 137.26 MiB/s 137.40 MiB/s]
Parsing JSON/Optimized LALR(1)
time: [192.66 ms 193.53 ms 194.39 ms]
thrpt: [128.25 MiB/s 128.82 MiB/s 129.40 MiB/s]
...
```
Expand Down
27 changes: 18 additions & 9 deletions benches/parsing_json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,26 @@ fn benchmark_parsing_json(criterion: &mut Criterion) {

for (name, definition) in grammars {
let grammar = Grammar::parse(definition).unwrap();
let parser = Parser::new(grammar).unwrap();
let tokens = parser.tokenize(input).unwrap();
group.bench_function(name, |b| {
b.iter_batched(
|| tokens.clone(),
|tokens| {
criterion::black_box(parser.parse(tokens).unwrap());
for lalr in [false, true] {
let parser = if lalr {
Parser::lalr(grammar.clone()).unwrap()
} else {
Parser::lr(grammar.clone()).unwrap()
};
let tokens = parser.tokenize(input).unwrap();
group.bench_function(
format!("{} {}(1)", name, if lalr { "LALR" } else { "LR" }),
|b| {
b.iter_batched(
|| tokens.clone(),
|tokens| {
criterion::black_box(parser.parse(tokens)).unwrap();
},
BatchSize::PerIteration,
);
},
BatchSize::PerIteration,
);
});
}
}
}

Expand Down
2 changes: 1 addition & 1 deletion examples/calculator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ fn main() -> ExitCode {

let grammar_string = include_str!("../assets/grammars/correct/calculator.lr");
let grammar = Grammar::parse(grammar_string).expect("invalid grammar");
let parser = Parser::new(grammar).expect("unsupported grammar");
let parser = Parser::lr(grammar).expect("unsupported grammar");

match args.next() {
Some(input) => calculate(&parser, &input),
Expand Down
2 changes: 1 addition & 1 deletion examples/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ fn main() {
let grammar_string = include_str!("../assets/grammars/correct/json.lr");
let grammar = Grammar::parse(grammar_string).expect("invalid grammar");

let parser = Parser::new(grammar).expect("unsupported grammar");
let parser = Parser::lalr(grammar).expect("unsupported grammar");

let input = include_str!("../assets/data/sample.json");
let tokens = parser.tokenize(input).expect("tokenization failed");
Expand Down
145 changes: 144 additions & 1 deletion src/automaton.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ impl Display for Item {


/// State of an LR(1) automaton.
#[derive(Debug, Default, Eq)]
#[derive(Clone, Debug, Default, Eq)]
pub struct State {
id: usize,
items: SmallVec<[Item; 2]>,
Expand Down Expand Up @@ -254,6 +254,149 @@ impl Automaton {
}
}

impl Automaton {
    /// Converts the LR(1) automaton into an LALR(1) automaton.
    ///
    /// LALR(1) states are obtained by merging the LR(1) states that share the same
    /// core (i.e., the same items when lookaheads are ignored). Merging a group of
    /// states unions the lookahead sets of the corresponding items, and all
    /// transitions are remapped to point at the merged states.
    pub fn to_lalr(self) -> Automaton {
        // We'll start by computing the states that share the same core.
        // Core of a state is its items without the lookahead.
        // In the end we want `state_groups` to be something like:
        // [
        //     { 0 },      -> New state 0 will be the copy of the original state 0
        //     { 1 },      -> New state 1 will be the copy of the original state 1
        //     { 2, 9 },   -> New state 2 will be the merge of the original states 2 and 9
        //     { 3, 6 },   -> New state 3 will be the merge of the original states 3 and 6
        //     { 4, 7 },   -> New state 4 will be the merge of the original states 4 and 7
        //     { 5, 8 },   -> New state 5 will be the merge of the original states 5 and 8
        //     { 10, 13 }, -> New state 6 will be the merge of the original states 10 and 13
        //     { 11, 14 }, -> New state 7 will be the merge of the original states 11 and 14
        //     { 12, 15 }, -> New state 8 will be the merge of the original states 12 and 15
        // ]
        let mut state_groups = Vec::<IndexSet<usize>>::new();
        for (state_index, state) in self.states.iter().enumerate() {
            let mut group = None;
            for state_group in state_groups.iter_mut() {
                assert!(!state_group.is_empty());

                // Every state in a group has the same core, so comparing against
                // the first member of the group is enough.
                let candidate_index = state_group.iter().next().unwrap();
                let candidate_state = &self.states[*candidate_index];

                if state.items.len() == candidate_state.items.len() {
                    let mut can_be_merged = true;
                    for item in state.items.iter() {
                        let mut candidate_state_has_same_item_without_lookahead = false;
                        for candidate_item in candidate_state.items.iter() {
                            if item.dot == candidate_item.dot && item.rule == candidate_item.rule {
                                candidate_state_has_same_item_without_lookahead = true;
                                break;
                            }
                        }
                        if !candidate_state_has_same_item_without_lookahead {
                            can_be_merged = false;
                            break;
                        }
                    }
                    if can_be_merged {
                        group = Some(state_group);
                        // Cores partition the states, so at most one group can
                        // match. Stop scanning the remaining groups.
                        break;
                    }
                }
            }
            match group {
                Some(group) => {
                    group.insert(state_index);
                },
                None => {
                    state_groups.push(IndexSet::from([state_index]));
                },
            }
        }

        // Now we'll compute the mapping from the old states to the new states.
        // In the end we want `state_map` to be something like:
        // {
        //     0: 0,  -> Original state 0 will become the new state 0
        //     1: 1,  -> Original state 1 will become the new state 1
        //     2: 2,  -> Original state 2 will become the new state 2
        //     3: 3,  -> Original state 3 will become the new state 3
        //     4: 4,  -> Original state 4 will become the new state 4
        //     5: 5,  -> Original state 5 will become the new state 5
        //     6: 3,  -> Original state 6 will become the new state 3
        //     7: 4,  -> Original state 7 will become the new state 4
        //     8: 5,  -> Original state 8 will become the new state 5
        //     9: 2,  -> Original state 9 will become the new state 2
        //     10: 6, -> Original state 10 will become the new state 6
        //     11: 7, -> Original state 11 will become the new state 7
        //     12: 8, -> Original state 12 will become the new state 8
        //     13: 6, -> Original state 13 will become the new state 6
        //     14: 7, -> Original state 14 will become the new state 7
        //     15: 8, -> Original state 15 will become the new state 8
        // }
        let mut state_map = BTreeMap::<usize, usize>::new();
        for (new_state_index, state_group) in state_groups.iter().enumerate() {
            for old_state_index in state_group.iter().copied() {
                state_map.insert(old_state_index, new_state_index);
            }
        }

        // Finally, we compute the new states.
        let mut new_states = Vec::<State>::with_capacity(state_groups.len());
        for (id, state_group) in state_groups.into_iter().enumerate() {
            // We'll create a new state for each group in `state_groups`.

            // We make sure that the group is not empty, which shouldn't happen.
            assert!(!state_group.is_empty());

            // Get an iterator of the indices of the states to merge.
            let mut state_indices = state_group.into_iter();

            // Create the new state from the first original state.
            let mut new_state = self.states[state_indices.next().unwrap()].clone();

            // Set the id of the state to the index of the group.
            new_state.id = id;

            // Update the transitions of the new state according to `state_map`.
            for next_state in new_state.transitions.values_mut() {
                *next_state = state_map[next_state];
            }

            // Merge the new state with other states in the group.
            for state_index in state_indices {
                // Get the state to merge.
                let state_to_merge = &self.states[state_index];

                // Make sure the state is merged into the correct state.
                assert_eq!(state_map[&state_to_merge.id], id);

                // Make sure the transitions of the state are the same as the new state.
                for (atomic_pattern, next_state) in state_to_merge.transitions.iter() {
                    assert!(new_state.transitions.contains_key(atomic_pattern));
                    assert_eq!(new_state.transitions[atomic_pattern], state_map[next_state])
                }

                // Extend the lookahead of the items of the new state.
                for item in state_to_merge.items.iter() {
                    let mut merged = false;
                    for new_item in new_state.items.iter_mut() {
                        if new_item.dot == item.dot && new_item.rule == item.rule {
                            new_item.lookahead.extend(item.lookahead.iter().cloned());
                            merged = true;
                            break;
                        }
                    }
                    // Make sure the item existed in both states.
                    assert!(merged);
                }
            }

            // Add the merged state to the new states.
            new_states.push(new_state);
        }

        // Create the LALR(1) automaton using the new states.
        Automaton { states: new_states }
    }
}

impl Automaton {
/// Gets the states of the automaton.
pub fn states(&self) -> &[State] {
Expand Down
2 changes: 1 addition & 1 deletion src/grammar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ impl Display for Rule {


/// Grammar of a language.
#[derive(Debug)]
#[derive(Clone, Debug)]
pub struct Grammar {
symbols: IndexSet<Symbol>,
start_symbol: Symbol,
Expand Down
Loading

0 comments on commit 76ade26

Please sign in to comment.