diff --git a/Cargo.toml b/Cargo.toml index ef8a3d8..462a458 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ authors = [ ] [dependencies] +clap = { version = "4.5", optional = true, features = ["derive"] } colored = { version = "2.1" } dirs = { version = "5.0", optional = true } indexmap = { version = "2.3" } @@ -29,7 +30,7 @@ criterion = { version = "0.5", features = ["html_reports"] } [features] default = ["repl"] -repl = ["dirs", "rustyline"] +repl = ["clap", "dirs", "rustyline"] [profile.release] lto = "fat" diff --git a/README.md b/README.md index 1bf0090..f261d1a 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,7 @@ * [5) Constructing ACTION and GOTO tables](#5-constructing-action-and-goto-tables) * [6) Tokenizing the input](#6-tokenizing-the-input) * [7) Parsing the tokens](#7-parsing-the-tokens) +* [Can I have an LALR(1) parser instead of an LR(1) parser?](#can-i-have-an-lalr1-parser-instead-of-an-lr1-parser) * [Any benchmarks?](#any-benchmarks) * [Can I modify it?](#can-i-modify-it) * [Which resources did you use when creating this?](#which-resources-did-you-use-when-creating-this) @@ -307,7 +308,7 @@ fn main() { return; } }; - let parser = match Parser::new(grammar) { + let parser = match Parser::lr(grammar) { Ok(parser) => parser, Err(error) => { eprintln!("parser error: {}", error); @@ -966,6 +967,68 @@ E └─ 1 ``` +## Can I have an LALR(1) parser instead of an LR(1) parser? + +Yes, `dotlr` supports both LR(1) and LALR(1) parsers! + +In the CLI, you can simply use the `--lalr` option: + +```shell +dotlr --lalr grammar.lr "1 + 2 * 3" +``` + +And in the API, you can use `Parser::lalr` instead of `Parser::lr`: + +```rust +Parser::lalr(grammar) +``` + +LALR(1) parser construction is very similar to LR(1) parser construction. 
The only difference is +that after [4) Constructing the LR(1) automaton](#4-constructing-the-lr1-automaton), there is +another step to reduce the LR(1) automaton to an LALR(1) automaton, using the following algorithm: + +```python +# Iterate over the state pairs of the automaton. +for state1 in automaton.states: + for state2 in automaton.states: + # Check if the states share the same core. + # Which means their items are the same ignoring the lookaheads. + + # Here is an example: + # ... + # +-------+------------------------+--------------+---------------+ + # | 3 | T -> %id . '(' E ')' | { $, '+' } | '(' -> 4 | + # | | T -> %id . | { $, '+' } | | + # +-------+------------------------+--------------+---------------+ + # ... + # +-------+------------------------+--------------+---------------+ + # | 6 | T -> %id . '(' E ')' | { ')', '+' } | '(' -> 7 | + # | | T -> %id . | { ')', '+' } | | + # +-------+------------------------+--------------+---------------+ + # ... + + if state1.core == state2.core: + # Merge the states. + # Which is combining lookaheads of the same items. + # Transitions should be mapped to the new states as well. + + # Here is the merge of the two states in the example above: + # ... + # +-------+------------------------+-----------------+--------------+ + # | 3 | T -> %id . '(' E ')' | { $, '+', ')' } | '(' -> 4 | + # | | T -> %id . | { $, '+', ')' } | | + # +-------+------------------------+-----------------+--------------+ + # ... + + automaton.merge_states(state1, state2) +``` + +the actual implementation is a bit more involved, but the idea is exactly this. +Luckily, it's documented extensively at +[automaton.rs](https://github.com/umut-sahin/dotlr/blob/main/src/automaton.rs) +(search for `to_lalr`). I highly recommend reading the comments in the +source to understand the nuances of the implementation. + ## Any benchmarks? 
Yes, even though `dotlr` isn't a performance focused project, I thought it'd be interesting to have @@ -983,13 +1046,27 @@ This command prints the following in my own computer with an `Intel i7-12700K` C ``` ... -Parsing JSON/Simple time: [262.04 ms 263.31 ms 264.60 ms] - thrpt: [94.218 MiB/s 94.680 MiB/s 95.138 MiB/s] +Parsing JSON/Simple LR(1) + time: [260.33 ms 265.74 ms 269.29 ms] + thrpt: [92.578 MiB/s 93.815 MiB/s 95.765 MiB/s] + +... + +Parsing JSON/Simple LALR(1) + time: [287.93 ms 288.71 ms 289.49 ms] + thrpt: [86.119 MiB/s 86.350 MiB/s 86.583 MiB/s] + +... + +Parsing JSON/Optimized LR(1) + time: [211.55 ms 211.71 ms 211.90 ms] + thrpt: [117.65 MiB/s 117.76 MiB/s 117.85 MiB/s] ... -Parsing JSON/Optimized time: [181.44 ms 181.63 ms 181.82 ms] - thrpt: [137.11 MiB/s 137.26 MiB/s 137.40 MiB/s] +Parsing JSON/Optimized LALR(1) + time: [192.66 ms 193.53 ms 194.39 ms] + thrpt: [128.25 MiB/s 128.82 MiB/s 129.40 MiB/s] ... ``` diff --git a/benches/parsing_json.rs b/benches/parsing_json.rs index a31e6fe..9b280dd 100644 --- a/benches/parsing_json.rs +++ b/benches/parsing_json.rs @@ -24,17 +24,26 @@ fn benchmark_parsing_json(criterion: &mut Criterion) { for (name, definition) in grammars { let grammar = Grammar::parse(definition).unwrap(); - let parser = Parser::new(grammar).unwrap(); - let tokens = parser.tokenize(input).unwrap(); - group.bench_function(name, |b| { - b.iter_batched( - || tokens.clone(), - |tokens| { - criterion::black_box(parser.parse(tokens).unwrap()); + for lalr in [false, true] { + let parser = if lalr { + Parser::lalr(grammar.clone()).unwrap() + } else { + Parser::lr(grammar.clone()).unwrap() + }; + let tokens = parser.tokenize(input).unwrap(); + group.bench_function( + format!("{} {}(1)", name, if lalr { "LALR" } else { "LR" }), + |b| { + b.iter_batched( + || tokens.clone(), + |tokens| { + criterion::black_box(parser.parse(tokens)).unwrap(); + }, + BatchSize::PerIteration, + ); }, - BatchSize::PerIteration, ); - }); + } } } diff --git 
a/examples/calculator.rs b/examples/calculator.rs index a374381..aae85be 100644 --- a/examples/calculator.rs +++ b/examples/calculator.rs @@ -18,7 +18,7 @@ fn main() -> ExitCode { let grammar_string = include_str!("../assets/grammars/correct/calculator.lr"); let grammar = Grammar::parse(grammar_string).expect("invalid grammar"); - let parser = Parser::new(grammar).expect("unsupported grammar"); + let parser = Parser::lr(grammar).expect("unsupported grammar"); match args.next() { Some(input) => calculate(&parser, &input), diff --git a/examples/json.rs b/examples/json.rs index 432c6e7..a499022 100644 --- a/examples/json.rs +++ b/examples/json.rs @@ -143,7 +143,7 @@ fn main() { let grammar_string = include_str!("../assets/grammars/correct/json.lr"); let grammar = Grammar::parse(grammar_string).expect("invalid grammar"); - let parser = Parser::new(grammar).expect("unsupported grammar"); + let parser = Parser::lalr(grammar).expect("unsupported grammar"); let input = include_str!("../assets/data/sample.json"); let tokens = parser.tokenize(input).expect("tokenization failed"); diff --git a/src/automaton.rs b/src/automaton.rs index bb990c5..1729bc2 100644 --- a/src/automaton.rs +++ b/src/automaton.rs @@ -59,7 +59,7 @@ impl Display for Item { /// State of an LR(1) automaton. -#[derive(Debug, Default, Eq)] +#[derive(Clone, Debug, Default, Eq)] pub struct State { id: usize, items: SmallVec<[Item; 2]>, @@ -254,6 +254,149 @@ impl Automaton { } } +impl Automaton { + /// Converts the LR(1) automaton into an LALR(1) automaton. + pub fn to_lalr(self) -> Automaton { + // We'll start by computing the states that share the same core. + // Core of a state is its items without the lookahead. 
+ // In the end we want `state_groups` to be something like: + // [ + // { 0 }, -> New state 0 will be the copy of the original state 0 + // { 1 }, -> New state 1 will be the copy of the original state 1 + // { 2, 9 }, -> New state 2 will be the merge of the original states 2 and 9 + // { 3, 6 }, -> New state 3 will be the merge of the original states 3 and 6 + // { 4, 7 }, -> New state 4 will be the merge of the original states 4 and 7 + // { 5, 8 }, -> New state 5 will be the merge of the original states 5 and 8 + // { 10, 13 }, -> New state 6 will be the merge of the original states 10 and 13 + // { 11, 14 }, -> New state 7 will be the merge of the original states 11 and 14 + // { 12, 15 }, -> New state 8 will be the merge of the original states 12 and 15 + // ] + let mut state_groups = Vec::>::new(); + for (state_index, state) in self.states.iter().enumerate() { + let mut group = None; + for state_group in state_groups.iter_mut() { + assert!(!state_group.is_empty()); + + let candidate_index = state_group.iter().next().unwrap(); + let candidate_state = &self.states[*candidate_index]; + + if state.items.len() == candidate_state.items.len() { + let mut can_be_merged = true; + for item in state.items.iter() { + let mut candidate_state_has_same_item_without_lookahead = false; + for candidate_item in candidate_state.items.iter() { + if item.dot == candidate_item.dot && item.rule == candidate_item.rule { + candidate_state_has_same_item_without_lookahead = true; + break; + } + } + if !candidate_state_has_same_item_without_lookahead { + can_be_merged = false; + break; + } + } + if can_be_merged { + group = Some(state_group); + } + } + } + match group { + Some(group) => { + group.insert(state_index); + }, + None => { + state_groups.push(IndexSet::from([state_index])); + }, + } + } + + // Now we'll compute the mapping from the old states to the new states. 
+ // In the end we want `state_map` to be something like: + // { + // 0: 0, -> Original state 0 will become the new state 0 + // 1: 1, -> Original state 1 will become the new state 1 + // 2: 2, -> Original state 2 will become the new state 2 + // 3: 3, -> Original state 3 will become the new state 3 + // 4: 4, -> Original state 4 will become the new state 4 + // 5: 5, -> Original state 5 will become the new state 5 + // 6: 3, -> Original state 6 will become the new state 3 + // 7: 4, -> Original state 7 will become the new state 4 + // 8: 5, -> Original state 8 will become the new state 5 + // 9: 2, -> Original state 9 will become the new state 2 + // 10: 6, -> Original state 10 will become the new state 6 + // 11: 7, -> Original state 11 will become the new state 7 + // 12: 8, -> Original state 12 will become the new state 8 + // 13: 6, -> Original state 13 will become the new state 6 + // 14: 7, -> Original state 14 will become the new state 7 + // 15: 8, -> Original state 15 will become the new state 8 + // } + let mut state_map = BTreeMap::::new(); + for (new_state_index, state_group) in state_groups.iter().enumerate() { + for old_state_index in state_group.iter().copied() { + state_map.insert(old_state_index, new_state_index); + } + } + + // Finally, we compute the new states. + let mut new_states = Vec::::with_capacity(state_groups.len()); + for (id, state_group) in state_groups.into_iter().enumerate() { + // We'll create a new state for each group in `state_groups`. + + // We make sure that the group is not empty, which shouldn't happen. + assert!(!state_group.is_empty()); + + // Get an iterator of the indices of the states to merge. + let mut state_indices = state_group.into_iter(); + + // Create the new state from the first original state. + let mut new_state = self.states[state_indices.next().unwrap()].clone(); + + // Set the id of the state to the index of the group. 
+ new_state.id = id; + + // Update the transitions of the new state according to `state_map`. + for next_state in new_state.transitions.values_mut() { + *next_state = state_map[next_state]; + } + + // Merge the new state with other states in the group. + for state_index in state_indices { + // Get the state to merge. + let state_to_merge = &self.states[state_index]; + + // Make sure the state is merged into the correct state. + assert_eq!(state_map[&state_to_merge.id], id); + + // Make sure the transitions of the state are the same as the new state. + for (atomic_pattern, next_state) in state_to_merge.transitions.iter() { + assert!(new_state.transitions.contains_key(atomic_pattern)); + assert_eq!(new_state.transitions[atomic_pattern], state_map[next_state]) + } + + // Extend the lookahead of the items of the new state. + for item in state_to_merge.items.iter() { + let mut merged = false; + for new_item in new_state.items.iter_mut() { + if new_item.dot == item.dot && new_item.rule == item.rule { + new_item.lookahead.extend(item.lookahead.iter().cloned()); + merged = true; + break; + } + } + // Make sure the item existed in both states. + assert!(merged); + } + } + + // Add the merged state to the new states. + new_states.push(new_state); + } + + // Create the LALR(1) automaton using the new states. + Automaton { states: new_states } + } +} + impl Automaton { /// Gets the states of the automaton. pub fn states(&self) -> &[State] { diff --git a/src/grammar.rs b/src/grammar.rs index d4ecdd5..5ffe640 100644 --- a/src/grammar.rs +++ b/src/grammar.rs @@ -200,7 +200,7 @@ impl Display for Rule { /// Grammar of a language. 
-#[derive(Debug)] +#[derive(Clone, Debug)] pub struct Grammar { symbols: IndexSet, start_symbol: Symbol, diff --git a/src/main.rs b/src/main.rs index d89dcb7..78762ef 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,5 @@ use { + clap::Parser as Clap, colored::Colorize, dotlr::{ Grammar, @@ -9,49 +10,73 @@ use { error::ReadlineError, DefaultEditor, }, - std::process::ExitCode, + std::{ + path::PathBuf, + process::ExitCode, + }, }; +#[derive(Clap)] +struct Args { + /// Create an LALR(1) parser instead of an LR(1) parser. + #[arg(long)] + lalr: bool, + + /// Grammar to parse. + grammar: PathBuf, + + /// Input to parse. + input: Option, +} + fn main() -> ExitCode { - let mut args = std::env::args().skip(1); + let args = Args::parse(); - let grammar_file = match args.next() { - Some(arg) => arg, - None => { - eprintln!("{} grammar file is not specified", "usage error:".red().bold()); - return ExitCode::FAILURE; - }, - }; - let grammar_string = match std::fs::read_to_string(grammar_file) { + let grammar = match std::fs::read_to_string(args.grammar) { Ok(content) => content, Err(error) => { eprintln!("{} grammar file cannot be read ({})", "io error:".red().bold(), error); return ExitCode::FAILURE; }, }; - let grammar = match Grammar::parse(&grammar_string) { + let grammar = match Grammar::parse(&grammar) { Ok(grammar) => grammar, Err(error) => { eprintln!("{} {}", "grammar error:".red().bold(), error); return ExitCode::FAILURE; }, }; - let parser = match Parser::new(grammar) { - Ok(parser) => parser, - Err(error) => { - eprintln!("{} {}", "parser error:".red().bold(), error); - if let ParserError::Conflict { parser, .. } = error { - parser.dump(); + let parser = { + if args.lalr { + match Parser::lalr(grammar) { + Ok(parser) => parser, + Err(error) => { + eprintln!("{} {}", "lalr parser error:".red().bold(), error); + if let ParserError::Conflict { parser, .. 
} = error { + parser.dump(); + } + return ExitCode::FAILURE; + }, } - return ExitCode::FAILURE; - }, + } else { + match Parser::lr(grammar) { + Ok(parser) => parser, + Err(error) => { + eprintln!("{} {}", "lr parser error:".red().bold(), error); + if let ParserError::Conflict { parser, .. } = error { + parser.dump(); + } + return ExitCode::FAILURE; + }, + } + } }; println!(); parser.dump(); println!(); - match args.next() { + match args.input { Some(input) => { println!("{} {}", ">".cyan().bold(), input); parse(&parser, &input) diff --git a/src/parser.rs b/src/parser.rs index 419d7cc..b1b35e3 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -13,36 +13,8 @@ pub struct Parser { impl Parser { /// Crates an LR(1) parser of a grammar. - pub fn new(grammar: Grammar) -> Result { - { - if grammar.rules().is_empty() { - return Err(ParserError::EmptyGrammar); - } - for rule in grammar.rules() { - for atomic_pattern in rule.pattern() { - match atomic_pattern { - AtomicPattern::Symbol(symbol) => { - if !grammar.symbols().contains(symbol) { - return Err(ParserError::UndefinedSymbol { - symbol: symbol.clone(), - rule: rule.clone(), - }); - } - }, - AtomicPattern::Token(token) => { - if let Token::Regex(regex_token) = token { - if !grammar.regular_expressions().contains_key(regex_token) { - return Err(ParserError::UndefinedRegexToken { - regex_token: regex_token.clone(), - rule: rule.clone(), - }); - } - } - }, - } - } - } - } + pub fn lr(grammar: Grammar) -> Result { + Parser::check_grammar_internal(&grammar)?; let first_table = FirstTable::construct(&grammar); let follow_table = FollowTable::construct(&grammar, &first_table); @@ -50,17 +22,20 @@ impl Parser { let parsing_tables = ParsingTables::construct(&grammar, &follow_table, &automaton)?; let parser = Parser { grammar, first_table, follow_table, automaton, parsing_tables }; - for (state, action_map) in parser.action_table().iter().enumerate() { - for (token, actions) in action_map.iter() { - if actions.len() > 1 { - let 
token = token.clone(); - let parser = Box::new(parser); - return Err(ParserError::Conflict { parser, state, token }); - } - } - } + parser.check_conflicts_internal() + } + + /// Crates an LALR(1) parser of a grammar. + pub fn lalr(grammar: Grammar) -> Result { + Parser::check_grammar_internal(&grammar)?; + + let first_table = FirstTable::construct(&grammar); + let follow_table = FollowTable::construct(&grammar, &first_table); + let automaton = Automaton::construct(&grammar, &first_table).to_lalr(); + let parsing_tables = ParsingTables::construct(&grammar, &follow_table, &automaton)?; - Ok(parser) + let parser = Parser { grammar, first_table, follow_table, automaton, parsing_tables }; + parser.check_conflicts_internal() } } @@ -159,6 +134,52 @@ impl Parser { } impl Parser { + /// Internal grammar checks. + fn check_grammar_internal(grammar: &Grammar) -> Result<(), ParserError> { + if grammar.rules().is_empty() { + return Err(ParserError::EmptyGrammar); + } + for rule in grammar.rules() { + for atomic_pattern in rule.pattern() { + match atomic_pattern { + AtomicPattern::Symbol(symbol) => { + if !grammar.symbols().contains(symbol) { + return Err(ParserError::UndefinedSymbol { + symbol: symbol.clone(), + rule: rule.clone(), + }); + } + }, + AtomicPattern::Token(token) => { + if let Token::Regex(regex_token) = token { + if !grammar.regular_expressions().contains_key(regex_token) { + return Err(ParserError::UndefinedRegexToken { + regex_token: regex_token.clone(), + rule: rule.clone(), + }); + } + } + }, + } + } + } + Ok(()) + } + + /// Internal conflict checks. + fn check_conflicts_internal(self) -> Result { + for (state, action_map) in self.action_table().iter().enumerate() { + for (token, actions) in action_map.iter() { + if actions.len() > 1 { + let token = token.clone(); + let parser = Box::new(self); + return Err(ParserError::Conflict { parser, state, token }); + } + } + } + Ok(self) + } + /// Internal parsing logic. 
fn parse_and_trace_internal<'i>( &self, diff --git a/tests/parser.rs b/tests/parser.rs index 66ed39e..dbd60a8 100644 --- a/tests/parser.rs +++ b/tests/parser.rs @@ -8,6 +8,7 @@ use { Item, Parser, ParserError, + RegexToken, Rule, State, Symbol, @@ -25,7 +26,7 @@ use { fn creating_parser_for_semantically_correct_grammars() { for grammar in common::grammars::CORRECT { let grammar = Grammar::parse(grammar).unwrap(); - assert!(Parser::new(grammar).is_ok()); + assert!(Parser::lr(grammar).is_ok()); } } @@ -33,7 +34,7 @@ fn creating_parser_for_semantically_correct_grammars() { fn failing_to_create_parser_for_semantically_incorrect_grammars() { for grammar in common::grammars::SEMANTICALLY_INCORRECT { let grammar = Grammar::parse(grammar).unwrap(); - assert!(Parser::new(grammar).is_err()); + assert!(Parser::lr(grammar).is_err()); } } @@ -41,28 +42,28 @@ fn failing_to_create_parser_for_semantically_incorrect_grammars() { #[test] fn raising_correct_error_when_creating_parser_for_empty_grammar() { let grammar = Grammar::parse(common::grammars::EMPTY).unwrap(); - let error = Parser::new(grammar).unwrap_err(); + let error = Parser::lr(grammar).unwrap_err(); assert_eq!(error.to_string(), "grammar is empty"); } #[test] fn raising_correct_error_when_creating_parser_for_undefined_symbol_grammar() { let grammar = Grammar::parse(common::grammars::UNDEFINED_SYMBOL).unwrap(); - let error = Parser::new(grammar).unwrap_err(); + let error = Parser::lr(grammar).unwrap_err(); assert_eq!(error.to_string(), "symbol F in rule S -> E '+' F is not defined"); } #[test] fn raising_correct_error_when_creating_parser_for_undefined_regex_token_grammar() { let grammar = Grammar::parse(common::grammars::UNDEFINED_REGEX_TOKEN).unwrap(); - let error = Parser::new(grammar).unwrap_err(); + let error = Parser::lr(grammar).unwrap_err(); assert_eq!(error.to_string(), "regex token %i in rule E -> %i '+' %i is not defined"); } #[test] fn 
raising_correct_error_when_creating_parser_for_shift_reduce_conflict_grammar() { let grammar = Grammar::parse(common::grammars::SHIFT_REDUCE_CONFLICT).unwrap(); - let error = Parser::new(grammar).unwrap_err(); + let error = Parser::lr(grammar).unwrap_err(); let error_string = error.to_string(); if let ParserError::Conflict { parser, token, state } = error { @@ -89,7 +90,7 @@ fn raising_correct_error_when_creating_parser_for_shift_reduce_conflict_grammar( #[test] fn raising_correct_error_when_creating_parser_for_reduce_reduce_conflict_grammar() { let grammar = Grammar::parse(common::grammars::REDUCE_REDUCE_CONFLICT).unwrap(); - let error = Parser::new(grammar).unwrap_err(); + let error = Parser::lr(grammar).unwrap_err(); let error_string = error.to_string(); if let ParserError::Conflict { parser, token, state } = error { @@ -109,11 +110,27 @@ fn raising_correct_error_when_creating_parser_for_reduce_reduce_conflict_grammar } } +#[test] +fn raising_correct_error_when_creating_lalr_parser_for_non_lalr_grammar() { + let grammar = Grammar::parse(common::grammars::NOT_LALR).unwrap(); + let error = Parser::lalr(grammar).unwrap_err(); + + let error_string = error.to_string(); + if let ParserError::Conflict { parser, token, state } = error { + assert_eq!(error_string, format!("conflict at state {} on {}", state, token)); + + let possible_actions = parser.action_table()[state].get(&token); + assert!(possible_actions.is_some()); + + assert!(possible_actions.unwrap().len() >= 2); + } +} + #[test] -fn correctly_creating_parser_for_binary_addition_grammar() { +fn correctly_creating_lr_parser_for_binary_addition_grammar() { let grammar = Grammar::parse(common::grammars::BINARY_ADDITION).unwrap(); - let parser = Parser::new(grammar).unwrap(); + let parser = Parser::lr(grammar).unwrap(); assert_eq!( parser.grammar().to_string().trim(), @@ -636,3 +653,819 @@ B -> '1' ); } } + +#[test] +fn correctly_creating_lalr_parser_for_g10_grammar() { + let grammar = 
Grammar::parse(common::grammars::G10).unwrap(); + let parser = Parser::lalr(grammar).unwrap(); + + assert_eq!( + parser.grammar().to_string().trim(), + r#" + +P -> E +E -> E '+' T +E -> T +T -> %id '(' E ')' +T -> %id + +%id -> /^[A-Za-z][A-Za-z0-9]+/ + + "# + .trim() + ); + + let first_table = parser.first_table(); + { + // +--------+-----------+ + // | Symbol | First Set | + // +--------+-----------+ + // | T | { %id } | + // +--------+-----------+ + // | E | { %id } | + // +--------+-----------+ + // | P | { %id } | + // +--------+-----------+ + + #[rustfmt::skip] + assert_eq!( + *first_table.deref(), + [ + ( + Symbol::from("T"), + [ + RegexToken::from("id").into(), + ] + .into(), + ), + ( + Symbol::from("E"), + [ + RegexToken::from("id").into(), + ] + .into(), + ), + ( + Symbol::from("P"), + [ + RegexToken::from("id").into(), + ] + .into(), + ), + ] + .into_iter() + .collect::>() + ); + } + + let follow_table = parser.follow_table(); + { + // +--------+-----------------+ + // | Symbol | Follow Set | + // +--------+-----------------+ + // | T | { $, '+', ')' } | + // +--------+-----------------+ + // | E | { $, '+', ')' } | + // +--------+-----------------+ + // | P | { $ } | + // +--------+-----------------+ + + #[rustfmt::skip] + assert_eq!( + *follow_table.deref(), + [ + ( + Symbol::from("T"), + [ + Token::Eof, + ConstantToken::from("+").into(), + ConstantToken::from(")").into(), + ] + .into(), + ), + ( + Symbol::from("E"), + [ + Token::Eof, + ConstantToken::from("+").into(), + ConstantToken::from(")").into(), + ] + .into(), + ), + ( + Symbol::from("P"), + [ + Token::Eof, + ] + .into(), + ), + ] + .into_iter() + .collect::>() + ); + } + + let automaton = parser.automaton(); + { + // +-------+------------------------+-----------------+--------------+ + // | State | Items | Lookaheads | Transitions | + // +-------+------------------------+-----------------+--------------+ + // | 0 | P -> . E | { $ } | E -> 1 | + // | | E -> . 
E '+' T | { $, '+' } | T -> 2 | + // | | E -> . T | { $, '+' } | %id -> 3 | + // | | T -> . %id '(' E ')' | { $, '+' } | | + // | | T -> . %id | { $, '+' } | | + // +-------+------------------------+-----------------+--------------+ + // | 1 | P -> E . | { $ } | '+' -> 7 | + // | | E -> E . '+' T | { $, '+' } | | + // +-------+------------------------+-----------------+--------------+ + // | 2 | E -> T . | { $, '+', ')' } | | + // +-------+------------------------+-----------------+--------------+ + // | 3 | T -> %id . '(' E ')' | { $, '+', ')' } | '(' -> 4 | + // | | T -> %id . | { $, '+', ')' } | | + // +-------+------------------------+-----------------+--------------+ + // | 4 | T -> %id '(' . E ')' | { $, '+', ')' } | T -> 2 | + // | | E -> . E '+' T | { ')', '+' } | %id -> 3 | + // | | E -> . T | { ')', '+' } | E -> 5 | + // | | T -> . %id '(' E ')' | { ')', '+' } | | + // | | T -> . %id | { ')', '+' } | | + // +-------+------------------------+-----------------+--------------+ + // | 5 | T -> %id '(' E . ')' | { $, '+', ')' } | ')' -> 6 | + // | | E -> E . '+' T | { ')', '+' } | '+' -> 7 | + // +-------+------------------------+-----------------+--------------+ + // | 6 | T -> %id '(' E ')' . | { ')', '+', $ } | | + // +-------+------------------------+-----------------+--------------+ + // | 7 | E -> E '+' . T | { ')', '+', $ } | %id -> 3 | + // | | T -> . %id '(' E ')' | { ')', '+', $ } | T -> 8 | + // | | T -> . %id | { ')', '+', $ } | | + // +-------+------------------------+-----------------+--------------+ + // | 8 | E -> E '+' T . | { ')', '+', $ } | | + // +-------+------------------------+-----------------+--------------+ + + #[rustfmt::skip] + assert_eq!( + automaton.states(), + [ + // State 0 + State::new( + 0, + [ + // P -> . E | { $ } + Item::new( + Rule::new( + "P", + [ + Symbol::from("E").into(), + ] + ), + 0, + [Token::Eof], + ), + // E -> . 
E '+' T | { $, '+' } + Item::new( + Rule::new( + "E", + [ + Symbol::from("E").into(), + ConstantToken::from("+").into(), + Symbol::from("T").into(), + ] + ), + 0, + [Token::Eof, ConstantToken::from("+").into()], + ), + // E -> . T | { $, '+' } + Item::new( + Rule::new( + "E", + [ + Symbol::from("T").into(), + ] + ), + 0, + [Token::Eof, ConstantToken::from("+").into()], + ), + // T -> . %id '(' E ')' | { $, '+' } + Item::new( + Rule::new( + "T", + [ + RegexToken::from("id").into(), + ConstantToken::from("(").into(), + Symbol::from("E").into(), + ConstantToken::from(")").into(), + ] + ), + 0, + [Token::Eof, ConstantToken::from("+").into()], + ), + // T -> . %id | { $, '+' } + Item::new( + Rule::new( + "T", + [ + RegexToken::from("id").into(), + ] + ), + 0, + [Token::Eof, ConstantToken::from("+").into()], + ), + ], + [ + // E -> 1 + (Symbol::from("E").into(), 1), + // T -> 2 + (Symbol::from("T").into(), 2), + // %id -> 3 + (RegexToken::from("id").into(), 3), + ], + ), + + // State 1 + State::new( + 1, + [ + // P -> E . | { $ } + Item::new( + Rule::new( + "P", + [ + Symbol::from("E").into(), + ] + ), + 1, + [Token::Eof], + ), + // E -> E . '+' T | { $, '+' } + Item::new( + Rule::new( + "E", + [ + Symbol::from("E").into(), + ConstantToken::from("+").into(), + Symbol::from("T").into(), + ] + ), + 1, + [Token::Eof, ConstantToken::from("+").into()], + ), + ], + [ + // '+' -> 7 + (ConstantToken::from("+").into(), 7), + ], + ), + + // State 2 + State::new( + 2, + [ + // E -> T . | { $, '+', ')' } + Item::new( + Rule::new( + "E", + [ + Symbol::from("T").into(), + ] + ), + 1, + [ + Token::Eof, + ConstantToken::from("+").into(), + ConstantToken::from(")").into(), + ], + ), + ], + [], + ), + + // State 3 + State::new( + 3, + [ + // T -> %id . 
'(' E ')' | { $, '+', ')' } + Item::new( + Rule::new( + "T", + [ + RegexToken::from("id").into(), + ConstantToken::from("(").into(), + Symbol::from("E").into(), + ConstantToken::from(")").into(), + ] + ), + 1, + [ + Token::Eof, + ConstantToken::from("+").into(), + ConstantToken::from(")").into(), + ], + ), + // T -> %id . | { $, '+', ')' } + Item::new( + Rule::new( + "T", + [ + RegexToken::from("id").into(), + ] + ), + 1, + [ + Token::Eof, + ConstantToken::from("+").into(), + ConstantToken::from(")").into(), + ], + ), + ], + [ + // '(' -> 4 + (ConstantToken::from("(").into(), 4), + ], + ), + + // State 4 + State::new( + 4, + [ + // T -> %id '(' . E ')' | { $, '+', ')' } + Item::new( + Rule::new( + "T", + [ + RegexToken::from("id").into(), + ConstantToken::from("(").into(), + Symbol::from("E").into(), + ConstantToken::from(")").into(), + ] + ), + 2, + [ + Token::Eof, + ConstantToken::from("+").into(), + ConstantToken::from(")").into(), + ], + ), + // E -> . E '+' T | { ')', '+' } + Item::new( + Rule::new( + "E", + [ + Symbol::from("E").into(), + ConstantToken::from("+").into(), + Symbol::from("T").into(), + ] + ), + 0, + [ConstantToken::from(")").into(), ConstantToken::from("+").into()], + ), + // E -> . T | { ')', '+' } + Item::new( + Rule::new( + "E", + [ + Symbol::from("T").into(), + ] + ), + 0, + [ConstantToken::from(")").into(), ConstantToken::from("+").into()], + ), + // T -> . %id '(' E ')' | { ')', '+' } + Item::new( + Rule::new( + "T", + [ + RegexToken::from("id").into(), + ConstantToken::from("(").into(), + Symbol::from("E").into(), + ConstantToken::from(")").into(), + ] + ), + 0, + [ConstantToken::from(")").into(), ConstantToken::from("+").into()], + ), + // T -> . 
%id | { ')', '+' } + Item::new( + Rule::new( + "T", + [ + RegexToken::from("id").into(), + ] + ), + 0, + [ConstantToken::from(")").into(), ConstantToken::from("+").into()], + ), + ], + [ + // T -> 2 + (Symbol::from("T").into(), 2), + // %id -> 3 + (RegexToken::from("id").into(), 3), + // 'E' -> 5 + (Symbol::from("E").into(), 5), + ], + ), + + // State 5 + State::new( + 5, + [ + // T -> %id '(' E . ')' | { $, '+', ')' } + Item::new( + Rule::new( + "T", + [ + RegexToken::from("id").into(), + ConstantToken::from("(").into(), + Symbol::from("E").into(), + ConstantToken::from(")").into(), + ] + ), + 3, + [ + Token::Eof, + ConstantToken::from("+").into(), + ConstantToken::from(")").into(), + ], + ), + // E -> E . '+' T | { ')', '+' } + Item::new( + Rule::new( + "E", + [ + Symbol::from("E").into(), + ConstantToken::from("+").into(), + Symbol::from("T").into(), + ] + ), + 1, + [ConstantToken::from(")").into(), ConstantToken::from("+").into()], + ), + ], + [ + // ')' -> 6 + (ConstantToken::from(")").into(), 6), + // '+' -> 7 + (ConstantToken::from("+").into(), 7), + ], + ), + + // State 6 + State::new( + 6, + [ + // T -> %id '(' E ')' . | { ')', '+', $ } + Item::new( + Rule::new( + "T", + [ + RegexToken::from("id").into(), + ConstantToken::from("(").into(), + Symbol::from("E").into(), + ConstantToken::from(")").into(), + ] + ), + 4, + [ + ConstantToken::from(")").into(), + ConstantToken::from("+").into(), + Token::Eof, + ], + ), + ], + [], + ), + + // State 7 + State::new( + 7, + [ + // E -> E '+' . T | { ')', '+', $ } + Item::new( + Rule::new( + "E", + [ + Symbol::from("E").into(), + ConstantToken::from("+").into(), + Symbol::from("T").into(), + ] + ), + 2, + [ + ConstantToken::from(")").into(), + ConstantToken::from("+").into(), + Token::Eof, + ], + ), + // T -> . 
%id '(' E ')' | { ')', '+', $ } + Item::new( + Rule::new( + "T", + [ + RegexToken::from("id").into(), + ConstantToken::from("(").into(), + Symbol::from("E").into(), + ConstantToken::from(")").into(), + ] + ), + 0, + [ + ConstantToken::from(")").into(), + ConstantToken::from("+").into(), + Token::Eof, + ], + ), + // T -> . %id | { ')', '+', $ } + Item::new( + Rule::new( + "T", + [ + RegexToken::from("id").into(), + ] + ), + 0, + [ + ConstantToken::from(")").into(), + ConstantToken::from("+").into(), + Token::Eof, + ], + ), + ], + [ + // %id -> 3 + (RegexToken::from("id").into(), 3), + // T -> 8 + (Symbol::from("T").into(), 8), + ], + ), + + // State 8 + State::new( + 8, + [ + // E -> E '+' T . | { ')', '+', $ } + Item::new( + Rule::new( + "E", + [ + Symbol::from("E").into(), + ConstantToken::from("+").into(), + Symbol::from("T").into(), + ] + ), + 3, + [ + ConstantToken::from(")").into(), + ConstantToken::from("+").into(), + Token::Eof, + ], + ), + ], + [], + ), + ] + ); + } + + let action_table = parser.action_table(); + { + // +-------+---------------------------------------+ + // | | Action | + // | State | ------------------------------------- | + // | | '+' '(' ')' %id $ | + // +-------+---------------------------------------+ + // | 0 | - - - s3 - | + // +-------+---------------------------------------+ + // | 1 | s7 - - - a1 | + // +-------+---------------------------------------+ + // | 2 | r3 - r3 - r3 | + // +-------+---------------------------------------+ + // | 3 | r5 s4 r5 - r5 | + // +-------+---------------------------------------+ + // | 4 | - - - s3 - | + // +-------+---------------------------------------+ + // | 5 | s7 - s6 - - | + // +-------+---------------------------------------+ + // | 6 | r4 - r4 - r4 | + // +-------+---------------------------------------+ + // | 7 | - - - s3 - | + // +-------+---------------------------------------+ + // | 8 | r2 - r2 - r2 | + // +-------+---------------------------------------+ + + #[rustfmt::skip] + 
assert_eq!( + action_table, + [ + // State 0 + IndexMap::<Token, IndexSet<Action>>::from_iter( + [ + ( + RegexToken::from("id").into(), + IndexSet::from([Action::Shift { next_state: 3 }]), + ), + ], + ), + // State 1 + IndexMap::<Token, IndexSet<Action>>::from( + [ + ( + ConstantToken::from("+").into(), + IndexSet::from([Action::Shift { next_state: 7 }]), + ), + ( + Token::Eof, + IndexSet::from([Action::Accept { rule_index: 0 }]), + ), + ], + ), + // State 2 + IndexMap::<Token, IndexSet<Action>>::from( + [ + ( + ConstantToken::from("+").into(), + IndexSet::from([Action::Reduce { rule_index: 2 }]), + ), + ( + ConstantToken::from(")").into(), + IndexSet::from([Action::Reduce { rule_index: 2 }]), + ), + ( + Token::Eof, + IndexSet::from([Action::Reduce { rule_index: 2 }]), + ), + ], + ), + // State 3 + IndexMap::<Token, IndexSet<Action>>::from( + [ + ( + ConstantToken::from("+").into(), + IndexSet::from([Action::Reduce { rule_index: 4 }]), + ), + ( + ConstantToken::from("(").into(), + IndexSet::from([Action::Shift { next_state: 4 }]), + ), + ( + ConstantToken::from(")").into(), + IndexSet::from([Action::Reduce { rule_index: 4 }]), + ), + ( + Token::Eof, + IndexSet::from([Action::Reduce { rule_index: 4 }]), + ), + ], + ), + // State 4 + IndexMap::<Token, IndexSet<Action>>::from( + [ + ( + RegexToken::from("id").into(), + IndexSet::from([Action::Shift { next_state: 3 }]), + ), + ], + ), + // State 5 + IndexMap::<Token, IndexSet<Action>>::from( + [ + ( + ConstantToken::from("+").into(), + IndexSet::from([Action::Shift { next_state: 7 }]), + ), + ( + ConstantToken::from(")").into(), + IndexSet::from([Action::Shift { next_state: 6 }]), + ), + ], + ), + // State 6 + IndexMap::<Token, IndexSet<Action>>::from( + [ + ( + ConstantToken::from("+").into(), + IndexSet::from([Action::Reduce { rule_index: 3 }]), + ), + ( + ConstantToken::from(")").into(), + IndexSet::from([Action::Reduce { rule_index: 3 }]), + ), + ( + Token::Eof, + IndexSet::from([Action::Reduce { rule_index: 3 }]), + ), + ], + ), + // State 7 + IndexMap::<Token, IndexSet<Action>>::from( + [ + ( + RegexToken::from("id").into(), + IndexSet::from([Action::Shift { next_state: 3 }]), + ), + ], + ), + // 
State 8 + IndexMap::<Token, IndexSet<Action>>::from( + [ + ( + ConstantToken::from("+").into(), + IndexSet::from([Action::Reduce { rule_index: 1 }]), + ), + ( + ConstantToken::from(")").into(), + IndexSet::from([Action::Reduce { rule_index: 1 }]), + ), + ( + Token::Eof, + IndexSet::from([Action::Reduce { rule_index: 1 }]), + ), + ], + ), + ] + ); + } + + let goto_table = parser.goto_table(); + { + // +-------+-------------------+ + // | | Goto | + // | State | ----------------- | + // | | P E T | + // +-------+-------------------+ + // | 0 | - 1 2 | + // +-------+-------------------+ + // | 1 | - - - | + // +-------+-------------------+ + // | 2 | - - - | + // +-------+-------------------+ + // | 3 | - - - | + // +-------+-------------------+ + // | 4 | - 5 2 | + // +-------+-------------------+ + // | 5 | - - - | + // +-------+-------------------+ + // | 6 | - - - | + // +-------+-------------------+ + // | 7 | - - 8 | + // +-------+-------------------+ + // | 8 | - - - | + // +-------+-------------------+ + + #[rustfmt::skip] + assert_eq!( + goto_table, + [ + // State 0 + IndexMap::<Symbol, usize>::from_iter( + [ + (Symbol::from("E"), 1), + (Symbol::from("T"), 2), + ], + ), + // State 1 + IndexMap::<Symbol, usize>::from_iter( + [ + ], + ), + // State 2 + IndexMap::<Symbol, usize>::from_iter( + [ + ], + ), + // State 3 + IndexMap::<Symbol, usize>::from_iter( + [ + ], + ), + // State 4 + IndexMap::<Symbol, usize>::from_iter( + [ + (Symbol::from("E"), 5), + (Symbol::from("T"), 2), + ], + ), + // State 5 + IndexMap::<Symbol, usize>::from_iter( + [ + ], + ), + // State 6 + IndexMap::<Symbol, usize>::from_iter( + [ + ], + ), + // State 7 + IndexMap::<Symbol, usize>::from_iter( + [ + (Symbol::from("T"), 8), + ], + ), + // State 8 + IndexMap::<Symbol, usize>::from_iter( + [ + ], + ), + ] + ); + } +} diff --git a/tests/parsing.rs b/tests/parsing.rs index 2b17b29..0244a31 100644 --- a/tests/parsing.rs +++ b/tests/parsing.rs @@ -10,7 +10,7 @@ use dotlr::{ #[test] fn raising_correct_error_when_encountering_unknown_token_during_parsing_calculator_grammar() { let grammar = Grammar::parse(common::grammars::CALCULATOR).unwrap(); - let
parser = Parser::new(grammar).unwrap(); + let parser = Parser::lr(grammar).unwrap(); let error = parser.tokenize("a").unwrap_err(); assert_eq!(error.to_string(), "unknown token a"); @@ -19,7 +19,7 @@ fn raising_correct_error_when_encountering_unknown_token_during_parsing_calculat #[test] fn raising_correct_error_when_encountering_unexpected_token_during_parsing_calculator_grammar() { let grammar = Grammar::parse(common::grammars::CALCULATOR).unwrap(); - let parser = Parser::new(grammar).unwrap(); + let parser = Parser::lr(grammar).unwrap(); let tokens = parser.tokenize("1 + /").unwrap(); let error = parser.parse(tokens).unwrap_err(); @@ -29,7 +29,7 @@ fn raising_correct_error_when_encountering_unexpected_token_during_parsing_calcu #[test] fn raising_correct_error_when_encountering_unexpected_eof_during_parsing_calculator_grammar() { let grammar = Grammar::parse(common::grammars::CALCULATOR).unwrap(); - let parser = Parser::new(grammar).unwrap(); + let parser = Parser::lr(grammar).unwrap(); let tokens = parser.tokenize("1 + (2").unwrap(); let error = parser.parse(tokens).unwrap_err(); @@ -43,7 +43,7 @@ fn raising_correct_error_when_encountering_unexpected_eof_during_parsing_calcula #[test] fn correctly_trace_parsing_of_calculator_grammar() { let grammar = Grammar::parse(common::grammars::CALCULATOR).unwrap(); - let parser = Parser::new(grammar).unwrap(); + let parser = Parser::lr(grammar).unwrap(); let expression = "1 + 2 * 3 / (4 ^ 5)"; let tokens = parser.tokenize(expression).unwrap(); @@ -166,9 +166,9 @@ Expr } #[test] -fn correctly_parse_conditional_grammar() { +fn correctly_parsing_conditional_grammar() { let grammar = Grammar::parse(common::grammars::CONDITIONAL).unwrap(); - let parser = Parser::new(grammar).unwrap(); + let parser = Parser::lr(grammar).unwrap(); let expression = "if true { if_case } else { else_case }"; let tokens = parser.tokenize(expression).unwrap(); @@ -198,3 +198,105 @@ Conditional .trim(), ); } + +#[test] +fn 
correctly_parsing_json_grammar_with_lalr() { + let grammar = Grammar::parse(common::grammars::JSON).unwrap(); + let parser = Parser::lalr(grammar).unwrap(); + + let expression = include_str!("../assets/data/sample.json"); + let tokens = parser.tokenize(expression).unwrap(); + + let parse_tree = parser.parse(tokens).unwrap(); + assert_eq!( + parse_tree.to_string().trim(), + r#" + +Json +└─ Value + └─ Object + ├─ { + ├─ ObjectElements + │ ├─ ObjectElements + │ │ ├─ ObjectElements + │ │ │ ├─ ObjectElements + │ │ │ │ ├─ ObjectElements + │ │ │ │ │ ├─ ObjectElements + │ │ │ │ │ │ ├─ String + │ │ │ │ │ │ │ └─ "name" + │ │ │ │ │ │ ├─ : + │ │ │ │ │ │ └─ Value + │ │ │ │ │ │ └─ String + │ │ │ │ │ │ └─ "Sample" + │ │ │ │ │ ├─ , + │ │ │ │ │ ├─ String + │ │ │ │ │ │ └─ "rating" + │ │ │ │ │ ├─ : + │ │ │ │ │ └─ Value + │ │ │ │ │ └─ Number + │ │ │ │ │ └─ 4.2 + │ │ │ │ ├─ , + │ │ │ │ ├─ String + │ │ │ │ │ └─ "homepage" + │ │ │ │ ├─ : + │ │ │ │ └─ Value + │ │ │ │ └─ Null + │ │ │ │ └─ null + │ │ │ ├─ , + │ │ │ ├─ String + │ │ │ │ └─ "is_active" + │ │ │ ├─ : + │ │ │ └─ Value + │ │ │ └─ Boolean + │ │ │ └─ true + │ │ ├─ , + │ │ ├─ String + │ │ │ └─ "tags" + │ │ ├─ : + │ │ └─ Value + │ │ └─ Array + │ │ ├─ [ + │ │ ├─ ArrayElements + │ │ │ ├─ ArrayElements + │ │ │ │ ├─ ArrayElements + │ │ │ │ │ └─ Value + │ │ │ │ │ └─ String + │ │ │ │ │ └─ "a" + │ │ │ │ ├─ , + │ │ │ │ └─ Value + │ │ │ │ └─ String + │ │ │ │ └─ "b" + │ │ │ ├─ , + │ │ │ └─ Value + │ │ │ └─ String + │ │ │ └─ "c" + │ │ └─ ] + │ ├─ , + │ ├─ String + │ │ └─ "metadata" + │ ├─ : + │ └─ Value + │ └─ Object + │ ├─ { + │ ├─ ObjectElements + │ │ ├─ ObjectElements + │ │ │ ├─ String + │ │ │ │ └─ "foo" + │ │ │ ├─ : + │ │ │ └─ Value + │ │ │ └─ String + │ │ │ └─ "bar" + │ │ ├─ , + │ │ ├─ String + │ │ │ └─ "bar" + │ │ ├─ : + │ │ └─ Value + │ │ └─ Number + │ │ └─ 10 + │ └─ } + └─ } + + "# + .trim(), + ); +}