Remove token value #267

Merged (5 commits) on Oct 9, 2024
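The core change: lexer tokens no longer carry a value field. A token is just a kind plus a byte span, and callers recover the text by slicing the source, as in enderpy_token.to_string(lexer.source) and token.display_token(test_input) in the hunks below. A minimal standalone sketch of the idea, not the actual enderpy types (the real Token lives in parser/src/token.rs; the u32 offsets match the diff, the rest is illustrative):

// Span-based token: kind + offsets, no owned text.
#[derive(Debug, Clone, Copy, PartialEq)]
enum Kind {
    Identifier,
    Integer,
}

#[derive(Debug, Clone, Copy)]
struct Token {
    kind: Kind,
    start: u32, // byte offset into the source
    end: u32,   // exclusive end offset
}

impl Token {
    // Recover the token text on demand by slicing the source.
    fn text<'a>(&self, source: &'a str) -> &'a str {
        &source[self.start as usize..self.end as usize]
    }
}

fn main() {
    let source = "answer = 42";
    let ident = Token { kind: Kind::Identifier, start: 0, end: 6 };
    let number = Token { kind: Kind::Integer, start: 9, end: 11 };
    assert_eq!(ident.text(source), "answer");
    assert_eq!(number.text(source), "42");
    println!("{:?} -> {}", number.kind, number.text(source));
}

The trade-off: tokens shrink and the lexer skips per-token string allocations, at the cost of threading the source through wherever token text is needed.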
2 changes: 1 addition & 1 deletion benchmark/benches/parser_benchmark.rs
@@ -56,7 +56,7 @@ pub fn benchmark_parser(c: &mut Criterion) {
         &source,
         |b, source| {
             b.iter(|| {
-                let mut parser = Parser::new(source, path);
+                let mut parser = Parser::new(source);
                 parser.parse().unwrap();

                 0
10 changes: 9 additions & 1 deletion compat/src/lexer_compat.rs
@@ -430,7 +430,7 @@ fn check_tokens_match(
     }

     let python_token_value = python_token.value.clone();
-    let enderpy_token_value = enderpy_token.value.to_string();
+    let enderpy_token_value = enderpy_token.to_string(lexer.source);
     // The Python tokenizer sets values in a number of places where Enderpy simply relies
     // on kind to assume value. Handle those cases here.
    let value_matches = matches_python_name_token(python_token.value.as_str(), &enderpy_token.kind)
@@ -507,6 +507,8 @@ fn matches_python_name_token(python_token_value: &str, token_kind: &Kind) -> bool {
         "while" => token_kind == &Kind::While,
         "with" => token_kind == &Kind::With,
         "yield" => token_kind == &Kind::Yield,
+        "match" => token_kind == &Kind::Match,
+        "type" => token_kind == &Kind::Type,
         _ => token_kind == &Kind::Identifier,
     }
 }
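Note: match and type are soft keywords in Python (PEP 634, PEP 695), so CPython's tokenizer reports them as plain NAME tokens, while enderpy now lexes them as dedicated kinds. A self-contained sketch of the bridging check above, with Kind cut down to three variants for brevity:

#[derive(Debug, PartialEq)]
enum Kind {
    Match,
    Type,
    Identifier,
}

// A CPython NAME token matches either the corresponding
// soft-keyword kind or a plain identifier.
fn matches_python_name_token(python_token_value: &str, token_kind: &Kind) -> bool {
    match python_token_value {
        "match" => token_kind == &Kind::Match,
        "type" => token_kind == &Kind::Type,
        _ => token_kind == &Kind::Identifier,
    }
}

fn main() {
    assert!(matches_python_name_token("match", &Kind::Match));
    assert!(matches_python_name_token("type", &Kind::Type));
    assert!(matches_python_name_token("foo", &Kind::Identifier));
}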
@@ -903,6 +905,12 @@ print(a)
     ]);
 }

+// TODO: fstring middle offset is wrong in case of {{ or }}
+#[test]
+fn test_fstring_positions() {
+    python_tokenize_test_lexer(&["f\"{{{', '.join(dict_items)}}}\""]);
+}
+
 #[test]
 #[should_panic]
 fn test_lex_unterminated_string_double_quotes() {
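Note on the TODO in this new test: the old lexer computed an FStringMiddle token's end as start plus the length of the unescaped value (parse_token_value, deleted below, collapsed {{ to { and }} to }), so doubled braces made the span shorter than the raw source text. The new end = self.current always covers the raw text. A small illustration of the off-by-N in plain Rust arithmetic, not enderpy code (the offsets are made up):

fn main() {
    // Middle text as written inside an f-string: three bytes of source.
    let raw = "{{x";
    // The old code unescaped doubled braces before measuring length.
    let unescaped = raw.replace("{{", "{").replace("}}", "}");
    let start = 2u32; // assumed offset of the middle part in the f-string
    let old_end = start + unescaped.len() as u32; // 4: one byte short
    let new_end = start + raw.len() as u32; // 5: matches the raw source
    assert_eq!(old_end, 4);
    assert_eq!(new_end, 5);
    println!("old end: {old_end}, new end: {new_end}");
}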
2 changes: 1 addition & 1 deletion compat/src/parser_compat.rs
@@ -110,7 +110,7 @@ fn remove_unimplemented_attributes(value: &mut Value) {
 }

 fn parse_enderpy_source(source: &str) -> Result<Value> {
-    let mut parser = Parser::new(source, "string");
+    let mut parser = Parser::new(source);
     let typed_ast = parser.parse().into_diagnostic()?;
     let ast = typed_ast.as_python_compat(&parser);
     Ok(ast)
5 changes: 2 additions & 3 deletions enderpy/src/main.rs
@@ -91,13 +91,12 @@ fn tokenize() -> Result<()> {
         let (start_line_num, start_line_column, end_line_num, end_line_column) =
             get_row_col_position(token.start, token.end, &lexer.line_starts);
         println!(
-            "{}-{}, {}-{}: {} {} {} {}",
+            "{}-{}, {}-{}: {} {} {}",
             start_line_num,
             start_line_column,
             end_line_num,
             end_line_column,
             token.kind,
-            token.value,
             token.start,
             token.end,
         );
@@ -108,7 +107,7 @@
 fn parse(file: &PathBuf) -> Result<()> {
     let source = fs::read_to_string(file).into_diagnostic()?;
     let file_path = file.to_str().unwrap_or("");
-    let mut parser = Parser::new(&source, file_path);
+    let mut parser = Parser::new(&source);
     let ast = parser.parse();
     println!("{:#?}", ast);
     Ok(())
24 changes: 0 additions & 24 deletions parser/src/error.rs
@@ -1,8 +1,6 @@
 use miette::Diagnostic;
 use thiserror::Error;

-use crate::parser::parser::Parser;
-
 #[derive(Error, Diagnostic, Debug, Clone)]
 pub enum ParsingError {
     #[error("Invalid syntax")]
@@ -16,28 +14,6 @@ pub enum ParsingError {
     },
 }

-impl From<Parser<'_>> for ParsingError {
-    fn from(err: Parser) -> Self {
-        let token = err.cur_token();
-        ParsingError::InvalidSyntax {
-            msg: token.value.to_string(),
-            advice: String::default(),
-            span: err.get_span_on_line(token.start, token.end),
-        }
-    }
-}
-
-impl From<&mut Parser<'_>> for ParsingError {
-    fn from(err: &mut Parser) -> Self {
-        let token = err.cur_token();
-        ParsingError::InvalidSyntax {
-            msg: token.value.to_string(),
-            advice: String::default(),
-            span: err.get_span_on_line(token.start, token.end),
-        }
-    }
-}
-
 #[derive(Error, Debug)]
 pub enum LexError {
     #[error("String not terminated")]
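Note: these From<Parser> conversions read token.value, which no longer exists, and the PR deletes them without a direct replacement in this file. A hedged sketch of one way an InvalidSyntax error could now be built from a span (the helper name and the (u32, u32) span shape are assumptions, not code from this PR):

#[derive(Debug)]
enum ParsingError {
    InvalidSyntax { msg: String, advice: String, span: (u32, u32) },
}

// Hypothetical helper: slice the offending token text out of the source.
fn invalid_syntax(source: &str, start: u32, end: u32) -> ParsingError {
    ParsingError::InvalidSyntax {
        msg: source[start as usize..end as usize].to_string(),
        advice: String::default(),
        span: (start, end),
    }
}

fn main() {
    let source = "def f(:";
    let err = invalid_syntax(source, 6, 7); // the stray ':'
    println!("{err:?}");
}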
66 changes: 9 additions & 57 deletions parser/src/lexer/mod.rs
@@ -3,7 +3,7 @@ use unicode_id_start::{is_id_continue, is_id_start};
 use crate::{
     error::LexError,
     get_row_col_position,
-    token::{Kind, Token, TokenValue},
+    token::{Kind, Token},
 };

 #[derive(Debug, Clone, Copy, PartialEq)]
@@ -101,7 +101,6 @@ impl<'a> Lexer<'a> {
                 self.next_token_is_dedent -= 1;
                 return Token {
                     kind: Kind::Dedent,
-                    value: TokenValue::None,
                     start: self.current,
                     end: self.current,
                 };
@@ -138,22 +137,12 @@
         if kind != Kind::Comment && kind != Kind::NL && kind != Kind::Dedent {
             self.non_logical_line_state = kind == Kind::NewLine;
         }
-        let value = self.parse_token_value(kind, start);
-        let end = match kind {
-            Kind::FStringMiddle => start + value.as_str().expect("").len() as u32,
-            _ => self.current,
-        };
-
+        let end = self.current;
         if kind == Kind::Dedent {
             start = end
         }

-        Token {
-            kind,
-            value,
-            start,
-            end,
-        }
+        Token { kind, start, end }
     }

     // peek_token is a side-effect free version of next_token
@@ -750,6 +739,8 @@ impl<'a> Lexer<'a> {
             "while" => Kind::While,
             "with" => Kind::With,
             "yield" => Kind::Yield,
+            "match" => Kind::Match,
+            "type" => Kind::Type,
             _ => Kind::Identifier,
         }
     }
@@ -1053,43 +1044,6 @@ impl<'a> Lexer<'a> {
         }
     }

-    fn parse_token_value(&mut self, kind: Kind, start: u32) -> TokenValue {
-        let kind_value = &self.source[start as usize..self.current as usize];
-        match kind {
-            Kind::Integer
-            | Kind::Hexadecimal
-            | Kind::Binary
-            | Kind::PointFloat
-            | Kind::Octal
-            | Kind::ExponentFloat
-            | Kind::ImaginaryInteger
-            | Kind::ImaginaryExponentFloat
-            | Kind::ImaginaryPointFloat => TokenValue::Number(kind_value.to_string()),
-            Kind::Identifier => match kind_value {
-                "type" => TokenValue::Type,
-                "match" => TokenValue::Match,
-                _ => TokenValue::Str(kind_value.to_string()),
-            },
-            Kind::StringLiteral
-            | Kind::FStringStart
-            | Kind::FStringEnd
-            | Kind::RawBytes
-            | Kind::RawFStringStart
-            | Kind::Bytes
-            | Kind::Unicode
-            | Kind::Comment => TokenValue::Str(kind_value.to_string()),
-            Kind::FStringMiddle => {
-                let value = kind_value.replace("{{", "{");
-                let value = value.replace("}}", "}");
-                TokenValue::Str(value)
-            }
-            Kind::Dedent => TokenValue::Indent(1),
-            Kind::Indent => TokenValue::Indent(1),
-            Kind::Error => TokenValue::Str(kind_value.to_string()),
-            _ => TokenValue::None,
-        }
-    }
-
     fn f_string_quote_count(&mut self, str_start: char) -> u8 {
         let mut count = 1;
         if self.peek() == Some(str_start) && self.double_peek() == Some(str_start) {
@@ -1115,15 +1069,14 @@ mod tests {

     fn snapshot_test_lexer_and_errors(test_case: &str) {
         let mut lexer = Lexer::new(test_case);
-        let mut tokens = vec![];
         let mut snapshot = String::from("");
         loop {
             let token = lexer.next_token();
             if token.kind == Kind::Eof {
                 break;
             }
-            snapshot += format!("{}\n", token).as_str();
-            tokens.push(token);
+            snapshot += token.display_token(test_case).as_str();
+            snapshot += "\n";
         }
         let mut settings = insta::Settings::clone_current();
         settings.set_snapshot_path("../../test_data/output/");
@@ ... @@
     fn snapshot_test_lexer(snap_name: &str, inputs: &[&str]) -> Result<(), LexError> {
         for (i, test_input) in inputs.iter().enumerate() {
             let mut lexer = Lexer::new(test_input);
-            let mut tokens = vec![];
             let mut snapshot = String::from("");
             loop {
                 let token = lexer.next_token();
                 if token.kind == Kind::Eof {
                     break;
                 }
-                snapshot += format!("{}\n", token).as_str();
-                tokens.push(token);
+                snapshot += token.display_token(test_input).as_str();
+                snapshot += "\n";
             }
             let mut settings = insta::Settings::clone_current();
             settings.set_snapshot_suffix(format!("{snap_name}-{i}"));
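Note: the snapshot helpers now pass the raw source into display_token, since a token no longer knows its own text. The real implementation is not shown in this diff; the sketch below is a guess at its shape, inferred only from the call sites (the signature and the output format are assumptions):

#[derive(Debug, Clone, Copy)]
enum Kind {
    Identifier,
}

struct Token {
    kind: Kind,
    start: u32,
    end: u32,
}

impl Token {
    // Assumed shape: render the span, kind, and recovered source text.
    fn display_token(&self, source: &str) -> String {
        let text = &source[self.start as usize..self.end as usize];
        format!("{},{}: {:?} {:?}", self.start, self.end, self.kind, text)
    }
}

fn main() {
    let source = "x = 1";
    let tok = Token { kind: Kind::Identifier, start: 0, end: 1 };
    assert_eq!(tok.display_token(source), "0,1: Identifier \"x\"");
}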
15 changes: 4 additions & 11 deletions parser/src/parser/mod.rs
@@ -11,31 +11,24 @@ use crate::{
     parser::ast::{Expression, JoinedStr},
 };
 pub fn is_at_compound_statement(token: &Token) -> bool {
-    let kind_is_statement = match token.kind {
+    match token.kind {
         Kind::If
         | Kind::While
         | Kind::For
         | Kind::Try
         | Kind::With
         | Kind::Def
         | Kind::Class
+        | Kind::Type
+        | Kind::Match
         // Decorator
         | Kind::MatrixMul
         | Kind::Async => true,
         _ => false,
-    };
-    if kind_is_statement {
-        return true;
-    }
-
-    // Match is a soft keyword so it's an identifier token
-    if Kind::Identifier == token.kind && token.value.to_string() == "match" {
-        return true;
-    }
-
-    false
+    }
 }

 // TODO: performance
 pub fn extract_string_inside(val: String) -> String {
     let delimiters = vec!["\"\"\"", "\"", "'''", "'"];
     let mut result = String::new();
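Note: with Match and Type as first-class kinds, the old two-step check (a keyword match plus a soft-keyword probe on token.value) collapses into a single match, as the hunk above shows. A reduced sketch of the resulting shape (Kind trimmed to a few variants; not the real enum):

#[derive(Debug, Clone, Copy)]
enum Kind {
    If,
    Def,
    Class,
    Match,
    Type,
    MatrixMul, // `@`, which can open a decorator
    Identifier,
}

// One pattern now covers every compound-statement opener,
// including the former soft keywords.
fn is_at_compound_statement(kind: Kind) -> bool {
    matches!(
        kind,
        Kind::If | Kind::Def | Kind::Class | Kind::Match | Kind::Type | Kind::MatrixMul
    )
}

fn main() {
    assert!(is_at_compound_statement(Kind::Match));
    assert!(!is_at_compound_statement(Kind::Identifier));
}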