Fix token position according to python tokenize
Glyphack committed Aug 4, 2024
1 parent f48df1e commit 9104f6f
Showing 4 changed files with 74 additions and 32 deletions.
15 changes: 4 additions & 11 deletions enderpy/src/main.rs
@@ -85,20 +85,13 @@ fn tokenize() -> Result<()> {
io::stdin().read_to_string(&mut source).into_diagnostic()?;
}
}
println!("{:?}", source.chars().collect::<Vec<char>>());
let mut lexer = Lexer::new(&source);
let tokens = lexer.lex();
println!("{:?}", lexer.line_starts);
for token in tokens {
let (start_line_num, start_line_offset) =
match lexer.line_starts.binary_search(&token.start) {
Ok(idx) => (idx, lexer.line_starts[idx]),
Err(idx) => (idx - 1, lexer.line_starts[idx - 1]),
};
let start_line_column = token.start - start_line_offset;
let (end_line_num, end_line_offset) = match lexer.line_starts.binary_search(&token.end) {
Ok(idx) => (idx, lexer.line_starts[idx]),
Err(idx) => (idx - 1, lexer.line_starts[idx - 1]),
};
let end_line_column = token.end - end_line_offset;
let (start_line_num, start_line_column, end_line_num, end_line_column) =
token.get_row_col_position(&lexer.line_starts);
println!(
"{}-{}, {}-{}: {} {} {} {}",
start_line_num,
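For context, both call sites now compute positions from the same binary-search lookup over `line_starts`, the sorted byte offsets at which each physical line begins. A minimal standalone sketch of the idea (not enderpy's exact API; the real method adds the special cases shown in parser/src/token.rs below):

```rust
// Map a byte offset to a 0-based (row, column) pair using sorted line starts.
// Minimal sketch of the lookup that Token::get_row_col_position builds on.
fn to_row_col(line_starts: &[u32], offset: u32) -> (usize, u32) {
    let (row, line_start) = match line_starts.binary_search(&offset) {
        // The offset sits exactly on a line start: column 0 of that line.
        Ok(idx) => (idx, line_starts[idx]),
        // Err returns the insertion point, so the offset belongs to the
        // previous recorded line.
        Err(idx) => (idx - 1, line_starts[idx - 1]),
    };
    (row, offset - line_start)
}

fn main() {
    let starts = [0u32, 11, 20]; // line starts for "a: int = 1\nprint(a)\n"
    assert_eq!(to_row_col(&starts, 0), (0, 0)); // 'a' begins line 0
    assert_eq!(to_row_col(&starts, 17), (1, 6)); // the 'a' inside print(a)
}
```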
34 changes: 17 additions & 17 deletions parser/src/lexer/compat.rs
@@ -1,8 +1,4 @@
use miette::{bail, IntoDiagnostic, Result};
use serde::{Deserialize, Serialize};
use std::io::Write;

use crate::runpython::{default_python_path, spawn_python_script_command};

// Derived from:
// https://github.com/python/cpython/blob/main/Lib/token.py
@@ -104,11 +100,15 @@ mod tests {
use super::*;
use crate::token::Kind;
use crate::{lexer::Lexer, token::Token};
use miette::{bail, IntoDiagnostic, Result};
use std::io::Write;
use tabled::{
builder::Builder,
settings::peaker::PriorityMax,
settings::{Style, Width},
};

use crate::runpython::{default_python_path, spawn_python_script_command};
use terminal_size::{terminal_size, Width as TerminalWidth};

fn lex_python_source(source: &str) -> Result<Vec<PythonToken>> {
@@ -390,6 +390,17 @@ def",
]);
}

#[test]
fn test_logical_and_physical_lines() {
python_tokenize_test_lexer(&[
// In this case the first line should have a physical line number
"
a: int = 1
print(a)
",
]);
}

#[test]
#[should_panic]
fn test_lex_unterminated_string_double_quotes() {
@@ -705,19 +716,8 @@ def",
));
}

let (mut enderpy_start_row, enderpy_start_col) = lexer.to_row_col(enderpy_token.start);
let (mut enderpy_end_row, mut enderpy_end_col) = lexer.to_row_col(enderpy_token.end);
// Python reserves the first row for a file encoding when detokenizing, so we add one
// to our row values to match.
enderpy_start_row += 1;
enderpy_end_row += 1;
if enderpy_token.kind == Kind::NewLine {
// enderpy has newline tokens span from the end of the first line to the beginning of
// the next line.
// Python adds the token to the end of the first line.
enderpy_end_row = enderpy_start_row;
enderpy_end_col = enderpy_start_col + 1;
}
let (enderpy_start_row, enderpy_start_col, enderpy_end_row, enderpy_end_col) =
enderpy_token.get_row_col_position(&lexer.line_starts);
let python_token_start = python_token.start;
let python_token_end = python_token.end;
if enderpy_start_row != python_token_start.0
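The fixups deleted above encoded two CPython `tokenize` conventions that `get_row_col_position` now absorbs. A hand-written illustration (the positions reflect CPython's behavior for a tiny file as I understand it, not output captured from this test suite):

```rust
// For the source "a\n", CPython's tokenize reports roughly:
//   ENCODING 'utf-8' at (0,0)-(0,0)  <- row 0 is reserved for the encoding
//   NAME     'a'     at (1,0)-(1,1)     token, so real rows are 1-based
//   NEWLINE  '\n'    at (1,1)-(1,2)  <- ends on the SAME physical line
//   ENDMARKER        at (2,0)-(2,0)
fn main() {
    let (newline_start, newline_end) = ((1u32, 1u32), (1u32, 2u32));
    // The convention the deleted branch used to enforce by hand:
    assert_eq!(newline_start.0, newline_end.0);
}
```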
22 changes: 18 additions & 4 deletions parser/src/lexer/mod.rs
@@ -61,7 +61,11 @@ pub struct Lexer<'a> {
/// Array of all line start offsets. Starts from line 0
pub line_starts: Vec<u32>,
peak_mode: bool,

/// Previous token was a Newline token
prev_token_newline: bool,
/// Whether the cursor is positioned after the indentation of the current line
indented: bool,
}

impl<'a> Lexer<'a> {
@@ -75,9 +79,10 @@ impl<'a> Lexer<'a> {
nesting: 0,
tokenization_mode_stack: vec![],
next_token_is_dedent: 0,
line_starts: vec![],
line_starts: vec![0],
peak_mode: false,
prev_token_newline: false,
prev_token_newline: true,
indented: false,
}
}

@@ -131,6 +136,10 @@ impl<'a> Lexer<'a> {
let value = self.parse_token_value(kind, start);
let end = self.current;

if kind == Kind::NewLine || kind == Kind::NL {
self.line_starts.push(self.current);
}

Token {
kind,
value,
@@ -147,8 +156,10 @@
let start_of_line = self.start_of_line;
let next_token_is_dedent = self.next_token_is_dedent;
let prev_token_newline = self.prev_token_newline;
let indented = self.indented;
self.peak_mode = true;
let token = self.next_token();
self.indented = indented;
self.prev_token_newline = prev_token_newline;
self.peak_mode = false;
self.current = current;
@@ -261,7 +272,6 @@ impl<'a> Lexer<'a> {

fn next_kind(&mut self) -> Result<Kind, LexError> {
if self.start_of_line {
self.line_starts.push(self.current);
if self.nesting == 0 {
if let Some(indent_kind) = self.match_indentation()? {
self.start_of_line = false; // WHY!?
@@ -517,7 +527,7 @@ impl<'a> Lexer<'a> {
'\n' | '\r' => {
self.current_line += 1;
self.start_of_line = true;
if self.nesting == 0 && !self.prev_token_newline {
if self.nesting == 0 && (!self.prev_token_newline) {
return Ok(Kind::NewLine);
} else {
return Ok(Kind::NL);
@@ -1060,7 +1070,11 @@ impl<'a> Lexer<'a> {
count
}

// This function is only used in tests; not sure whether to keep it here
#[allow(dead_code)]
fn to_row_col(&self, source_offset: u32) -> (u32, u32) {
println!("source_offset: {}", source_offset);
println!("line_starts: {:?}", self.line_starts);
let (line_row, line_offset) = match self.line_starts.binary_search(&source_offset) {
Ok(idx) => (idx, self.line_starts[idx]),
Err(idx) => (idx - 1, self.line_starts[idx - 1]),
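Net effect of the lexer changes: `line_starts` is seeded with 0 and grows by the offset just past each NewLine/NL token, so every physical line start is recorded exactly once. A standalone sketch of roughly equivalent bookkeeping (assuming plain '\n' line endings; the real lexer also handles '\r'):

```rust
// Standalone sketch, not the lexer itself: seed with 0 so the first
// physical line is always present, then record the offset just past
// every '\n' as the start of the next line.
fn line_starts(source: &str) -> Vec<u32> {
    let mut starts = vec![0u32];
    for (i, byte) in source.bytes().enumerate() {
        if byte == b'\n' {
            starts.push((i + 1) as u32);
        }
    }
    starts
}

fn main() {
    assert_eq!(line_starts("a: int = 1\nprint(a)\n"), vec![0, 11, 20]);
}
```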
35 changes: 35 additions & 0 deletions parser/src/token.rs
@@ -10,6 +10,41 @@ pub struct Token {
pub end: u32,
}

impl Token {
pub fn get_row_col_position(&self, line_starts: &[u32]) -> (u32, u32, u32, u32) {
let (start_line_num, start_line_offset) = match line_starts.binary_search(&self.start) {
Ok(idx) => (idx, line_starts[idx]),
Err(idx) => (idx - 1, line_starts[idx - 1]),
};
let start_line_column = self.start - start_line_offset;
// EOF token (start == end): report the same position for start and end
if self.start == self.end {
return (
start_line_num as u32 + 1,
start_line_column,
start_line_num as u32 + 1,
start_line_column,
);
}
let (end_line_num, end_line_offset) = match line_starts.binary_search(&self.end) {
// Special case: this is a newline token.
// When the end offset falls exactly on a line start, that offset is the
// end of a newline token. We want the newline token to report the same
// line number for both start and end.
Ok(idx) => (idx - 1, line_starts[idx - 1]),
Err(idx) => (idx - 1, line_starts[idx - 1]),
};
let end_line_column = self.end.saturating_sub(end_line_offset);

(
start_line_num as u32 + 1,
start_line_column,
end_line_num as u32 + 1,
end_line_column,
)
}
}

impl Display for Token {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let kind: &str = self.kind.into();
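To make the two special cases concrete, here is a standalone re-derivation with hand-picked offsets (it mirrors the method above rather than calling into the crate; the example source is hypothetical):

```rust
// Mirrors Token::get_row_col_position, for illustration only.
fn row_col(line_starts: &[u32], start: u32, end: u32) -> (u32, u32, u32, u32) {
    let (srow, soff) = match line_starts.binary_search(&start) {
        Ok(i) => (i, line_starts[i]),
        Err(i) => (i - 1, line_starts[i - 1]),
    };
    if start == end {
        // EOF: zero-width token, same position for start and end.
        return (srow as u32 + 1, start - soff, srow as u32 + 1, start - soff);
    }
    let (erow, eoff) = match line_starts.binary_search(&end) {
        // An end offset exactly on a line start marks a newline token;
        // step back one line so start and end share a line number.
        Ok(i) | Err(i) => (i - 1, line_starts[i - 1]),
    };
    (srow as u32 + 1, start - soff, erow as u32 + 1, end - eoff)
}

fn main() {
    let starts = [0u32, 2]; // line starts for the source "a\nb"
    // NewLine token spanning 1..2 (the '\n'): start and end stay on line 1.
    assert_eq!(row_col(&starts, 1, 2), (1, 1, 1, 2));
    // EOF token at 3..3: reported as a single point on line 2.
    assert_eq!(row_col(&starts, 3, 3), (2, 1, 2, 1));
}
```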
