Skip to content

Commit

Permalink
feat: distinguish between logical and physical lines
Browse files Browse the repository at this point in the history
  • Loading branch information
Glyphack committed Jul 31, 2024
1 parent 3dcf0e2 commit f48df1e
Show file tree
Hide file tree
Showing 29 changed files with 308 additions and 293 deletions.
7 changes: 3 additions & 4 deletions enderpy/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,14 @@ use clap::{Parser, Subcommand};
pub struct Cli {
#[command(subcommand)]
pub command: Commands,
#[arg(short, long)]
pub file: Option<PathBuf>,
}

#[derive(Subcommand)]
pub enum Commands {
/// Print lexer tokens
Tokenize {
/// Path to source file
file: PathBuf,
},
Tokenize {},
/// Print abstract syntax tree
Parse {
/// Path to source file
Expand Down
21 changes: 17 additions & 4 deletions enderpy/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::{
fs, io,
fs::{self, File},
io::{self, Read},
path::{Path, PathBuf},
};

Expand All @@ -14,7 +15,7 @@ mod cli;
fn main() -> Result<()> {
let cli = Cli::parse();
match &cli.command {
Commands::Tokenize { file } => tokenize(file),
Commands::Tokenize {} => tokenize(),
Commands::Parse { file } => parse(file),
Commands::Check { path } => check(path),
Commands::Watch => watch(),
Expand Down Expand Up @@ -70,8 +71,20 @@ fn get_typeshed_path() -> Result<PathBuf> {
Ok(path.join("typeshed"))
}

fn tokenize(file: &PathBuf) -> Result<()> {
let source = fs::read_to_string(file).into_diagnostic()?;
fn tokenize() -> Result<()> {
let cli = Cli::parse();
let mut source = String::new();
match cli.file {
Some(path) => {
File::open(path)
.into_diagnostic()?
.read_to_string(&mut source)
.into_diagnostic()?;
}
None => {
io::stdin().read_to_string(&mut source).into_diagnostic()?;
}
}
let mut lexer = Lexer::new(&source);
let tokens = lexer.lex();
for token in tokens {
Expand Down
71 changes: 32 additions & 39 deletions parser/src/lexer/compat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,29 +98,10 @@ pub struct PythonToken {
end: (u32, u32),
}

fn lex_python_source(source: &str) -> Result<Vec<PythonToken>> {
let mut process = spawn_python_script_command(
"parser/lex_python.py",
vec!["--stdin", "--output-format", "json"],
default_python_path()?,
)?;

// Get process stdin and write the input string.
if let Some(mut stdin) = process.stdin.take() {
stdin.write_all(source.as_bytes()).into_diagnostic()?;
} else {
bail!("Failed to open stdin when running `parser/lex_python.py`");
}
// Get process stdout and parse result.
let output = process.wait_with_output().into_diagnostic()?;
let python_tokens: Vec<PythonToken> =
serde_json::from_str(String::from_utf8_lossy(&output.stdout).as_ref()).into_diagnostic()?;
Ok(python_tokens)
}

#[cfg(test)]
mod tests {
use super::{lex_python_source, PythonKind, PythonToken};

use super::*;
use crate::token::Kind;
use crate::{lexer::Lexer, token::Token};
use tabled::{
Expand All @@ -130,6 +111,27 @@ mod tests {
};
use terminal_size::{terminal_size, Width as TerminalWidth};

/// Lexes `source` with CPython's own tokenizer by shelling out to the
/// `parser/lex_python.py` helper script, and returns the tokens it reports.
/// Used by the compat tests to compare Enderpy's lexer against Python's.
///
/// Errors if the helper process cannot be spawned, its stdin cannot be
/// opened, or its stdout is not valid JSON for `Vec<PythonToken>`.
fn lex_python_source(source: &str) -> Result<Vec<PythonToken>> {
let mut process = spawn_python_script_command(
"parser/lex_python.py",
vec!["--stdin", "--output-format", "json"],
default_python_path()?,
)?;

// Get process stdin and write the input string.
// NOTE: stdin is taken and dropped at the end of this `if` block, which
// closes the pipe so the child sees EOF before we wait on it below.
if let Some(mut stdin) = process.stdin.take() {
stdin.write_all(source.as_bytes()).into_diagnostic()?;
} else {
bail!("Failed to open stdin when running `parser/lex_python.py`");
}
// Get process stdout and parse result.
// `from_utf8_lossy` tolerates any non-UTF-8 bytes the script might emit;
// serde then deserializes the JSON token list.
let output = process.wait_with_output().into_diagnostic()?;
let python_tokens: Vec<PythonToken> =
serde_json::from_str(String::from_utf8_lossy(&output.stdout).as_ref())
.into_diagnostic()?;
Ok(python_tokens)
}

#[test]
fn test_simple_compat() {
let source = r#"
Expand Down Expand Up @@ -168,8 +170,6 @@ print(a)
";",
"@",
"=",
// TODO lex_python: Python lexer chokes on single backslash.
// "\\",
"#",
"$",
"?",
Expand Down Expand Up @@ -317,18 +317,12 @@ print(a)
python_tokenize_test_lexer(&["import a", "import a.b", "import a.b.c", "import a from b"]);
}

// TODO lex_python: Decide whether to keep this test or not. The Python lexer + Enderpy lexer
// handle newlines in a nested context slightly differently.
// - Python increments the row counter.
// - Enderpy appends them to the original row.
// #[test]
// fn test_lex_other() {
// python_tokenize_test_lexer(
// &["(a,
//
// )"],
// );
// }
// Newlines inside a parenthesized (nested) context: with the NL/NewLine
// distinction introduced by this change, the line break inside `(a, ... )`
// should lex as a physical NL rather than a logical NewLine, matching
// CPython's tokenizer (this test was previously disabled for that mismatch).
#[test]
fn test_lex_other() {
python_tokenize_test_lexer(&["(a,
)"]);
}

#[test]
fn test_lex_indentation() {
Expand Down Expand Up @@ -672,9 +666,7 @@ def",
PythonKind::FstringMiddle => enderpy_token.kind == Kind::FStringMiddle,
PythonKind::FstringEnd => enderpy_token.kind == Kind::FStringEnd,
PythonKind::Comment => enderpy_token.kind == Kind::Comment,
// In Python, this represents a line break within a single statement. We don't
// currently make this distinction.
PythonKind::NL => enderpy_token.kind == Kind::NewLine,
PythonKind::NL => enderpy_token.kind == Kind::NL,
PythonKind::ErrorToken => {
match python_token.value.as_str() {
// Python 3.11 chokes on these tokens.
Expand All @@ -701,7 +693,8 @@ def",
|| matches_python_op_token(python_token.value.as_str(), &enderpy_token.kind)
|| matches_python_indent_dedent_token(&python_token.kind, &enderpy_token.kind)
|| (python_token.kind == PythonKind::EndMarker && enderpy_token.kind == Kind::Eof)
|| (python_token.value.as_str() == "\n" && enderpy_token.kind == Kind::NewLine)
|| (python_token.value.as_str() == "\n"
&& (matches!(enderpy_token.kind, Kind::NewLine | Kind::NL)))
|| python_token_value == enderpy_token_value;
if !value_matches {
return Some(TokenMismatch::WrongValue(
Expand Down
21 changes: 16 additions & 5 deletions parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ pub struct Lexer<'a> {
/// Array of all line starts offsets. Starts from line 0
pub line_starts: Vec<u32>,
peak_mode: bool,
prev_token_newline: bool,
}

impl<'a> Lexer<'a> {
Expand All @@ -76,6 +77,7 @@ impl<'a> Lexer<'a> {
next_token_is_dedent: 0,
line_starts: vec![],
peak_mode: false,
prev_token_newline: false,
}
}

Expand Down Expand Up @@ -125,6 +127,7 @@ impl<'a> Lexer<'a> {
return self.next_token();
}

self.prev_token_newline = kind == Kind::NewLine;
let value = self.parse_token_value(kind, start);
let end = self.current;

Expand All @@ -143,8 +146,10 @@ impl<'a> Lexer<'a> {
let nesting = self.nesting;
let start_of_line = self.start_of_line;
let next_token_is_dedent = self.next_token_is_dedent;
let prev_token_newline = self.prev_token_newline;
self.peak_mode = true;
let token = self.next_token();
self.prev_token_newline = prev_token_newline;
self.peak_mode = false;
self.current = current;
self.current_line = current_line;
Expand Down Expand Up @@ -255,11 +260,13 @@ impl<'a> Lexer<'a> {
}

fn next_kind(&mut self) -> Result<Kind, LexError> {
if self.start_of_line && self.nesting == 0 {
if self.start_of_line {
self.line_starts.push(self.current);
if let Some(indent_kind) = self.match_indentation()? {
self.start_of_line = false; // WHY!?
return Ok(indent_kind);
if self.nesting == 0 {
if let Some(indent_kind) = self.match_indentation()? {
self.start_of_line = false; // WHY!?
return Ok(indent_kind);
}
}
}

Expand Down Expand Up @@ -510,7 +517,11 @@ impl<'a> Lexer<'a> {
'\n' | '\r' => {
self.current_line += 1;
self.start_of_line = true;
return Ok(Kind::NewLine);
if self.nesting == 0 && !self.prev_token_newline {
return Ok(Kind::NewLine);
} else {
return Ok(Kind::NL);
}
}
c if match_whitespace(c) => return Ok(Kind::WhiteSpace),
_ => {}
Expand Down
13 changes: 5 additions & 8 deletions parser/src/parser/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -176,12 +176,6 @@ impl<'a> Parser<'a> {
}
}

// Skips tokens until a statement boundary: consumes through the next
// NewLine or SemiColon, stopping (without consuming) at Eof.
// NOTE(review): removed in this commit — it appears to have no remaining
// callers; confirm before restoring.
fn advance_to_next_line_or_semicolon(&mut self) {
while !self.eat(Kind::NewLine) && !self.eat(Kind::SemiColon) && !self.at(Kind::Eof) {
self.advance();
}
}

/// Expect a `Kind` or return error
pub fn expect(&mut self, kind: Kind) -> Result<(), ParsingError> {
if !self.at(kind) {
Expand Down Expand Up @@ -330,7 +324,10 @@ impl<'a> Parser<'a> {
) {
while self.eat(Kind::WhiteSpace) || self.eat(Kind::Comment) {}

if !matches!(self.cur_kind(), Kind::NewLine | Kind::SemiColon | Kind::Eof) {
if !matches!(
self.cur_kind(),
Kind::NewLine | Kind::NL | Kind::SemiColon | Kind::Eof
) {
panic!("Statement does not end in new line or semicolon {:?}", stmt);
}
}
Expand Down Expand Up @@ -1974,7 +1971,7 @@ impl<'a> Parser<'a> {

fn consume_whitespace_and_newline(&mut self) -> bool {
let mut consumed = false;
while matches!(self.cur_kind(), Kind::WhiteSpace | Kind::NewLine) {
while matches!(self.cur_kind(), Kind::WhiteSpace | Kind::NewLine | Kind::NL) {
self.advance();
consumed = true;
}
Expand Down
2 changes: 2 additions & 0 deletions parser/src/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ impl Display for Token {
pub enum Kind {
// Line structure
NewLine, // \n
NL, // Logical newline
Indent, // \t
Dedent, // \t

Expand Down Expand Up @@ -313,6 +314,7 @@ impl From<Kind> for &str {
Kind::Less => "<",
Kind::Greater => ">",
Kind::NewLine => "NewLine",
Kind::NL => "NL",
Kind::Identifier => "Identifier",
Kind::False => "False",
Kind::None => "None",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ description: "def f(x):\n y = z\n\n print(y)\n"
16,17: = (None)
18,19: Identifier (Str("z"))
19,20: NewLine (None)
20,21: NewLine (None)
20,21: NL (None)
25,30: Identifier (Str("print"))
30,31: ( (None)
31,32: Identifier (Str("y"))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ description: "if a:\n\n f = c\n\n # Path: test_local.py\n"
3,4: Identifier (Str("a"))
4,5: : (None)
5,6: NewLine (None)
6,7: NewLine (None)
6,7: NL (None)
7,11: Indent (Indent(1))
11,12: Identifier (Str("f"))
13,14: = (None)
15,16: Identifier (Str("c"))
16,17: NewLine (None)
17,18: NewLine (None)
17,18: NL (None)
22,43: Comment (Str("# Path: test_local.py"))
43,44: NewLine (None)
44,44: Dedent (Indent(1))
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@ description: "(a,\n\n)"
0,1: ( (None)
1,2: Identifier (Str("a"))
2,3: , (None)
3,4: NewLine (None)
4,5: NewLine (None)
3,4: NL (None)
4,5: NL (None)
5,6: ) (None)
Loading

0 comments on commit f48df1e

Please sign in to comment.