Skip to content

Commit

Permalink
feat: distinguish between logical and physical lines
Browse files Browse the repository at this point in the history
  • Loading branch information
Glyphack committed Jul 31, 2024
1 parent 3dcf0e2 commit f48df1e
Show file tree
Hide file tree
Showing 29 changed files with 308 additions and 293 deletions.
7 changes: 3 additions & 4 deletions enderpy/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,14 @@ use clap::{Parser, Subcommand};
pub struct Cli {
#[command(subcommand)]
pub command: Commands,
#[arg(short, long)]
pub file: Option<PathBuf>,
}

#[derive(Subcommand)]
pub enum Commands {
/// Print lexer tokens
Tokenize {
/// Path to source file
file: PathBuf,
},
Tokenize {},
/// Print abstract syntax tree
Parse {
/// Path to source file
Expand Down
21 changes: 17 additions & 4 deletions enderpy/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::{
fs, io,
fs::{self, File},
io::{self, Read},
path::{Path, PathBuf},
};

Expand All @@ -14,7 +15,7 @@ mod cli;
fn main() -> Result<()> {
let cli = Cli::parse();
match &cli.command {
Commands::Tokenize { file } => tokenize(file),
Commands::Tokenize {} => tokenize(),
Commands::Parse { file } => parse(file),
Commands::Check { path } => check(path),
Commands::Watch => watch(),
Expand Down Expand Up @@ -70,8 +71,20 @@ fn get_typeshed_path() -> Result<PathBuf> {
Ok(path.join("typeshed"))
}

fn tokenize(file: &PathBuf) -> Result<()> {
let source = fs::read_to_string(file).into_diagnostic()?;
fn tokenize() -> Result<()> {
let cli = Cli::parse();
let mut source = String::new();
match cli.file {
Some(path) => {
File::open(path)
.into_diagnostic()?
.read_to_string(&mut source)
.into_diagnostic()?;
}
None => {
io::stdin().read_to_string(&mut source).into_diagnostic()?;
}
}
let mut lexer = Lexer::new(&source);
let tokens = lexer.lex();
for token in tokens {
Expand Down
71 changes: 32 additions & 39 deletions parser/src/lexer/compat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,29 +98,10 @@ pub struct PythonToken {
end: (u32, u32),
}

fn lex_python_source(source: &str) -> Result<Vec<PythonToken>> {
let mut process = spawn_python_script_command(
"parser/lex_python.py",
vec!["--stdin", "--output-format", "json"],
default_python_path()?,
)?;

// Get process stdin and write the input string.
if let Some(mut stdin) = process.stdin.take() {
stdin.write_all(source.as_bytes()).into_diagnostic()?;
} else {
bail!("Failed to open stdin when running `parser/lex_python.py`");
}
// Get process stdout and parse result.
let output = process.wait_with_output().into_diagnostic()?;
let python_tokens: Vec<PythonToken> =
serde_json::from_str(String::from_utf8_lossy(&output.stdout).as_ref()).into_diagnostic()?;
Ok(python_tokens)
}

#[cfg(test)]
mod tests {
use super::{lex_python_source, PythonKind, PythonToken};

use super::*;
use crate::token::Kind;
use crate::{lexer::Lexer, token::Token};
use tabled::{
Expand All @@ -130,6 +111,27 @@ mod tests {
};
use terminal_size::{terminal_size, Width as TerminalWidth};

/// Lexes `source` with CPython's own tokenizer by shelling out to the
/// `parser/lex_python.py` helper script, and returns the tokens it reports.
/// Used by the compat tests to compare Enderpy's lexer against Python's.
///
/// Errors if the helper process cannot be spawned, its stdin cannot be
/// opened, or its stdout is not valid JSON for `Vec<PythonToken>`.
fn lex_python_source(source: &str) -> Result<Vec<PythonToken>> {
let mut process = spawn_python_script_command(
"parser/lex_python.py",
vec!["--stdin", "--output-format", "json"],
default_python_path()?,
)?;

// Get process stdin and write the input string.
// NOTE: stdin is taken and dropped at the end of this `if` block, which
// closes the pipe so the child sees EOF before we wait on it below.
if let Some(mut stdin) = process.stdin.take() {
stdin.write_all(source.as_bytes()).into_diagnostic()?;
} else {
bail!("Failed to open stdin when running `parser/lex_python.py`");
}
// Get process stdout and parse result.
// `from_utf8_lossy` tolerates any non-UTF-8 bytes the script might emit;
// serde then deserializes the JSON token list.
let output = process.wait_with_output().into_diagnostic()?;
let python_tokens: Vec<PythonToken> =
serde_json::from_str(String::from_utf8_lossy(&output.stdout).as_ref())
.into_diagnostic()?;
Ok(python_tokens)
}

#[test]
fn test_simple_compat() {
let source = r#"
Expand Down Expand Up @@ -168,8 +170,6 @@ print(a)
";",
"@",
"=",
// TODO lex_python: Python lexer chokes on single backslash.
// "\\",
"#",
"$",
"?",
Expand Down Expand Up @@ -317,18 +317,12 @@ print(a)
python_tokenize_test_lexer(&["import a", "import a.b", "import a.b.c", "import a from b"]);
}

// TODO lex_python: Decide whether to keep this test or not. The Python lexer + Enderpy lexer
// handle newlines in a nested context slightly differently.
// - Python increments the row counter.
// - Enderpy appends them to the original row.
// #[test]
// fn test_lex_other() {
// python_tokenize_test_lexer(
// &["(a,
//
// )"],
// );
// }
// Newlines inside a parenthesized (nested) context: with the NL/NewLine
// distinction introduced by this change, the line break inside `(a, ... )`
// should lex as a physical NL rather than a logical NewLine, matching
// CPython's tokenizer (this test was previously disabled for that mismatch).
#[test]
fn test_lex_other() {
python_tokenize_test_lexer(&["(a,
)"]);
}

#[test]
fn test_lex_indentation() {
Expand Down Expand Up @@ -672,9 +666,7 @@ def",
PythonKind::FstringMiddle => enderpy_token.kind == Kind::FStringMiddle,
PythonKind::FstringEnd => enderpy_token.kind == Kind::FStringEnd,
PythonKind::Comment => enderpy_token.kind == Kind::Comment,
// In Python, this represents a line break within a single statement. We don't
// currently make this distinction.
PythonKind::NL => enderpy_token.kind == Kind::NewLine,
PythonKind::NL => enderpy_token.kind == Kind::NL,
PythonKind::ErrorToken => {
match python_token.value.as_str() {
// Python 3.11 chokes on these tokens.
Expand All @@ -701,7 +693,8 @@ def",
|| matches_python_op_token(python_token.value.as_str(), &enderpy_token.kind)
|| matches_python_indent_dedent_token(&python_token.kind, &enderpy_token.kind)
|| (python_token.kind == PythonKind::EndMarker && enderpy_token.kind == Kind::Eof)
|| (python_token.value.as_str() == "\n" && enderpy_token.kind == Kind::NewLine)
|| (python_token.value.as_str() == "\n"
&& (matches!(enderpy_token.kind, Kind::NewLine | Kind::NL)))
|| python_token_value == enderpy_token_value;
if !value_matches {
return Some(TokenMismatch::WrongValue(
Expand Down
21 changes: 16 additions & 5 deletions parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ pub struct Lexer<'a> {
/// Array of all line starts offsets. Starts from line 0
pub line_starts: Vec<u32>,
peak_mode: bool,
prev_token_newline: bool,
}

impl<'a> Lexer<'a> {
Expand All @@ -76,6 +77,7 @@ impl<'a> Lexer<'a> {
next_token_is_dedent: 0,
line_starts: vec![],
peak_mode: false,
prev_token_newline: false,
}
}

Expand Down Expand Up @@ -125,6 +127,7 @@ impl<'a> Lexer<'a> {
return self.next_token();
}

self.prev_token_newline = kind == Kind::NewLine;
let value = self.parse_token_value(kind, start);
let end = self.current;

Expand All @@ -143,8 +146,10 @@ impl<'a> Lexer<'a> {
let nesting = self.nesting;
let start_of_line = self.start_of_line;
let next_token_is_dedent = self.next_token_is_dedent;
let prev_token_newline = self.prev_token_newline;
self.peak_mode = true;
let token = self.next_token();
self.prev_token_newline = prev_token_newline;
self.peak_mode = false;
self.current = current;
self.current_line = current_line;
Expand Down Expand Up @@ -255,11 +260,13 @@ impl<'a> Lexer<'a> {
}

fn next_kind(&mut self) -> Result<Kind, LexError> {
if self.start_of_line && self.nesting == 0 {
if self.start_of_line {
self.line_starts.push(self.current);
if let Some(indent_kind) = self.match_indentation()? {
self.start_of_line = false; // WHY!?
return Ok(indent_kind);
if self.nesting == 0 {
if let Some(indent_kind) = self.match_indentation()? {
self.start_of_line = false; // WHY!?
return Ok(indent_kind);
}
}
}

Expand Down Expand Up @@ -510,7 +517,11 @@ impl<'a> Lexer<'a> {
'\n' | '\r' => {
self.current_line += 1;
self.start_of_line = true;
return Ok(Kind::NewLine);
if self.nesting == 0 && !self.prev_token_newline {
return Ok(Kind::NewLine);
} else {
return Ok(Kind::NL);
}
}
c if match_whitespace(c) => return Ok(Kind::WhiteSpace),
_ => {}
Expand Down
13 changes: 5 additions & 8 deletions parser/src/parser/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -176,12 +176,6 @@ impl<'a> Parser<'a> {
}
}

// Skips tokens until a statement boundary: consumes through the next
// NewLine or SemiColon, stopping (without consuming) at Eof.
// NOTE(review): removed in this commit — it appears to have no remaining
// callers; confirm before restoring.
fn advance_to_next_line_or_semicolon(&mut self) {
while !self.eat(Kind::NewLine) && !self.eat(Kind::SemiColon) && !self.at(Kind::Eof) {
self.advance();
}
}

/// Expect a `Kind` or return error
pub fn expect(&mut self, kind: Kind) -> Result<(), ParsingError> {
if !self.at(kind) {
Expand Down Expand Up @@ -330,7 +324,10 @@ impl<'a> Parser<'a> {
) {
while self.eat(Kind::WhiteSpace) || self.eat(Kind::Comment) {}

if !matches!(self.cur_kind(), Kind::NewLine | Kind::SemiColon | Kind::Eof) {
if !matches!(
self.cur_kind(),
Kind::NewLine | Kind::NL | Kind::SemiColon | Kind::Eof
) {
panic!("Statement does not end in new line or semicolon {:?}", stmt);
}
}
Expand Down Expand Up @@ -1974,7 +1971,7 @@ impl<'a> Parser<'a> {

fn consume_whitespace_and_newline(&mut self) -> bool {
let mut consumed = false;
while matches!(self.cur_kind(), Kind::WhiteSpace | Kind::NewLine) {
while matches!(self.cur_kind(), Kind::WhiteSpace | Kind::NewLine | Kind::NL) {
self.advance();
consumed = true;
}
Expand Down
2 changes: 2 additions & 0 deletions parser/src/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ impl Display for Token {
pub enum Kind {
// Line structure
NewLine, // \n
NL, // Logical newline
Indent, // \t
Dedent, // \t

Expand Down Expand Up @@ -313,6 +314,7 @@ impl From<Kind> for &str {
Kind::Less => "<",
Kind::Greater => ">",
Kind::NewLine => "NewLine",
Kind::NL => "NL",
Kind::Identifier => "Identifier",
Kind::False => "False",
Kind::None => "None",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ description: "def f(x):\n y = z\n\n print(y)\n"
16,17: = (None)
18,19: Identifier (Str("z"))
19,20: NewLine (None)
20,21: NewLine (None)
20,21: NL (None)
25,30: Identifier (Str("print"))
30,31: ( (None)
31,32: Identifier (Str("y"))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ description: "if a:\n\n f = c\n\n # Path: test_local.py\n"
3,4: Identifier (Str("a"))
4,5: : (None)
5,6: NewLine (None)
6,7: NewLine (None)
6,7: NL (None)
7,11: Indent (Indent(1))
11,12: Identifier (Str("f"))
13,14: = (None)
15,16: Identifier (Str("c"))
16,17: NewLine (None)
17,18: NewLine (None)
17,18: NL (None)
22,43: Comment (Str("# Path: test_local.py"))
43,44: NewLine (None)
44,44: Dedent (Indent(1))
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@ description: "(a,\n\n)"
0,1: ( (None)
1,2: Identifier (Str("a"))
2,3: , (None)
3,4: NewLine (None)
4,5: NewLine (None)
3,4: NL (None)
4,5: NL (None)
5,6: ) (None)
Loading

0 comments on commit f48df1e

Please sign in to comment.