Fix token position according to python tokenize
Glyphack committed Aug 4, 2024
1 parent f48df1e commit 9104f6f
Showing 4 changed files with 74 additions and 32 deletions.
15 changes: 4 additions & 11 deletions enderpy/src/main.rs
@@ -85,20 +85,13 @@ fn tokenize() -> Result<()> {
io::stdin().read_to_string(&mut source).into_diagnostic()?;
}
}
println!("{:?}", source.chars().collect::<Vec<char>>());
let mut lexer = Lexer::new(&source);
let tokens = lexer.lex();
println!("{:?}", lexer.line_starts);
for token in tokens {
let (start_line_num, start_line_offset) =
match lexer.line_starts.binary_search(&token.start) {
Ok(idx) => (idx, lexer.line_starts[idx]),
Err(idx) => (idx - 1, lexer.line_starts[idx - 1]),
};
let start_line_column = token.start - start_line_offset;
let (end_line_num, end_line_offset) = match lexer.line_starts.binary_search(&token.end) {
Ok(idx) => (idx, lexer.line_starts[idx]),
Err(idx) => (idx - 1, lexer.line_starts[idx - 1]),
};
let end_line_column = token.end - end_line_offset;
let (start_line_num, start_line_column, end_line_num, end_line_column) =
token.get_row_col_position(&lexer.line_starts);
println!(
"{}-{}, {}-{}: {} {} {} {}",
start_line_num,
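For context, both call sites now compute positions from the same binary-search lookup over `line_starts`, the sorted byte offsets at which each physical line begins. A minimal standalone sketch of the idea (not enderpy's exact API; the real method adds the special cases shown in parser/src/token.rs below):

```rust
// Map a byte offset to a 0-based (row, column) pair using sorted line starts.
// Minimal sketch of the lookup that Token::get_row_col_position builds on.
fn to_row_col(line_starts: &[u32], offset: u32) -> (usize, u32) {
    let (row, line_start) = match line_starts.binary_search(&offset) {
        // The offset sits exactly on a line start: column 0 of that line.
        Ok(idx) => (idx, line_starts[idx]),
        // Err returns the insertion point, so the offset belongs to the
        // previous recorded line.
        Err(idx) => (idx - 1, line_starts[idx - 1]),
    };
    (row, offset - line_start)
}

fn main() {
    let starts = [0u32, 11, 20]; // line starts for "a: int = 1\nprint(a)\n"
    assert_eq!(to_row_col(&starts, 0), (0, 0)); // 'a' begins line 0
    assert_eq!(to_row_col(&starts, 17), (1, 6)); // the 'a' inside print(a)
}
```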
34 changes: 17 additions & 17 deletions parser/src/lexer/compat.rs
@@ -1,8 +1,4 @@
use miette::{bail, IntoDiagnostic, Result};
use serde::{Deserialize, Serialize};
use std::io::Write;

use crate::runpython::{default_python_path, spawn_python_script_command};

// Derived from:
// https://github.com/python/cpython/blob/main/Lib/token.py
@@ -104,11 +100,15 @@ mod tests {
use super::*;
use crate::token::Kind;
use crate::{lexer::Lexer, token::Token};
use miette::{bail, IntoDiagnostic, Result};
use std::io::Write;
use tabled::{
builder::Builder,
settings::peaker::PriorityMax,
settings::{Style, Width},
};

use crate::runpython::{default_python_path, spawn_python_script_command};
use terminal_size::{terminal_size, Width as TerminalWidth};

fn lex_python_source(source: &str) -> Result<Vec<PythonToken>> {
@@ -390,6 +390,17 @@ def",
]);
}

#[test]
fn test_logical_and_physical_lines() {
python_tokenize_test_lexer(&[
// In this case the first line should have a physical line number
"
a: int = 1
print(a)
",
]);
}

#[test]
#[should_panic]
fn test_lex_unterminated_string_double_quotes() {
@@ -705,19 +716,8 @@ def",
));
}

let (mut enderpy_start_row, enderpy_start_col) = lexer.to_row_col(enderpy_token.start);
let (mut enderpy_end_row, mut enderpy_end_col) = lexer.to_row_col(enderpy_token.end);
// Python reserves the first row for a file encoding when detokenizing, so we add one
// to our row values to match.
enderpy_start_row += 1;
enderpy_end_row += 1;
if enderpy_token.kind == Kind::NewLine {
// enderpy has newline tokens span from the end of the first line to the beginning of
// the next line.
// Python adds the token to the end of the first line.
enderpy_end_row = enderpy_start_row;
enderpy_end_col = enderpy_start_col + 1;
}
let (enderpy_start_row, enderpy_start_col, enderpy_end_row, enderpy_end_col) =
enderpy_token.get_row_col_position(&lexer.line_starts);
let python_token_start = python_token.start;
let python_token_end = python_token.end;
if enderpy_start_row != python_token_start.0
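The fixups deleted above encoded two CPython `tokenize` conventions that `get_row_col_position` now absorbs. A hand-written illustration (the positions reflect CPython's behavior for a tiny file as I understand it, not output captured from this test suite):

```rust
// For the source "a\n", CPython's tokenize reports roughly:
//   ENCODING 'utf-8' at (0,0)-(0,0)  <- row 0 is reserved for the encoding
//   NAME     'a'     at (1,0)-(1,1)     token, so real rows are 1-based
//   NEWLINE  '\n'    at (1,1)-(1,2)  <- ends on the SAME physical line
//   ENDMARKER        at (2,0)-(2,0)
fn main() {
    let (newline_start, newline_end) = ((1u32, 1u32), (1u32, 2u32));
    // The convention the deleted branch used to enforce by hand:
    assert_eq!(newline_start.0, newline_end.0);
}
```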
22 changes: 18 additions & 4 deletions parser/src/lexer/mod.rs
@@ -61,7 +61,11 @@ pub struct Lexer<'a> {
/// Array of all line start offsets. Starts from line 0
pub line_starts: Vec<u32>,
peak_mode: bool,

/// Previous token was a Newline token
prev_token_newline: bool,
/// Whether the cursor is positioned after the indentation of the current line
indented: bool,
}

impl<'a> Lexer<'a> {
@@ -75,9 +79,10 @@ impl<'a> Lexer<'a> {
nesting: 0,
tokenization_mode_stack: vec![],
next_token_is_dedent: 0,
line_starts: vec![],
line_starts: vec![0],
peak_mode: false,
prev_token_newline: false,
prev_token_newline: true,
indented: false,
}
}

@@ -131,6 +136,10 @@ impl<'a> Lexer<'a> {
let value = self.parse_token_value(kind, start);
let end = self.current;

if kind == Kind::NewLine || kind == Kind::NL {
self.line_starts.push(self.current);
}

Token {
kind,
value,
@@ -147,8 +156,10 @@
let start_of_line = self.start_of_line;
let next_token_is_dedent = self.next_token_is_dedent;
let prev_token_newline = self.prev_token_newline;
let indented = self.indented;
self.peak_mode = true;
let token = self.next_token();
self.indented = indented;
self.prev_token_newline = prev_token_newline;
self.peak_mode = false;
self.current = current;
@@ -261,7 +272,6 @@ impl<'a> Lexer<'a> {

fn next_kind(&mut self) -> Result<Kind, LexError> {
if self.start_of_line {
self.line_starts.push(self.current);
if self.nesting == 0 {
if let Some(indent_kind) = self.match_indentation()? {
self.start_of_line = false; // WHY!?
@@ -517,7 +527,7 @@ impl<'a> Lexer<'a> {
'\n' | '\r' => {
self.current_line += 1;
self.start_of_line = true;
if self.nesting == 0 && !self.prev_token_newline {
if self.nesting == 0 && (!self.prev_token_newline) {
return Ok(Kind::NewLine);
} else {
return Ok(Kind::NL);
@@ -1060,7 +1070,11 @@ impl<'a> Lexer<'a> {
count
}

// This function is only used in tests; not sure whether to keep it here
#[allow(dead_code)]
fn to_row_col(&self, source_offset: u32) -> (u32, u32) {
println!("source_offset: {}", source_offset);
println!("line_starts: {:?}", self.line_starts);
let (line_row, line_offset) = match self.line_starts.binary_search(&source_offset) {
Ok(idx) => (idx, self.line_starts[idx]),
Err(idx) => (idx - 1, self.line_starts[idx - 1]),
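Net effect of the lexer changes: `line_starts` is seeded with 0 and grows by the offset just past each NewLine/NL token, so every physical line start is recorded exactly once. A standalone sketch of roughly equivalent bookkeeping (assuming plain '\n' line endings; the real lexer also handles '\r'):

```rust
// Standalone sketch, not the lexer itself: seed with 0 so the first
// physical line is always present, then record the offset just past
// every '\n' as the start of the next line.
fn line_starts(source: &str) -> Vec<u32> {
    let mut starts = vec![0u32];
    for (i, byte) in source.bytes().enumerate() {
        if byte == b'\n' {
            starts.push((i + 1) as u32);
        }
    }
    starts
}

fn main() {
    assert_eq!(line_starts("a: int = 1\nprint(a)\n"), vec![0, 11, 20]);
}
```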
35 changes: 35 additions & 0 deletions parser/src/token.rs
@@ -10,6 +10,41 @@ pub struct Token {
pub end: u32,
}

impl Token {
pub fn get_row_col_position(&self, line_starts: &[u32]) -> (u32, u32, u32, u32) {
let (start_line_num, start_line_offset) = match line_starts.binary_search(&self.start) {
Ok(idx) => (idx, line_starts[idx]),
Err(idx) => (idx - 1, line_starts[idx - 1]),
};
let start_line_column = self.start - start_line_offset;
// EOF token (start == end): report the same position for start and end
if self.start == self.end {
return (
start_line_num as u32 + 1,
start_line_column,
start_line_num as u32 + 1,
start_line_column,
);
}
let (end_line_num, end_line_offset) = match line_starts.binary_search(&self.end) {
// Special case: this is a newline token.
// When the end offset falls exactly on a line start, that offset is the
// end of a newline token. We want the newline token to report the same
// line number for both start and end.
Ok(idx) => (idx - 1, line_starts[idx - 1]),
Err(idx) => (idx - 1, line_starts[idx - 1]),
};
let end_line_column = self.end.saturating_sub(end_line_offset);

(
start_line_num as u32 + 1,
start_line_column,
end_line_num as u32 + 1,
end_line_column,
)
}
}

impl Display for Token {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let kind: &str = self.kind.into();
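To make the two special cases concrete, here is a standalone re-derivation with hand-picked offsets (it mirrors the method above rather than calling into the crate; the example source is hypothetical):

```rust
// Mirrors Token::get_row_col_position, for illustration only.
fn row_col(line_starts: &[u32], start: u32, end: u32) -> (u32, u32, u32, u32) {
    let (srow, soff) = match line_starts.binary_search(&start) {
        Ok(i) => (i, line_starts[i]),
        Err(i) => (i - 1, line_starts[i - 1]),
    };
    if start == end {
        // EOF: zero-width token, same position for start and end.
        return (srow as u32 + 1, start - soff, srow as u32 + 1, start - soff);
    }
    let (erow, eoff) = match line_starts.binary_search(&end) {
        // An end offset exactly on a line start marks a newline token;
        // step back one line so start and end share a line number.
        Ok(i) | Err(i) => (i - 1, line_starts[i - 1]),
    };
    (srow as u32 + 1, start - soff, erow as u32 + 1, end - eoff)
}

fn main() {
    let starts = [0u32, 2]; // line starts for the source "a\nb"
    // NewLine token spanning 1..2 (the '\n'): start and end stay on line 1.
    assert_eq!(row_col(&starts, 1, 2), (1, 1, 1, 2));
    // EOF token at 3..3: reported as a single point on line 2.
    assert_eq!(row_col(&starts, 3, 3), (2, 1, 2, 1));
}
```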
