Skip to content

Commit

Permalink
lf.coding.PythonCodeParser to improve on smart detection on string …
Browse files Browse the repository at this point in the history
…quotes.

PiperOrigin-RevId: 573248961
  • Loading branch information
daiyip authored and langfun authors committed Oct 13, 2023
1 parent 9a0a141 commit aa8f73e
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 5 deletions.
46 changes: 42 additions & 4 deletions langfun/core/coding/python.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,6 @@ def clean(self, code_text: str) -> str:
and c in ('\'', '"')
and i > 0
and code_text[i - 1] != '\\'):

# Handle ''' and """.
if code_text[i: i + 3] == c * 3:
c = c * 3
Expand All @@ -223,11 +222,32 @@ def clean(self, code_text: str) -> str:
elif quote_char == c:
# NOTE(daiyip): at times, LM forgets to escape quotes inside a string.
# Thus we do some smart checking here to automatically correct such
# case.
if i < len(code_text) - 1 and code_text[i + 1] not in '.,]}) \t\n+*':
c = f'\\{c}'
# case. This logic here is pretty involved in handling special cases.
# We might want to revisit them later.

# Peek forward to see if it could be a valid string.
nt, nnt_start = _next_token(code_text, i + 1)
if nt in (',', '[', ']', '}', ')', '+', '*', '%', '\n'):
end_quote = True
elif nt == ' ':
# Detect if the next non-whitespace token can follow a closed string.
# NOTE(daiyip): 'in' and 'not in' might have false positives. But
# given the chance is low, we do not complicate the reasoning logic
# for now.
nnt, _ = _next_token(code_text, nnt_start, skip_whitespace=True)
end_quote = nnt in ('+', '*', '%', '#', '[', 'in', 'not')
elif nt == '.':
# Detect if . could be method invocation on string.
nnt, nnnt_start = _next_token(code_text, nnt_start)
nnnt, _ = _next_token(code_text, nnnt_start)
end_quote = nnt.isidentifier() and nnnt == '('
else:
end_quote = False

if end_quote:
quote_char = None
else:
c = f'\\{c}'
# Detect comment.
elif c == '#' and quote_char is None:
in_comment = True
Expand Down Expand Up @@ -259,6 +279,24 @@ def clean(self, code_text: str) -> str:
return inspect.cleandoc(code).strip()


def _next_token(
text: str,
start: int = 0,
skip_whitespace: bool = False
) -> tuple[str, int]:
"""Find the next token in a string with a start position."""
token_start = start
if skip_whitespace:
while token_start < len(text) and text[token_start] in ' \t':
token_start += 1
token_end = token_start + 1
if text[token_start].isalpha():
while (token_end < len(text)
and text[token_end].isalpha() or text[token_end] in '_'):
token_end += 1
return text[token_start:token_end], token_end


# Key in the returned dict that represents the final result.
_FINAL_RESULT_KEY = '__result__'

Expand Down
62 changes: 61 additions & 1 deletion langfun/core/coding/python_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ class A:
"""
)

def test_clean_with_auto_correction(self):
def test_clean_with_auto_escape(self):
self.assert_clean(
"""
```python
Expand All @@ -238,6 +238,66 @@ def test_clean_with_auto_correction(self):
x = 'John\\'s home'
"""
)
self.assert_clean(
"""
```python
x = 'Girls' home'
```
""",
"""
x = 'Girls\\' home'
"""
)
self.assert_clean(
"""
```python
x = 'These are the girls'.'
```
""",
"""
x = 'These are the girls\\'.'
"""
)
self.assert_clean(
"""
```python
x = 'girls'.split('')
```
""",
"""
x = 'girls'.split('')
"""
)
self.assert_clean(
"""
```python
x = 'girls' + 'boys'
```
""",
"""
x = 'girls' + 'boys'
"""
)
self.assert_clean(
"""
```python
x = 'girls' in ['girls', 'boys']
```
""",
"""
x = 'girls' in ['girls', 'boys']
"""
)
self.assert_clean(
"""
```python
x = 'girls' not in ['girls', 'boys']
```
""",
"""
x = 'girls' not in ['girls', 'boys']
"""
)
self.assert_clean(
"""
```python
Expand Down

0 comments on commit aa8f73e

Please sign in to comment.