From aa8f73e2b97b2333591a9f38d61ae05de68ad092 Mon Sep 17 00:00:00 2001
From: Daiyi Peng <daiyip@google.com>
Date: Fri, 13 Oct 2023 10:01:05 -0700
Subject: [PATCH] `lf.coding.PythonCodeParser` to improve on smart detection on
 string quotes.

PiperOrigin-RevId: 573248961
---
 langfun/core/coding/python.py      | 46 ++++++++++++++++++++--
 langfun/core/coding/python_test.py | 62 +++++++++++++++++++++++++++++-
 2 files changed, 103 insertions(+), 5 deletions(-)

diff --git a/langfun/core/coding/python.py b/langfun/core/coding/python.py
index 26f9080..dec35d6 100644
--- a/langfun/core/coding/python.py
+++ b/langfun/core/coding/python.py
@@ -212,7 +212,6 @@ def clean(self, code_text: str) -> str:
           and c in ('\'', '"')
           and i > 0
           and code_text[i - 1] != '\\'):
-
         # Handle ''' and """.
         if code_text[i: i + 3] == c * 3:
           c = c * 3
@@ -223,11 +222,32 @@ def clean(self, code_text: str) -> str:
         elif quote_char == c:
           # NOTE(daiyip): at times, LM forgets to escape quotes inside a string.
           # Thus we do some smart checking here to automatically correct such
-          # case.
-          if i < len(code_text) - 1 and code_text[i + 1] not in '.,]}) \t\n+*':
-            c = f'\\{c}'
+          # case. This logic here is pretty involved in handling special cases.
+          # We might want to revisit them later.
+
+          # Peek forward to see if it could be a valid string.
+          nt, nnt_start = _next_token(code_text, i + 1)
+          if nt in (',', '[', ']', '}', ')', '+', '*', '%', '\n'):
+            end_quote = True
+          elif nt == ' ':
+            # Detect if . could be a method invocation.
+            # NOTE(daiyip): 'in' and 'not in' might have false positives. But
+            # given the chance is low, we do not complicate the reasoning logic
+            # for now.
+            nnt, _ = _next_token(code_text, nnt_start, skip_whitespace=True)
+            end_quote = nnt in ('+', '*', '%', '#', '[', 'in', 'not')
+          elif nt == '.':
+            # Detect if . could be method invocation on string.
+            nnt, nnnt_start = _next_token(code_text, nnt_start)
+            nnnt, _ = _next_token(code_text, nnnt_start)
+            end_quote = nnt.isidentifier() and nnnt == '('
           else:
+            end_quote = False
+
+          if end_quote:
             quote_char = None
+          else:
+            c = f'\\{c}'
       # Detect comment.
       elif c == '#' and quote_char is None:
         in_comment = True
@@ -259,6 +279,24 @@ def clean(self, code_text: str) -> str:
     return inspect.cleandoc(code).strip()
 
 
+def _next_token(
+    text: str,
+    start: int = 0,
+    skip_whitespace: bool = False
+    ) -> tuple[str, int]:
+  """Find the next token in a string with a start position."""
+  token_start = start
+  if skip_whitespace:
+    while token_start < len(text) and text[token_start] in ' \t':
+      token_start += 1
+  token_end = token_start + 1
+  if text[token_start].isalpha():
+    while (token_end < len(text)
+           and text[token_end].isalpha() or text[token_end] in '_'):
+      token_end += 1
+  return text[token_start:token_end], token_end
+
+
 # Key in the returned dict that represents the final result.
 _FINAL_RESULT_KEY = '__result__'
 
diff --git a/langfun/core/coding/python_test.py b/langfun/core/coding/python_test.py
index 4e0dd4f..c1d12ac 100644
--- a/langfun/core/coding/python_test.py
+++ b/langfun/core/coding/python_test.py
@@ -227,7 +227,7 @@ class A:
         """
     )
 
-  def test_clean_with_auto_correction(self):
+  def test_clean_with_auto_escape(self):
     self.assert_clean(
         """
         ```python
@@ -238,6 +238,66 @@ def test_clean_with_auto_correction(self):
         x = 'John\\'s home'
         """
     )
+    self.assert_clean(
+        """
+        ```python
+        x = 'Girls' home'
+        ```
+        """,
+        """
+        x = 'Girls\\' home'
+        """
+    )
+    self.assert_clean(
+        """
+        ```python
+        x = 'These are the girls'.'
+        ```
+        """,
+        """
+        x = 'These are the girls\\'.'
+        """
+    )
+    self.assert_clean(
+        """
+        ```python
+        x = 'girls'.split('')
+        ```
+        """,
+        """
+        x = 'girls'.split('')
+        """
+    )
+    self.assert_clean(
+        """
+        ```python
+        x = 'girls' + 'boys'
+        ```
+        """,
+        """
+        x = 'girls' + 'boys'
+        """
+    )
+    self.assert_clean(
+        """
+        ```python
+        x = 'girls' in ['girls', 'boys']
+        ```
+        """,
+        """
+        x = 'girls' in ['girls', 'boys']
+        """
+    )
+    self.assert_clean(
+        """
+        ```python
+        x = 'girls' not in ['girls', 'boys']
+        ```
+        """,
+        """
+        x = 'girls' not in ['girls', 'boys']
+        """
+    )
     self.assert_clean(
         """
         ```python