Fix wrong conversion of unicode chars (#7419)

* fix wrong character conversion in unicode to latex formatter
* fix checkstyle issue
* fix typo, add test for unicode to latex formatter
* add issue link to changelog
JabRef · Feb 3, 2021 · 242a494 · 242a494
1 parent bdfd8c8
commit 242a494
Show file tree

Hide file tree

Showing 4 changed files with 13 additions and 2 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -46,6 +46,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
 - We fixed an issue where the file path is invisible in dark theme. [#7382](https://github.com/JabRef/jabref/issues/7382)
 - We fixed an issue where the secondary sorting is not working for some special fields. [#7015](https://github.com/JabRef/jabref/issues/7015)
 - We fixed an issue where changing the font size makes the font size field too small. [#7085](https://github.com/JabRef/jabref/issues/7085)
+- We fixed an issue where the Unicode to LaTeX formatter produced wrong results for characters with a codepoint higher than `Character.MAX_VALUE`. [#7387](https://github.com/JabRef/jabref/issues/7387)
 
 ### Removed
 

diff --git a/src/main/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatter.java b/src/main/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatter.java
@@ -38,7 +38,11 @@ public String format(String text) {
                 Integer cpNext = result.codePointAt(i + 1);
                 String code = HTMLUnicodeConversionMaps.ESCAPED_ACCENTS.get(cpNext);
                 if (code == null) {
-                    sb.append((char) cpCurrent);
+                    // skip next index to avoid reading surrogate as a separate char
+                    if (!Character.isBmpCodePoint(cpCurrent)) {
+                        i++;
+                    }
+                    sb.appendCodePoint(cpCurrent);
                 } else {
                     sb.append("{\\").append(code).append('{').append((char) cpCurrent).append("}}");
                     consumed = true;

diff --git a/src/main/java/org/jabref/logic/util/strings/HTMLUnicodeConversionMaps.java b/src/main/java/org/jabref/logic/util/strings/HTMLUnicodeConversionMaps.java
@@ -760,7 +760,8 @@ public class HTMLUnicodeConversionMaps {
             {"64259", "", "ffi"}, // ffi ligature (which LaTeX solves by itself)
             {"64260", "", "ffl"}, // ffl ligature (which LaTeX solves by itself)
             {"119978", "Oscr", "$\\mathcal{O}$"}, // script capital O -- possibly use \mathscr
-            {"119984", "Uscr", "$\\mathcal{U}$"} // script capital U -- possibly use \mathscr
+            {"119984", "Uscr", "$\\mathcal{U}$"}, // script capital U -- possibly use \mathscr
+            {"120598", "", "$\\epsilon$"}, // mathematical italic epsilon U+1D716 -- requires amsmath
 
     };
 

diff --git a/src/test/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatterTest.java b/src/test/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatterTest.java
@@ -24,6 +24,11 @@ void formatMultipleUnicodeCharacters() {
         assertEquals("{{\\aa}}{\\\"{a}}{\\\"{o}}", formatter.format("\u00E5\u00E4\u00F6"));
     }
 
+    @Test
+    void formatHighCodepointUnicodeCharacter() {
+        assertEquals("$\\epsilon$", formatter.format("\uD835\uDF16"));
+    }
+
     @Test
     void formatExample() {
         assertEquals("M{\\\"{o}}nch", formatter.format(formatter.getExampleInput()));