Add new integrity and clean up for non NFC (JabRef#11056)

* JabRef#10506 Added new integrity and clean up for non NFC * JabRef#10506 Add key in JabRef_en.properties * JabRef#10506 Fixed checkstyle and markedown error * JabRef#10506 fixed CHANGELOG.md error * Update CHANGELOG.md * Removed whitespaces unnecessary whitespace changes * Removed unnecessary whitespace change * Update PersonNamesChecker.java * Update PersonNamesChecker.java * JabRef#10506 Remove unnecessary comment line in FieldCheckers * Fix issues * Fix CHANGELOG.md --------- Co-authored-by: Harshit.Gupta7 <[email protected]> Co-authored-by: Harshit Gupta <[email protected]> Co-authored-by: Carl Christian Snethlage <[email protected]>
JabRef · Mar 19, 2024 · 13b015a · 13b015a
1 parent f8f5f4e
commit 13b015a
Show file tree

Hide file tree

Showing 8 changed files with 135 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,6 +16,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv
 - We added the possibility to show the BibTeX source in the [web search](https://docs.jabref.org/collect/import-using-online-bibliographic-database) import screen. [#560](https://github.com/koppor/jabref/issues/560)
 - We added a fetcher for [ISIDORE](https://isidore.science/), simply paste in the link into the text field or the last 6 digits in the link that identify that paper. [#10423](https://github.com/JabRef/jabref/issues/10423)
 - When importing entries form the "Citation relations" tab, the field [cites](https://docs.jabref.org/advanced/entryeditor/entrylinks) is now filled according to the relationship between the entries. [#10572](https://github.com/JabRef/jabref/pull/10752)
+- We added a new integrity check and clean up option for strings having Unicode characters not encoded in [Unicode "Normalization Form Canonical Composition" (NFC)](https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms). [#10506](https://github.com/JabRef/jabref/issues/10506)
 - We added a new group icon column to the main table showing the icons of the entry's groups. [#10801](https://github.com/JabRef/jabref/pull/10801)
 - When deleting an entry, the files linked to the entry are now optionally deleted as well. [#10509](https://github.com/JabRef/jabref/issues/10509)
 - We added support to move the file to the system trash (instead of deleting it). [#10591](https://github.com/JabRef/jabref/pull/10591)

diff --git a/src/main/java/org/jabref/logic/formatter/Formatters.java b/src/main/java/org/jabref/logic/formatter/Formatters.java
@@ -22,6 +22,7 @@
 import org.jabref.logic.formatter.bibtexfields.NormalizeMonthFormatter;
 import org.jabref.logic.formatter.bibtexfields.NormalizeNamesFormatter;
 import org.jabref.logic.formatter.bibtexfields.NormalizePagesFormatter;
+import org.jabref.logic.formatter.bibtexfields.NormalizeUnicodeFormatter;
 import org.jabref.logic.formatter.bibtexfields.OrdinalsToSuperscriptFormatter;
 import org.jabref.logic.formatter.bibtexfields.RegexFormatter;
 import org.jabref.logic.formatter.bibtexfields.RemoveBracesFormatter;
@@ -87,6 +88,7 @@ public static List<Formatter> getOthers() {
                 new EscapeAmpersandsFormatter(),
                 new EscapeDollarSignFormatter(),
                 new ShortenDOIFormatter(),
+                new NormalizeUnicodeFormatter(),
                 new ReplaceUnicodeLigaturesFormatter(),
                 new UnprotectTermsFormatter()
         );

diff --git a/src/main/java/org/jabref/logic/formatter/bibtexfields/NormalizeUnicodeFormatter.java b/src/main/java/org/jabref/logic/formatter/bibtexfields/NormalizeUnicodeFormatter.java
@@ -0,0 +1,39 @@
+package org.jabref.logic.formatter.bibtexfields;
+
+import java.text.Normalizer;
+
+import org.jabref.logic.cleanup.Formatter;
+
+/**
+ * Formatter that converts a field value to Unicode <a href="https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms">"Normalization Form Canonical Composition" (NFC)</a>: characters are decomposed and then recomposed by canonical equivalence.
+ *
+ * Counterpart check: {@link org.jabref.logic.integrity.UnicodeNormalFormCanonicalCompositionCheck} reports field values that are not in NFC.
+ */
+public class NormalizeUnicodeFormatter extends Formatter {
+
+    @Override
+    public String getName() {
+        return "Normalize Unicode";
+    }
+
+    @Override
+    public String getKey() {
+        return "NORMALIZE_UNICODE";
+    }
+
+    @Override
+    public String getDescription() {
+        return "Normalize Unicode characters in BibTeX fields.";
+    }
+
+    @Override
+    public String getExampleInput() {
+        return "H\u00E9ll\u00F4 W\u00F6rld"; // NOTE(review): all three accented letters are precomposed code points, so this sample is already in NFC
+    }
+
+    @Override
+    public String format(String value) {
+        // Delegates to the JDK normalizer; the result is returned without further processing.
+        return Normalizer.normalize(value, Normalizer.Form.NFC);
+    }
+}
diff --git a/src/main/java/org/jabref/logic/integrity/IntegrityCheck.java b/src/main/java/org/jabref/logic/integrity/IntegrityCheck.java
@@ -52,6 +52,7 @@ public IntegrityCheck(BibDatabaseContext bibDatabaseContext,
             entryCheckers.addAll(List.of(
                     new ASCIICharacterChecker(),
                     new NoBibtexFieldChecker(),
+                    new UnicodeNormalFormCanonicalCompositionCheck(),
                     new BibTeXEntryTypeChecker())
             );
         }

diff --git a/src/main/java/org/jabref/logic/integrity/UnicodeNormalFormCanonicalCompositionCheck.java b/src/main/java/org/jabref/logic/integrity/UnicodeNormalFormCanonicalCompositionCheck.java
@@ -0,0 +1,26 @@
+package org.jabref.logic.integrity;
+
+import java.text.Normalizer;
+import java.util.List;
+
+import org.jabref.logic.l10n.Localization;
+import org.jabref.model.entry.BibEntry;
+
+/**
+ * Detects field values that are not in NFC. NFC: <a href="https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms">Normal form "Normalization Form Canonical Composition" (NFC)</a>: Characters are decomposed and then recomposed by canonical equivalence.
+ *
+ * Normalizer: {@link org.jabref.logic.formatter.bibtexfields.NormalizeUnicodeFormatter}
+ */
+public class UnicodeNormalFormCanonicalCompositionCheck implements EntryChecker {
+
+    @Override
+    public List<IntegrityMessage> check(BibEntry entry) {
+        // One message per field whose value is not NFC-normalized; fields already in NFC are skipped.
+        return entry.getFieldMap()
+                    .entrySet().stream()
+                    .filter(fieldToValue -> !Normalizer.isNormalized(fieldToValue.getValue(), Normalizer.Form.NFC))
+                    .map(fieldToValue -> fieldToValue.getKey())
+                    .map(field -> new IntegrityMessage(Localization.lang("Value is not in Unicode's Normalization Form \"Canonical Composition\" (NFC) format"), entry, field))
+                    .toList();
+    }
+}
diff --git a/src/main/resources/l10n/JabRef_en.properties b/src/main/resources/l10n/JabRef_en.properties
@@ -2632,6 +2632,8 @@ More\ options...=More options...
 Treat\ all\ duplicates\ entries\ the\ same\ way=Treat all duplicates entries the same way
 Ask\ every\ time=Ask every time
 
+Value\ is\ not\ in\ Unicode's\ Normalization\ Form\ "Canonical\ Composition"\ (NFC)\ format=Value is not in Unicode's Normalization Form "Canonical Composition" (NFC) format
+
 Group\ icons=Group icons
 Source\ URL=Source URL
 Redownload\ file=Redownload file

diff --git a/src/test/java/org/jabref/logic/formatter/bibtexfields/NormalizeUnicodeFormatterTest.java b/src/test/java/org/jabref/logic/formatter/bibtexfields/NormalizeUnicodeFormatterTest.java
@@ -0,0 +1,20 @@
+package org.jabref.logic.formatter.bibtexfields;
+
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.CsvSource;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+class NormalizeUnicodeFormatterTest {
+
+    private final NormalizeUnicodeFormatter formatter = new NormalizeUnicodeFormatter();
+
+    @ParameterizedTest
+    @CsvSource({
+            "John, John", // columns are: expected, input; plain ASCII passes through unchanged
+            "\u00C5, \u0041\u030A" // "A" + combining ring above (U+030A) composes to U+00C5
+    })
+    void format(String expected, String input) {
+        assertEquals(expected, formatter.format(input));
+    }
+}
diff --git a/src/test/java/org/jabref/logic/integrity/UnicodeNormalFormCanonicalCompositionCheckTest.java b/src/test/java/org/jabref/logic/integrity/UnicodeNormalFormCanonicalCompositionCheckTest.java
@@ -0,0 +1,44 @@
+package org.jabref.logic.integrity;
+
+import java.util.List;
+
+import org.jabref.model.entry.BibEntry;
+import org.jabref.model.entry.field.StandardField;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+/** Exercises {@link UnicodeNormalFormCanonicalCompositionCheck} with ASCII, NFC-composed and decomposed field values. */
+public class UnicodeNormalFormCanonicalCompositionCheckTest {
+    private final UnicodeNormalFormCanonicalCompositionCheck checker = new UnicodeNormalFormCanonicalCompositionCheck();
+
+    @Test
+    void asciiStringShouldReturnEmptyList() {
+        BibEntry entry = new BibEntry()
+                .withField(StandardField.TITLE, "Some Title")
+                .withField(StandardField.AUTHOR, "John Doe");
+        assertEquals(List.of(), checker.check(entry));
+    }
+
+    @Test
+    void normalizedStringShouldReturnEmptyList() {
+        BibEntry entry = new BibEntry()
+                .withField(StandardField.TITLE, "Caf\u00E9") // precomposed e-acute, spelled as an escape so no invisible combining mark can hide in the literal
+                .withField(StandardField.AUTHOR, "John Doe");
+        assertEquals(List.of(), checker.check(entry));
+    }
+
+    @Test
+    void nonNormalizedLetterAWithAcuteShouldReturnIntegrityMessage() {
+        BibEntry entry = new BibEntry().withField(StandardField.TITLE, "\u0041\u0301"); // "A" + combining acute (U+0301): decomposed, not NFC
+        assertEquals(List.of(new IntegrityMessage("Value is not in Unicode's Normalization Form \"Canonical Composition\" (NFC) format", entry, StandardField.TITLE)), checker.check(entry));
+    }
+
+    @Test
+    void normalizedLetterAWithAcuteShouldReturnEmptyList() {
+        BibEntry entry = new BibEntry().withField(StandardField.TITLE, "\u00C1"); // precomposed U+00C1, the NFC form of the pair above
+        assertEquals(List.of(), checker.check(entry));
+    }
+}