forked from JabRef/jabref
-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add new integrity and clean up for non NFC (JabRef#11056)
* JabRef#10506 Added new integrity and clean up for non NFC * JabRef#10506 Add key in JabRef_en.properties * JabRef#10506 Fixed checkstyle and markedown error * JabRef#10506 fixed CHANGELOG.md error * Update CHANGELOG.md * Removed whitespaces unnecessary whitespace changes * Removed unnecessary whitespace change * Update PersonNamesChecker.java * Update PersonNamesChecker.java * JabRef#10506 Remove unnecessary comment line in FieldCheckers * Fix issues * Fix CHANGELOG.md --------- Co-authored-by: Harshit.Gupta7 <[email protected]> Co-authored-by: Harshit Gupta <[email protected]> Co-authored-by: Carl Christian Snethlage <[email protected]>
- Loading branch information
1 parent
f8f5f4e
commit 13b015a
Showing
8 changed files
with
135 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
39 changes: 39 additions & 0 deletions
39
src/main/java/org/jabref/logic/formatter/bibtexfields/NormalizeUnicodeFormatter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
package org.jabref.logic.formatter.bibtexfields; | ||
|
||
import java.text.Normalizer; | ||
|
||
import org.jabref.logic.cleanup.Formatter; | ||
|
||
/** | ||
* Clean up field values by formatting Unicode values by using the <a href="https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms">Normal form "Normalization Form Canonical Composition" (NFC)</a>: Characters are decomposed and then recomposed by canonical equivalence. | ||
* | ||
* The {@link org.jabref.logic.integrity.UnicodeNormalFormCanonicalCompositionCheck} is for checking the presence of other Unicode representations. | ||
*/ | ||
public class NormalizeUnicodeFormatter extends Formatter { | ||
|
||
@Override | ||
public String getName() { | ||
return "Normalize Unicode"; | ||
} | ||
|
||
@Override | ||
public String getKey() { | ||
return "NORMALIZE_UNICODE"; | ||
} | ||
|
||
@Override | ||
public String getDescription() { | ||
return "Normalize Unicode characters in BibTeX fields."; | ||
} | ||
|
||
@Override | ||
public String getExampleInput() { | ||
return "H\u00E9ll\u00F4 W\u00F6rld"; | ||
} | ||
|
||
@Override | ||
public String format(String value) { | ||
String normalizedValue = Normalizer.normalize(value, Normalizer.Form.NFC); | ||
return normalizedValue; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
26 changes: 26 additions & 0 deletions
26
src/main/java/org/jabref/logic/integrity/UnicodeNormalFormCanonicalCompositionCheck.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
package org.jabref.logic.integrity; | ||
|
||
import java.text.Normalizer; | ||
import java.util.List; | ||
|
||
import org.jabref.logic.l10n.Localization; | ||
import org.jabref.model.entry.BibEntry; | ||
|
||
/** | ||
* Detect any Unicode characters that is not in NFC format. NFC: <a href="https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms">Normal form "Normalization Form Canonical Composition" (NFC)</a>: Characters are decomposed and then recomposed by canonical equivalence. | ||
* | ||
* Normalizer: {@link org.jabref.logic.formatter.bibtexfields.NormalizeUnicodeFormatter} | ||
*/ | ||
public class UnicodeNormalFormCanonicalCompositionCheck implements EntryChecker { | ||
|
||
@Override | ||
public List<IntegrityMessage> check(BibEntry entry) { | ||
return entry.getFieldMap() | ||
.entrySet() | ||
.stream() | ||
.filter(field -> !Normalizer.isNormalized(field.getValue(), Normalizer.Form.NFC)) | ||
.map(field -> new IntegrityMessage(Localization.lang("Value is not in Unicode's Normalization Form \"Canonical Composition\" (NFC) format"), entry, | ||
field.getKey())) | ||
.toList(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
20 changes: 20 additions & 0 deletions
20
src/test/java/org/jabref/logic/formatter/bibtexfields/NormalizeUnicodeFormatterTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
package org.jabref.logic.formatter.bibtexfields; | ||
|
||
import org.junit.jupiter.params.ParameterizedTest; | ||
import org.junit.jupiter.params.provider.CsvSource; | ||
|
||
import static org.junit.jupiter.api.Assertions.assertEquals; | ||
|
||
class NormalizeUnicodeFormatterTest { | ||
|
||
private NormalizeUnicodeFormatter formatter = new NormalizeUnicodeFormatter(); | ||
|
||
@ParameterizedTest | ||
@CsvSource({ | ||
"John, John", | ||
"\u00C5, \u0041\u030A" | ||
}) | ||
void format(String expected, String input) { | ||
assertEquals(expected, formatter.format(input)); | ||
} | ||
} |
44 changes: 44 additions & 0 deletions
44
src/test/java/org/jabref/logic/integrity/UnicodeNormalFormCanonicalCompositionCheckTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
package org.jabref.logic.integrity; | ||
|
||
import java.util.List; | ||
|
||
import org.jabref.model.entry.BibEntry; | ||
import org.jabref.model.entry.field.StandardField; | ||
|
||
import org.junit.jupiter.api.Test; | ||
|
||
import static org.junit.jupiter.api.Assertions.assertEquals; | ||
|
||
public class UnicodeNormalFormCanonicalCompositionCheckTest { | ||
UnicodeNormalFormCanonicalCompositionCheck checker = new UnicodeNormalFormCanonicalCompositionCheck(); | ||
|
||
@Test | ||
void asciiStringShouldReturnEmptyList() { | ||
BibEntry entry = new BibEntry() | ||
.withField(StandardField.TITLE, "Some Title") | ||
.withField(StandardField.AUTHOR, "John Doe"); | ||
assertEquals(List.of(), checker.check(entry)); | ||
} | ||
|
||
@Test | ||
void normalizedStringShouldReturnEmptyList() { | ||
BibEntry entry = new BibEntry() | ||
.withField(StandardField.TITLE, "Caf́é") | ||
.withField(StandardField.AUTHOR, "John Doe"); | ||
assertEquals(List.of(), checker.check(entry)); | ||
} | ||
|
||
@Test | ||
void nonNormalizedLetterAWithAcuteShouldReturnIntegrityMessage() { | ||
BibEntry entry = new BibEntry() | ||
.withField(StandardField.TITLE, "\u0041\u0301"); | ||
assertEquals(List.of(new IntegrityMessage("Value is not in Unicode's Normalization Form \"Canonical Composition\" (NFC) format", entry, StandardField.TITLE)), checker.check(entry)); | ||
} | ||
|
||
@Test | ||
void checkWithNormalizedLetterAWithAcuteShouldReturnIntegrityMessage() { | ||
BibEntry entry = new BibEntry() | ||
.withField(StandardField.TITLE, "\u00C1"); | ||
assertEquals(List.of(), checker.check(entry)); | ||
} | ||
} |