Skip to content

Commit

Permalink
Add new integrity and clean up for non NFC (JabRef#11056)
Browse files Browse the repository at this point in the history
* JabRef#10506 Added new integrity and clean up for non NFC

* JabRef#10506 Add key in JabRef_en.properties

* JabRef#10506 Fixed checkstyle and markdown error

* JabRef#10506 fixed CHANGELOG.md error

* Update CHANGELOG.md

* Removed unnecessary whitespace changes

* Removed unnecessary whitespace change

* Update PersonNamesChecker.java

* Update PersonNamesChecker.java

* JabRef#10506 Remove unnecessary comment line in FieldCheckers

* Fix issues

* Fix CHANGELOG.md

---------

Co-authored-by: Harshit.Gupta7 <[email protected]>
Co-authored-by: Harshit Gupta <[email protected]>
Co-authored-by: Carl Christian Snethlage <[email protected]>
  • Loading branch information
4 people authored Mar 19, 2024
1 parent f8f5f4e commit 13b015a
Show file tree
Hide file tree
Showing 8 changed files with 135 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv
- We added the possibility to show the BibTeX source in the [web search](https://docs.jabref.org/collect/import-using-online-bibliographic-database) import screen. [#560](https://github.com/koppor/jabref/issues/560)
- We added a fetcher for [ISIDORE](https://isidore.science/), simply paste in the link into the text field or the last 6 digits in the link that identify that paper. [#10423](https://github.com/JabRef/jabref/issues/10423)
- When importing entries from the "Citation relations" tab, the field [cites](https://docs.jabref.org/advanced/entryeditor/entrylinks) is now filled according to the relationship between the entries. [#10572](https://github.com/JabRef/jabref/pull/10752)
- We added a new integrity check and clean up option for strings having Unicode characters not encoded in [Unicode "Normalization Form Canonical Composition" (NFC)](https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms). [#10506](https://github.com/JabRef/jabref/issues/10506)
- We added a new group icon column to the main table showing the icons of the entry's groups. [#10801](https://github.com/JabRef/jabref/pull/10801)
- When deleting an entry, the files linked to the entry are now optionally deleted as well. [#10509](https://github.com/JabRef/jabref/issues/10509)
- We added support to move the file to the system trash (instead of deleting it). [#10591](https://github.com/JabRef/jabref/pull/10591)
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/org/jabref/logic/formatter/Formatters.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import org.jabref.logic.formatter.bibtexfields.NormalizeMonthFormatter;
import org.jabref.logic.formatter.bibtexfields.NormalizeNamesFormatter;
import org.jabref.logic.formatter.bibtexfields.NormalizePagesFormatter;
import org.jabref.logic.formatter.bibtexfields.NormalizeUnicodeFormatter;
import org.jabref.logic.formatter.bibtexfields.OrdinalsToSuperscriptFormatter;
import org.jabref.logic.formatter.bibtexfields.RegexFormatter;
import org.jabref.logic.formatter.bibtexfields.RemoveBracesFormatter;
Expand Down Expand Up @@ -87,6 +88,7 @@ public static List<Formatter> getOthers() {
new EscapeAmpersandsFormatter(),
new EscapeDollarSignFormatter(),
new ShortenDOIFormatter(),
new NormalizeUnicodeFormatter(),
new ReplaceUnicodeLigaturesFormatter(),
new UnprotectTermsFormatter()
);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package org.jabref.logic.formatter.bibtexfields;

import java.text.Normalizer;

import org.jabref.logic.cleanup.Formatter;

/**
 * Cleans up field values by converting them to the Unicode
 * <a href="https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms">"Normalization Form Canonical Composition" (NFC)</a>:
 * characters are decomposed and then recomposed by canonical equivalence.
 * <p>
 * The {@link org.jabref.logic.integrity.UnicodeNormalFormCanonicalCompositionCheck} is for checking the presence of other Unicode representations.
 */
public class NormalizeUnicodeFormatter extends Formatter {

    @Override
    public String getName() {
        return "Normalize Unicode";
    }

    @Override
    public String getKey() {
        return "NORMALIZE_UNICODE";
    }

    @Override
    public String getDescription() {
        return "Normalize Unicode characters in BibTeX fields.";
    }

    /**
     * Returns a sample value that is deliberately <em>not</em> in NFC: each accented
     * letter is spelled as a base letter followed by a combining mark
     * (U+0301, U+0302, U+0308), so that {@link #format(String)} visibly changes it.
     *
     * @return a decomposed spelling of "Hello World" with accents
     */
    @Override
    public String getExampleInput() {
        // A precomposed sample would already be NFC and pass through format() unchanged.
        return "He\u0301llo\u0302 Wo\u0308rld";
    }

    /**
     * Converts the given value to NFC.
     *
     * @param value the text to normalize; must not be {@code null}
     * @return the NFC form of {@code value}
     * @throws NullPointerException if {@code value} is {@code null} (raised by {@link Normalizer#normalize})
     */
    @Override
    public String format(String value) {
        return Normalizer.normalize(value, Normalizer.Form.NFC);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ public IntegrityCheck(BibDatabaseContext bibDatabaseContext,
entryCheckers.addAll(List.of(
new ASCIICharacterChecker(),
new NoBibtexFieldChecker(),
new UnicodeNormalFormCanonicalCompositionCheck(),
new BibTeXEntryTypeChecker())
);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package org.jabref.logic.integrity;

import java.text.Normalizer;
import java.util.List;

import org.jabref.logic.l10n.Localization;
import org.jabref.model.entry.BibEntry;

/**
 * Integrity check that reports every field whose value is not in Unicode
 * <a href="https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms">"Normalization Form Canonical Composition" (NFC)</a>,
 * i.e. the form obtained by decomposing characters and recomposing them by canonical equivalence.
 * <p>
 * The matching clean-up is {@link org.jabref.logic.formatter.bibtexfields.NormalizeUnicodeFormatter}.
 */
public class UnicodeNormalFormCanonicalCompositionCheck implements EntryChecker {

    /**
     * Produces one message for each field of the entry whose value is not NFC-normalized.
     *
     * @param entry the entry whose field map is inspected
     * @return an unmodifiable list of messages; empty when every field value is already in NFC
     */
    @Override
    public List<IntegrityMessage> check(BibEntry entry) {
        return entry.getFieldMap()
                    .entrySet()
                    .stream()
                    .filter(fieldAndValue -> !isNfc(fieldAndValue.getValue()))
                    .map(fieldAndValue -> fieldAndValue.getKey())
                    .map(field -> new IntegrityMessage(
                            Localization.lang("Value is not in Unicode's Normalization Form \"Canonical Composition\" (NFC) format"),
                            entry,
                            field))
                    .toList();
    }

    /** Tells whether {@code value} is already in NFC. */
    private static boolean isNfc(String value) {
        return Normalizer.isNormalized(value, Normalizer.Form.NFC);
    }
}
2 changes: 2 additions & 0 deletions src/main/resources/l10n/JabRef_en.properties
Original file line number Diff line number Diff line change
Expand Up @@ -2632,6 +2632,8 @@ More\ options...=More options...
Treat\ all\ duplicates\ entries\ the\ same\ way=Treat all duplicates entries the same way
Ask\ every\ time=Ask every time
Value\ is\ not\ in\ Unicode's\ Normalization\ Form\ "Canonical\ Composition"\ (NFC)\ format=Value is not in Unicode's Normalization Form "Canonical Composition" (NFC) format
Group\ icons=Group icons
Source\ URL=Source URL
Redownload\ file=Redownload file
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package org.jabref.logic.formatter.bibtexfields;

import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Tests for {@link NormalizeUnicodeFormatter}.
 */
class NormalizeUnicodeFormatterTest {

    private NormalizeUnicodeFormatter formatter = new NormalizeUnicodeFormatter();

    /**
     * Each CSV row is "expected, input": the formatter applied to the second
     * column must yield the first column.
     */
    @ParameterizedTest
    @CsvSource({
            // plain ASCII is returned unchanged
            "John, John",
            // base letter A (U+0041) plus combining ring (U+030A) composes to U+00C5
            "\u00C5, \u0041\u030A"
    })
    void format(String normalized, String raw) {
        String actual = formatter.format(raw);
        assertEquals(normalized, actual);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package org.jabref.logic.integrity;

import java.util.List;

import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Tests for {@link UnicodeNormalFormCanonicalCompositionCheck}: values already in NFC
 * must produce no messages, values in a decomposed form must be reported.
 */
public class UnicodeNormalFormCanonicalCompositionCheckTest {
    private final UnicodeNormalFormCanonicalCompositionCheck checker = new UnicodeNormalFormCanonicalCompositionCheck();

    @Test
    void asciiStringShouldReturnEmptyList() {
        BibEntry entry = new BibEntry()
                .withField(StandardField.TITLE, "Some Title")
                .withField(StandardField.AUTHOR, "John Doe");
        assertEquals(List.of(), checker.check(entry));
    }

    @Test
    void normalizedStringShouldReturnEmptyList() {
        // The accented letter is written as an escape (precomposed e-acute, U+00E9) so that
        // editors or tools cannot silently change its normalization form.
        BibEntry entry = new BibEntry()
                .withField(StandardField.TITLE, "Caf\u00E9")
                .withField(StandardField.AUTHOR, "John Doe");
        assertEquals(List.of(), checker.check(entry));
    }

    @Test
    void nonNormalizedLetterAWithAcuteShouldReturnIntegrityMessage() {
        // Decomposed form: base letter A (U+0041) followed by combining acute (U+0301).
        BibEntry entry = new BibEntry()
                .withField(StandardField.TITLE, "\u0041\u0301");
        assertEquals(List.of(new IntegrityMessage("Value is not in Unicode's Normalization Form \"Canonical Composition\" (NFC) format", entry, StandardField.TITLE)), checker.check(entry));
    }

    @Test
    void normalizedLetterAWithAcuteShouldReturnEmptyList() {
        // Precomposed form (U+00C1) is already NFC, so nothing is reported.
        BibEntry entry = new BibEntry()
                .withField(StandardField.TITLE, "\u00C1");
        assertEquals(List.of(), checker.check(entry));
    }
}

0 comments on commit 13b015a

Please sign in to comment.