Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix/avoid empty sentences #1235

Merged
merged 3 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 18 additions & 17 deletions grobid-core/src/main/java/org/grobid/core/data/Figure.java
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ public List<BoundingBox> getCoordinates() {

List<BoundingBox> theBoxes = null;
// non graphic elements
if (getLayoutTokens() != null && getLayoutTokens().size() > 0) {
if (CollectionUtils.isNotEmpty(getLayoutTokens())) {
//theBoxes = BoundingBoxCalculator.calculate(getLayoutTokens());
BoundingBox oneBox = BoundingBoxCalculator.calculateOneBox(layoutTokens, true);
List<BoundingBox> result = new ArrayList<BoundingBox>();
Expand All @@ -291,7 +291,7 @@ public List<BoundingBox> getCoordinates() {
// here we bound all figure graphics in one single box (given that we can have a hundred graphics
// in a single figure)
BoundingBox theGraphicsBox = null;
if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
if (CollectionUtils.isNotEmpty(graphicObjects)) {
for (GraphicObject graphicObject : graphicObjects) {
if (theGraphicsBox == null) {
theGraphicsBox = graphicObject.getBoundingBox();
Expand All @@ -307,8 +307,8 @@ public List<BoundingBox> getCoordinates() {
theBoxes.add(theGraphicsBox);
}

List<BoundingBox> result = new ArrayList<BoundingBox>();
if (theBoxes != null && theBoxes.size() > 0) {
List<BoundingBox> result = new ArrayList<>();
if (CollectionUtils.isNotEmpty(theBoxes)) {
BoundingBox oneBox = BoundingBoxCalculator.calculateOneBox(layoutTokens, true);
List<BoundingBox> mergedBox = VectorGraphicBoxCalculator.mergeBoxes(theBoxes);
result.addAll(mergedBox);
Expand All @@ -329,6 +329,7 @@ public boolean isCompleteForTEI() {

public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List<MarkerType> markerTypes) {
if (!isCompleteForTEI()) {
LOGGER.warn("Found a figure that is badly formatted but it should have been spotted before. We ignore it now.");
return null;
}
Element figureElement = XmlBuilderUtils.teiElement("figure");
Expand All @@ -339,18 +340,18 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
if (config.isGenerateTeiCoordinates("figure")) {
List<BoundingBox> theBoxes = null;
// non graphic elements
if (getLayoutTokens() != null && getLayoutTokens().size() > 0) {
if (CollectionUtils.isNotEmpty(getLayoutTokens())) {
theBoxes = BoundingBoxCalculator.calculate(getLayoutTokens());
}

// if (getBitmapGraphicObjects() != null && !getBitmapGraphicObjects().isEmpty()) {
// -> note: this was restricted to the bitmap objects only... the bounding box calculation
// with vector graphics might need to be double-checked

// here we bound all figure graphics in one single box (given that we can have hundred graphics
// here we bound all figure graphics in one single box (given that we can have a hundred graphics
// in a single figure)
BoundingBox theGraphicsBox = null;
if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
if (CollectionUtils.isNotEmpty(graphicObjects)) {
for (GraphicObject graphicObject : graphicObjects) {
if (theGraphicsBox == null) {
theGraphicsBox = graphicObject.getBoundingBox();
Expand All @@ -366,24 +367,24 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
theBoxes.add(theGraphicsBox);
}

if (theBoxes != null && theBoxes.size() > 0) {
if (CollectionUtils.isNotEmpty(theBoxes)) {
String coords = Joiner.on(";").join(theBoxes);
XmlBuilderUtils.addCoords(figureElement, coords);
}
}
if (header != null) {

if (StringUtils.isNotBlank(header)) {
Element head = XmlBuilderUtils.teiElement("head",
LayoutTokensUtil.normalizeText(header.toString()));
figureElement.appendChild(head);

}
if (label != null) {

if (StringUtils.isNotBlank(label)) {
Element labelEl = XmlBuilderUtils.teiElement("label",
LayoutTokensUtil.normalizeText(label.toString()));
figureElement.appendChild(labelEl);
}
if (caption != null) {

if (StringUtils.isNotBlank(caption)) {
Element desc = XmlBuilderUtils.teiElement("figDesc");
if (config.isGenerateTeiIds()) {
String divID = KeyGen.getKey().substring(0, 7);
Expand All @@ -392,12 +393,12 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form

// if the segment has been parsed with the full text model we further extract the clusters
// to get the bibliographical references
if ( (labeledCaption != null) && (labeledCaption.length() > 0) ) {
if (StringUtils.isNotBlank(labeledCaption)) {
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens);
List<TaggingTokenCluster> clusters = clusteror.cluster();

MarkerType citationMarkerType = null;
if (markerTypes != null && markerTypes.size()>0) {
if (CollectionUtils.isNotEmpty(markerTypes)) {
citationMarkerType = markerTypes.get(0);
}

Expand Down Expand Up @@ -435,7 +436,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
// LayoutTokensUtil.normalizeText(caption.toString()));
}

if (desc != null && config.isWithSentenceSegmentation()) {
if (config.isWithSentenceSegmentation()) {
formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());

// we need a sentence segmentation of the figure caption, for that we need to introduce
Expand All @@ -453,7 +454,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form

figureElement.appendChild(desc);
}
if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
if (CollectionUtils.isNotEmpty(graphicObjects)) {
for (GraphicObject graphicObject : graphicObjects) {
Element go = XmlBuilderUtils.teiElement("graphic");
String uri = graphicObject.getURI();
Expand Down
17 changes: 10 additions & 7 deletions grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.grobid.core.data;

import org.apache.commons.collections4.CollectionUtils;
import org.grobid.core.GrobidModels;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.data.table.Cell;
Expand Down Expand Up @@ -69,6 +70,7 @@ public boolean isCompleteForTEI() {
@Override
public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List<MarkerType> markerTypes) {
if (!isCompleteForTEI()) {
LOGGER.warn("Found a table that is badly formatted but it should have been spotted before. We ignore it now.");
return null;
}

Expand Down Expand Up @@ -98,7 +100,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}*/

Element desc = null;
if (caption != null) {
if (StringUtils.isNotBlank(caption)) {
// if the segment has been parsed with the full text model we further extract the clusters
// to get the bibliographical references

Expand All @@ -111,16 +113,17 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
if (StringUtils.isNotBlank(labeledCaption)) {
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens);
List<TaggingTokenCluster> clusters = clusteror.cluster();

MarkerType citationMarkerType = null;
if (CollectionUtils.isNotEmpty(markerTypes)) {
citationMarkerType = markerTypes.get(0);
}

for (TaggingTokenCluster cluster : clusters) {
if (cluster == null) {
continue;
}

MarkerType citationMarkerType = null;
if (markerTypes != null && markerTypes.size()>0) {
citationMarkerType = markerTypes.get(0);
}

TaggingLabel clusterLabel = cluster.getTaggingLabel();
//String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
Expand All @@ -144,7 +147,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
desc.appendChild(textNode(clusterContent));
}

if (desc != null && config.isWithSentenceSegmentation()) {
if (config.isWithSentenceSegmentation()) {
formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());

// we need a sentence segmentation of the table caption, for that we need to introduce
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1887,7 +1887,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara

// in xom, the following gives all the text under the element, for the whole subtree
String text = curParagraph.getValue();
if (StringUtils.isEmpty(text))
if (StringUtils.isBlank(text))
return;

// identify ref nodes, ref spans and ref positions
Expand Down
Loading