Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NUTCH-1749 Optionally exclude title from content field #285

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1002,7 +1002,7 @@

<!-- target: ant-eclipse-download =================================== -->
<target name="ant-eclipse-download" description="--> downloads the ant-eclipse binary.">
<get src="http://downloads.sourceforge.net/project/ant-eclipse/ant-eclipse/1.0/ant-eclipse-1.0.bin.tar.bz2"
<get src="http://freefr.dl.sourceforge.net/project/ant-eclipse/ant-eclipse/1.0/ant-eclipse-1.0.bin.tar.bz2"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see a good reason to change this URL.

dest="${build.dir}/ant-eclipse-1.0.bin.tar.bz2" usetimestamp="false" />

<untar src="${build.dir}/ant-eclipse-1.0.bin.tar.bz2"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,7 @@

import java.net.URL;
import java.net.MalformedURLException;
import java.util.Collection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Stack;
import java.util.*;

import org.apache.nutch.parse.Outlink;
import org.apache.nutch.util.NodeWalker;
Expand Down Expand Up @@ -102,10 +99,11 @@ public void setConf(Configuration conf) {
}

/**
* This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
* This method takes a {@link StringBuffer}, a DOM {@link Node}
* and an excluded element {@link Set}, and will
* append all the content text found beneath the DOM node to the
* <code>StringBuffer</code>.
*
* <code>StringBuffer</code> without the mentioned element names in the <code>Set</code>.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We return the textual content, not the element names. It should be something like:

without the text from the excluded elements

*
* <p>
*
* If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted
Expand All @@ -116,36 +114,50 @@ public void setConf(Configuration conf) {
*
* @return true if nested anchors were found
*/
public boolean getText(StringBuffer sb, Node node,
boolean abortOnNestedAnchors) {
if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
private boolean getText(StringBuffer sb, Node node,
boolean abortOnNestedAnchors, Set<String> excludedElementNames) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Formatting

if (getTextHelper(sb, node, abortOnNestedAnchors, 0, excludedElementNames)) {
return true;
}
return false;
}

/**
* This is a convenience method, equivalent to
* {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
* {@link #getText(StringBuffer,Node,boolean, Set) getText(sb, node, false, excludedElementNames)}.
*
*/
public void getText(StringBuffer sb, Node node) {
getText(sb, node, false);
public void getText(StringBuffer sb, Node node, Set<String> excludedElementNames) {
getText(sb, node, false, excludedElementNames);
}

// returns true if abortOnNestedAnchors is true and we find nested
// anchors
private boolean getTextHelper(StringBuffer sb, Node node,
boolean abortOnNestedAnchors, int anchorDepth) {
boolean abortOnNestedAnchors, int anchorDepth, Set<String> excludedElementNames) {
boolean abort = false;
NodeWalker walker = new NodeWalker(node);
Set<String> lcExcludedElementNames = new HashSet<>();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

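We should avoid duplicating the exclusion set: this method is executed many times, so rebuilding a lower-cased copy on every call is wasteful. Instead, the caller could build a case-insensitive set once (e.g. a TreeSet created with String.CASE_INSENSITIVE_ORDER) and pass it in, delegating the case-insensitive comparison to the Set implementation.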

if (excludedElementNames != null) {
for (String excludedElementName : excludedElementNames) {
if (excludedElementName != null) {
lcExcludedElementNames.add(excludedElementName.toLowerCase());
}
}
}

while (walker.hasNext()) {

Node currentNode = walker.nextNode();
String nodeName = currentNode.getNodeName();
short nodeType = currentNode.getNodeType();

if (nodeName != null) {
if (lcExcludedElementNames.contains(nodeName.toLowerCase())) {
walker.skipChildren();
}
}

if ("script".equalsIgnoreCase(nodeName)) {
walker.skipChildren();
}
Expand Down Expand Up @@ -244,7 +256,7 @@ public boolean getTitle(StringBuffer sb, Node node) {

if (nodeType == Node.ELEMENT_NODE) {
if ("title".equalsIgnoreCase(nodeName)) {
getText(sb, currentNode);
getText(sb, currentNode, null);
return true;
}
}
Expand Down Expand Up @@ -380,7 +392,7 @@ public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {

StringBuffer linkText = new StringBuffer();
getText(linkText, currentNode, true);
getText(linkText, currentNode, true, null);
if (linkText.toString().trim().length() == 0) {
// try harder - use img alt if present
NodeWalker subWalker = new NodeWalker(currentNode);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@
package org.apache.nutch.parse.html;

import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Map;
import java.util.*;
import java.net.URL;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
Expand Down Expand Up @@ -49,6 +48,7 @@ public class HtmlParser implements Parser {
// (e.g. http://cn.promo.yahoo.com/customcare/music.html)
// NUTCH-2042 (cf. TIKA-357): increased to 8 kB
private static final int CHUNK_SIZE = 8192;
public static final String ELEMENT_NAMES_SEPARATOR = ",";

// NUTCH-1006 Meta equiv with single quotes not accepted
private static Pattern metaPattern = Pattern.compile(
Expand Down Expand Up @@ -132,6 +132,9 @@ private static String sniffCharacterEncoding(byte[] content) {
public ParseResult getParse(Content content) {
HTMLMetaTags metaTags = new HTMLMetaTags();

String excludedElementNamesString = getConf().get("html.content.exclude.element.names");
Set<String> excludedElementNames = excludedElementNamesString == null ? null : new HashSet<>(Arrays.asList(excludedElementNamesString.split(ELEMENT_NAMES_SEPARATOR)));

URL base;
try {
base = new URL(content.getBaseUrl());
Expand Down Expand Up @@ -195,7 +198,7 @@ public ParseResult getParse(Content content) {
if (LOG.isTraceEnabled()) {
LOG.trace("Getting text...");
}
utils.getText(sb, root); // extract text
utils.getText(sb, root, excludedElementNames); // extract text
text = sb.toString();
sb.setLength(0);
if (LOG.isTraceEnabled()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@
import java.io.ByteArrayInputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.StringTokenizer;
import java.util.*;

import org.cyberneko.html.parsers.*;
import org.junit.Assert;
Expand Down Expand Up @@ -168,6 +167,40 @@ public class TestDOMContentUtils {
"my title", "my title", "my title", "my title", "", "", "", "title",
"title", "title", "" };

private static final Set<String>[] contentRemoveTags = new Set[]{
null,
Collections.singleton("title"),
Collections.singleton("title"),
Collections.singleton("title"),
Collections.singleton("title"),
new HashSet(Arrays.asList("title", "h1")),
Collections.singleton("title"),
Collections.singleton("title"),
Collections.singleton("title"),
Collections.singleton("title"),
Collections.singleton("title"),
Collections.singleton("title"),
Collections.singleton("title"),
null
};

private static final String[] answerContent = {
"title body anchor",
"body home bots",
"separate this from this",
"body home 1 2",
"",
"",
"Whitespace test whitespace test "
+ "This is a whitespace test . Newlines should appear as space too. "
+ "Tabs are spaces too. This is a break -> and the line after break . "
+ "one two three space here space there no space "
+ "one two two three three four put some text here and there. "
+ "End this madness ! . . . .", "ignore ignore", "test1 test2",
"test1 test2", "anchor1 anchor2 anchor3",
"anchor1 anchor2 anchor3 anchor4 anchor5", "", ""
};

// note: should be in page-order
private static Outlink[][] answerOutlinks;

Expand Down Expand Up @@ -265,7 +298,7 @@ public void testGetText() {
setup();
for (int i = 0; i < testPages.length; i++) {
StringBuffer sb = new StringBuffer();
utils.getText(sb, testDOMs[i]);
utils.getText(sb, testDOMs[i], null);
String text = sb.toString();
Assert.assertTrue(
"expecting text: " + answerText[i]
Expand All @@ -292,7 +325,23 @@ public void testGetTitle() {
}

@Test
public void testGetOutlinks() {
public void testGetContent() throws Exception {
if (testDOMs[0] == null)
setup();
for (int i = 0; i < testPages.length; i++) {
StringBuffer sb = new StringBuffer();
utils.getText(sb, testDOMs[i], contentRemoveTags[i]);
String text = sb.toString();
Assert.assertTrue(
"expecting text: " + answerContent[i]
+ System.getProperty("line.separator")
+ System.getProperty("line.separator") + "got text: " + text,
equalsIgnoreWhitespace(answerContent[i], text));
}
}

@Test
public void testGetOutlinks() throws Exception {
if (testDOMs[0] == null)
setup();
for (int i = 0; i < testPages.length; i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,7 @@

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.MapWritable;
Expand All @@ -49,10 +45,10 @@ public class DOMContentUtils {
private boolean keepNodenames;

private static class LinkParams {

private String elName;
private String attrName;
private int childLen;

private LinkParams(String elName, String attrName, int childLen) {
this.elName = elName;
this.attrName = attrName;
Expand All @@ -62,12 +58,12 @@ private LinkParams(String elName, String attrName, int childLen) {
public String toString() {
return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
}
}

}
private HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>();

private HashSet<String> ignoredTags = new HashSet<String>();
private Configuration conf;

public DOMContentUtils(Configuration conf) {
setConf(conf);
}
Expand Down Expand Up @@ -107,50 +103,65 @@ public void setConf(Configuration conf) {
}

/**
* This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
* This method takes a {@link StringBuffer}, a DOM {@link Node}
* and an excluded element {@link Set}, and will
* append all the content text found beneath the DOM node to the
* <code>StringBuffer</code>.
*
* <code>StringBuffer</code> without the mentioned element names in the <code>Set</code>.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as the previous comment.

*
* <p>
*
*
* If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted
* and the <code>StringBuffer</code> will not contain any text encountered
* after a nested anchor is found.
*
*
* <p>
*
*
* @return true if nested anchors were found
*/
private boolean getText(StringBuffer sb, Node node,
boolean abortOnNestedAnchors) {
if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
boolean abortOnNestedAnchors, Set<String> excludedElementNames) {
if (getTextHelper(sb, node, abortOnNestedAnchors, 0, excludedElementNames)) {
return true;
}
return false;
}

/**
* This is a convenience method, equivalent to
* {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
*
* {@link #getText(StringBuffer, Node, boolean, Set) getText(sb, node, false, excludedElementNames)}.
*
*/
public void getText(StringBuffer sb, Node node) {
getText(sb, node, false);
public void getText(StringBuffer sb, Node node, Set<String> excludedElementNames) {
getText(sb, node, false, excludedElementNames);
}

// returns true if abortOnNestedAnchors is true and we find nested
// anchors
private boolean getTextHelper(StringBuffer sb, Node node,
boolean abortOnNestedAnchors, int anchorDepth) {
boolean abortOnNestedAnchors, int anchorDepth, Set<String> excludedElementNames) {
boolean abort = false;
NodeWalker walker = new NodeWalker(node);
Set<String> lcExcludedElementNames = new HashSet<>();
if (excludedElementNames != null) {
for (String excludedElementName : excludedElementNames) {
if (excludedElementName != null) {
lcExcludedElementNames.add(excludedElementName.toLowerCase());
}
}
}

while (walker.hasNext()) {

Node currentNode = walker.nextNode();
String nodeName = currentNode.getNodeName();
short nodeType = currentNode.getNodeType();

if (nodeName != null) {
if (lcExcludedElementNames.contains(nodeName.toLowerCase())) {
walker.skipChildren();
}
}

if ("script".equalsIgnoreCase(nodeName)) {
walker.skipChildren();
}
Expand Down Expand Up @@ -230,7 +241,7 @@ private void appendSpace(StringBuffer buffer) {
* This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
* append the content text found beneath the first <code>title</code> node to
* the <code>StringBuffer</code>.
*
*
* @return true if a title node was found, false otherwise
*/
public boolean getTitle(StringBuffer sb, Node node) {
Expand All @@ -249,7 +260,7 @@ public boolean getTitle(StringBuffer sb, Node node) {

if (nodeType == Node.ELEMENT_NODE) {
if ("title".equalsIgnoreCase(nodeName)) {
getText(sb, currentNode);
getText(sb, currentNode, null);
return true;
}
}
Expand Down Expand Up @@ -385,7 +396,7 @@ public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {

StringBuffer linkText = new StringBuffer();
getText(linkText, currentNode, true);
getText(linkText, currentNode, true, null);

NamedNodeMap attrs = currentNode.getAttributes();
String target = null;
Expand Down
Loading