-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
NUTCH-1749 Optionally exclude title from content field #285
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,10 +19,7 @@ | |
|
||
import java.net.URL; | ||
import java.net.MalformedURLException; | ||
import java.util.Collection; | ||
import java.util.ArrayList; | ||
import java.util.HashMap; | ||
import java.util.Stack; | ||
import java.util.*; | ||
|
||
import org.apache.nutch.parse.Outlink; | ||
import org.apache.nutch.util.NodeWalker; | ||
|
@@ -102,10 +99,11 @@ public void setConf(Configuration conf) { | |
} | ||
|
||
/** | ||
* This method takes a {@link StringBuffer} and a DOM {@link Node}, and will | ||
* This method takes a {@link StringBuffer}, a DOM {@link Node} | ||
* and an excluded element {@link Set}, and will | ||
* append all the content text found beneath the DOM node to the | ||
* <code>StringBuffer</code>. | ||
* | ||
* <code>StringBuffer</code> without the mentioned element names in the <code>Set</code>. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We return the textual content, not the element names. It should be something:
|
||
* | ||
* <p> | ||
* | ||
* If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted | ||
|
@@ -116,36 +114,50 @@ public void setConf(Configuration conf) { | |
* | ||
* @return true if nested anchors were found | ||
*/ | ||
public boolean getText(StringBuffer sb, Node node, | ||
boolean abortOnNestedAnchors) { | ||
if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) { | ||
private boolean getText(StringBuffer sb, Node node, | ||
boolean abortOnNestedAnchors, Set<String> excludedElementNames) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Formatting |
||
if (getTextHelper(sb, node, abortOnNestedAnchors, 0, excludedElementNames)) { | ||
return true; | ||
} | ||
return false; | ||
} | ||
|
||
/** | ||
* This is a convinience method, equivalent to | ||
* {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. | ||
* {@link #getText(StringBuffer,Node,boolean, Set) getText(sb, node, false, excludedElementNames)}. | ||
* | ||
*/ | ||
public void getText(StringBuffer sb, Node node) { | ||
getText(sb, node, false); | ||
public void getText(StringBuffer sb, Node node, Set<String> excludedElementNames) { | ||
getText(sb, node, false, excludedElementNames); | ||
} | ||
|
||
// returns true if abortOnNestedAnchors is true and we find nested | ||
// anchors | ||
private boolean getTextHelper(StringBuffer sb, Node node, | ||
boolean abortOnNestedAnchors, int anchorDepth) { | ||
boolean abortOnNestedAnchors, int anchorDepth, Set<String> excludedElementNames) { | ||
boolean abort = false; | ||
NodeWalker walker = new NodeWalker(node); | ||
Set<String> lcExcludedElementNames = new HashSet<>(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should avoid duplicating the exclusion set. This method is executed many times. We could use a |
||
if (excludedElementNames != null) { | ||
for (String excludedElementName : excludedElementNames) { | ||
if (excludedElementName != null) { | ||
lcExcludedElementNames.add(excludedElementName.toLowerCase()); | ||
} | ||
} | ||
} | ||
|
||
while (walker.hasNext()) { | ||
|
||
Node currentNode = walker.nextNode(); | ||
String nodeName = currentNode.getNodeName(); | ||
short nodeType = currentNode.getNodeType(); | ||
|
||
if (nodeName != null) { | ||
if (lcExcludedElementNames.contains(nodeName.toLowerCase())) { | ||
walker.skipChildren(); | ||
} | ||
} | ||
|
||
if ("script".equalsIgnoreCase(nodeName)) { | ||
walker.skipChildren(); | ||
} | ||
|
@@ -244,7 +256,7 @@ public boolean getTitle(StringBuffer sb, Node node) { | |
|
||
if (nodeType == Node.ELEMENT_NODE) { | ||
if ("title".equalsIgnoreCase(nodeName)) { | ||
getText(sb, currentNode); | ||
getText(sb, currentNode, null); | ||
return true; | ||
} | ||
} | ||
|
@@ -380,7 +392,7 @@ public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) { | |
if (!shouldThrowAwayLink(currentNode, children, childLen, params)) { | ||
|
||
StringBuffer linkText = new StringBuffer(); | ||
getText(linkText, currentNode, true); | ||
getText(linkText, currentNode, true, null); | ||
if (linkText.toString().trim().length() == 0) { | ||
// try harder - use img alt if present | ||
NodeWalker subWalker = new NodeWalker(currentNode); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,11 +19,7 @@ | |
|
||
import java.net.MalformedURLException; | ||
import java.net.URL; | ||
import java.util.ArrayList; | ||
import java.util.Collection; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.*; | ||
|
||
import org.apache.hadoop.conf.Configuration; | ||
import org.apache.hadoop.io.MapWritable; | ||
|
@@ -49,10 +45,10 @@ public class DOMContentUtils { | |
private boolean keepNodenames; | ||
|
||
private static class LinkParams { | ||
|
||
private String elName; | ||
private String attrName; | ||
private int childLen; | ||
|
||
private LinkParams(String elName, String attrName, int childLen) { | ||
this.elName = elName; | ||
this.attrName = attrName; | ||
|
@@ -62,12 +58,12 @@ private LinkParams(String elName, String attrName, int childLen) { | |
public String toString() { | ||
return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]"; | ||
} | ||
} | ||
|
||
} | ||
private HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>(); | ||
|
||
private HashSet<String> ignoredTags = new HashSet<String>(); | ||
private Configuration conf; | ||
|
||
public DOMContentUtils(Configuration conf) { | ||
setConf(conf); | ||
} | ||
|
@@ -107,50 +103,65 @@ public void setConf(Configuration conf) { | |
} | ||
|
||
/** | ||
* This method takes a {@link StringBuffer} and a DOM {@link Node}, and will | ||
* This method takes a {@link StringBuffer}, a DOM {@link Node} | ||
* and an excluded element {@link Set}, and will | ||
* append all the content text found beneath the DOM node to the | ||
* <code>StringBuffer</code>. | ||
* | ||
* <code>StringBuffer</code> without the mentioned element names in the <code>Set</code>. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same as the previous comment. |
||
* | ||
* <p> | ||
* | ||
* | ||
* If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted | ||
* and the <code>StringBuffer</code> will not contain any text encountered | ||
* after a nested anchor is found. | ||
* | ||
* | ||
* <p> | ||
* | ||
* | ||
* @return true if nested anchors were found | ||
*/ | ||
private boolean getText(StringBuffer sb, Node node, | ||
boolean abortOnNestedAnchors) { | ||
if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) { | ||
boolean abortOnNestedAnchors, Set<String> excludedElementNames) { | ||
if (getTextHelper(sb, node, abortOnNestedAnchors, 0, excludedElementNames)) { | ||
return true; | ||
} | ||
return false; | ||
} | ||
|
||
/** | ||
* This is a convinience method, equivalent to | ||
* {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. | ||
* | ||
* {@link #getText(StringBuffer, Node, boolean, Set) getText(sb, node, false, excludedElementNames)}. | ||
* | ||
*/ | ||
public void getText(StringBuffer sb, Node node) { | ||
getText(sb, node, false); | ||
public void getText(StringBuffer sb, Node node, Set<String> excludedElementNames) { | ||
getText(sb, node, false, excludedElementNames); | ||
} | ||
|
||
// returns true if abortOnNestedAnchors is true and we find nested | ||
// anchors | ||
private boolean getTextHelper(StringBuffer sb, Node node, | ||
boolean abortOnNestedAnchors, int anchorDepth) { | ||
boolean abortOnNestedAnchors, int anchorDepth, Set<String> excludedElementNames) { | ||
boolean abort = false; | ||
NodeWalker walker = new NodeWalker(node); | ||
Set<String> lcExcludedElementNames = new HashSet<>(); | ||
if (excludedElementNames != null) { | ||
for (String excludedElementName : excludedElementNames) { | ||
if (excludedElementName != null) { | ||
lcExcludedElementNames.add(excludedElementName.toLowerCase()); | ||
} | ||
} | ||
} | ||
|
||
while (walker.hasNext()) { | ||
|
||
Node currentNode = walker.nextNode(); | ||
String nodeName = currentNode.getNodeName(); | ||
short nodeType = currentNode.getNodeType(); | ||
|
||
if (nodeName != null) { | ||
if (lcExcludedElementNames.contains(nodeName.toLowerCase())) { | ||
walker.skipChildren(); | ||
} | ||
} | ||
|
||
if ("script".equalsIgnoreCase(nodeName)) { | ||
walker.skipChildren(); | ||
} | ||
|
@@ -230,7 +241,7 @@ private void appendSpace(StringBuffer buffer) { | |
* This method takes a {@link StringBuffer} and a DOM {@link Node}, and will | ||
* append the content text found beneath the first <code>title</code> node to | ||
* the <code>StringBuffer</code>. | ||
* | ||
* | ||
* @return true if a title node was found, false otherwise | ||
*/ | ||
public boolean getTitle(StringBuffer sb, Node node) { | ||
|
@@ -249,7 +260,7 @@ public boolean getTitle(StringBuffer sb, Node node) { | |
|
||
if (nodeType == Node.ELEMENT_NODE) { | ||
if ("title".equalsIgnoreCase(nodeName)) { | ||
getText(sb, currentNode); | ||
getText(sb, currentNode, null); | ||
return true; | ||
} | ||
} | ||
|
@@ -385,7 +396,7 @@ public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) { | |
if (!shouldThrowAwayLink(currentNode, children, childLen, params)) { | ||
|
||
StringBuffer linkText = new StringBuffer(); | ||
getText(linkText, currentNode, true); | ||
getText(linkText, currentNode, true, null); | ||
|
||
NamedNodeMap attrs = currentNode.getAttributes(); | ||
String target = null; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't see a good reason to change this URL.