apache · steeveb972 · Feb 12, 2018 · jorgelbg · Sep 4, 2019 · jorgelbg
diff --git a/build.xml b/build.xml
@@ -1002,7 +1002,7 @@
 
   <!-- target: ant-eclipse-download   =================================== -->
   <target name="ant-eclipse-download" description="--> downloads the ant-eclipse binary.">
-    <get src="http://downloads.sourceforge.net/project/ant-eclipse/ant-eclipse/1.0/ant-eclipse-1.0.bin.tar.bz2"
+    <get src="http://freefr.dl.sourceforge.net/project/ant-eclipse/ant-eclipse/1.0/ant-eclipse-1.0.bin.tar.bz2"
          dest="${build.dir}/ant-eclipse-1.0.bin.tar.bz2" usetimestamp="false" />
 
     <untar src="${build.dir}/ant-eclipse-1.0.bin.tar.bz2"

diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
@@ -19,10 +19,7 @@
 
 import java.net.URL;
 import java.net.MalformedURLException;
-import java.util.Collection;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Stack;
+import java.util.*;
 
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.util.NodeWalker;
@@ -102,10 +99,11 @@ public void setConf(Configuration conf) {
   }
 
   /**
-   * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
+   * This method takes a {@link StringBuffer}, a DOM {@link Node}
+   * and an excluded element {@link Set}, and will
    * append all the content text found beneath the DOM node to the
-   * <code>StringBuffer</code>.
-   * 
+   * <code>StringBuffer</code>, skipping the content of the elements named in the <code>Set</code>.
+   *
    * <p>
    * 
    * If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted
@@ -116,36 +114,60 @@ public void setConf(Configuration conf) {
    * 
    * @return true if nested anchors were found
    */
-  public boolean getText(StringBuffer sb, Node node,
-      boolean abortOnNestedAnchors) {
-    if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
+  private boolean getText(StringBuffer sb, Node node,
+                          boolean abortOnNestedAnchors, Set<String> excludedElementNames) {
+    if (getTextHelper(sb, node, abortOnNestedAnchors, 0, excludedElementNames)) {
       return true;
     }
     return false;
   }
 
   /**
-   * This is a convinience method, equivalent to
-   * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
+   * Appends all content text found beneath the DOM node to the
+   * <code>StringBuffer</code>, leaving out the content of every element whose
+   * name is contained in <code>excludedElementNames</code>. Traversal is
+   * performed with <code>abortOnNestedAnchors</code> set to <code>false</code>.
+   *
+   * @param sb buffer the extracted text is appended to
+   * @param node root of the DOM subtree to extract text from
+   * @param excludedElementNames element names, matched case-insensitively,
+   *          whose content is skipped; <code>null</code> excludes nothing
    * 
    */
-  public void getText(StringBuffer sb, Node node) {
-    getText(sb, node, false);
+  public void getText(StringBuffer sb, Node node, Set<String> excludedElementNames) {
+    getText(sb, node, false, excludedElementNames);
   }
 
   // returns true if abortOnNestedAnchors is true and we find nested
   // anchors
   private boolean getTextHelper(StringBuffer sb, Node node,
-      boolean abortOnNestedAnchors, int anchorDepth) {
+                                boolean abortOnNestedAnchors, int anchorDepth, Set<String> excludedElementNames) {
     boolean abort = false;
     NodeWalker walker = new NodeWalker(node);
+    // Normalize the excluded names once per call. Locale.ROOT keeps the
+    // lower-casing independent of the JVM default locale (e.g. Turkish).
+    Set<String> lcExcludedElementNames = new HashSet<>();
+    if (excludedElementNames != null) {
+      for (String excludedElementName : excludedElementNames) {
+        if (excludedElementName != null) {
+          lcExcludedElementNames.add(excludedElementName.toLowerCase(Locale.ROOT));
+        }
+      }
+    }
 
     while (walker.hasNext()) {
 
       Node currentNode = walker.nextNode();
       String nodeName = currentNode.getNodeName();
       short nodeType = currentNode.getNodeType();
 
+      if (nodeName != null) {
+        // do not descend into excluded elements
+        if (lcExcludedElementNames.contains(nodeName.toLowerCase(Locale.ROOT))) {
+          walker.skipChildren();
+        }
+      }
+
       if ("script".equalsIgnoreCase(nodeName)) {
         walker.skipChildren();
       }
@@ -244,7 +256,7 @@ public boolean getTitle(StringBuffer sb, Node node) {
 
       if (nodeType == Node.ELEMENT_NODE) {
         if ("title".equalsIgnoreCase(nodeName)) {
-          getText(sb, currentNode);
+          getText(sb, currentNode, null);
           return true;
         }
       }
@@ -380,7 +392,7 @@ public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
           if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
 
             StringBuffer linkText = new StringBuffer();
-            getText(linkText, currentNode, true);
+            getText(linkText, currentNode, true, null);
             if (linkText.toString().trim().length() == 0) {
               // try harder - use img alt if present
               NodeWalker subWalker = new NodeWalker(currentNode);

diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -18,8 +18,7 @@
 package org.apache.nutch.parse.html;
 
 import java.lang.invoke.MethodHandles;
-import java.util.ArrayList;
-import java.util.Map;
+import java.util.*;
 import java.net.URL;
 import java.net.MalformedURLException;
 import java.nio.charset.StandardCharsets;
@@ -49,6 +48,7 @@ public class HtmlParser implements Parser {
   // (e.g. http://cn.promo.yahoo.com/customcare/music.html)
   // NUTCH-2042 (cf. TIKA-357): increased to 8 kB
   private static final int CHUNK_SIZE = 8192;
+  public static final String ELEMENT_NAMES_SEPARATOR = ",";
 
   // NUTCH-1006 Meta equiv with single quotes not accepted
   private static Pattern metaPattern = Pattern.compile(
@@ -132,6 +132,20 @@ private static String sniffCharacterEncoding(byte[] content) {
   public ParseResult getParse(Content content) {
     HTMLMetaTags metaTags = new HTMLMetaTags();
 
+    // Names of elements whose content is left out of the extracted text. The
+    // property is a comma-separated list; entries are trimmed and empty ones
+    // dropped so that values such as "title, h1" behave as expected.
+    Set<String> excludedElementNames = new HashSet<>();
+    String excludedElementNamesString = getConf().get("html.content.exclude.element.names");
+    if (excludedElementNamesString != null) {
+      for (String elementName : excludedElementNamesString.split(ELEMENT_NAMES_SEPARATOR)) {
+        String trimmedName = elementName.trim();
+        if (!trimmedName.isEmpty()) {
+          excludedElementNames.add(trimmedName);
+        }
+      }
+    }
+
     URL base;
     try {
       base = new URL(content.getBaseUrl());
@@ -195,7 +198,7 @@ public ParseResult getParse(Content content) {
       if (LOG.isTraceEnabled()) {
         LOG.trace("Getting text...");
       }
-      utils.getText(sb, root); // extract text
+      utils.getText(sb, root, excludedElementNames); // extract text
       text = sb.toString();
       sb.setLength(0);
       if (LOG.isTraceEnabled()) {

diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
@@ -24,8 +24,7 @@
 import java.io.ByteArrayInputStream;
 import java.net.MalformedURLException;
 import java.net.URL;
-import java.util.ArrayList;
-import java.util.StringTokenizer;
+import java.util.*;
 
 import org.cyberneko.html.parsers.*;
 import org.junit.Assert;
@@ -168,6 +167,47 @@ public class TestDOMContentUtils {
       "my title", "my title", "my title", "my title", "", "", "", "title",
       "title", "title", "" };
 
+  /**
+   * Element names excluded from the extracted text, one set per test page;
+   * <code>null</code> excludes nothing. Indexed in parallel with
+   * <code>answerContent</code>.
+   */
+  @SuppressWarnings("unchecked") // Java cannot create a generic array directly
+  private static final Set<String>[] contentRemoveTags = new Set[]{
+          null,
+          Collections.singleton("title"),
+          Collections.singleton("title"),
+          Collections.singleton("title"),
+          Collections.singleton("title"),
+          new HashSet<String>(Arrays.asList("title", "h1")),
+          Collections.singleton("title"),
+          Collections.singleton("title"),
+          Collections.singleton("title"),
+          Collections.singleton("title"),
+          Collections.singleton("title"),
+          Collections.singleton("title"),
+          Collections.singleton("title"),
+          null
+  };
+
+  /** Text expected for each test page once <code>contentRemoveTags[i]</code> is excluded. */
+  private static final String[] answerContent = {
+          "title body anchor",
+          "body home bots",
+          "separate this from this",
+          "body home 1 2",
+          "",
+          "",
+          "Whitespace test whitespace test "
+                  + "This is a whitespace test . Newlines should appear as space too. "
+                  + "Tabs are spaces too. This is a break -> and the line after break . "
+                  + "one two three space here space there no space "
+                  + "one two two three three four put some text here and there. "
+                  + "End this madness ! . . . .", "ignore ignore", "test1 test2",
+          "test1 test2", "anchor1 anchor2 anchor3",
+          "anchor1 anchor2 anchor3 anchor4 anchor5", "", ""
+  };
+
   // note: should be in page-order
   private static Outlink[][] answerOutlinks;
 
@@ -265,7 +298,7 @@ public void testGetText() {
       setup();
     for (int i = 0; i < testPages.length; i++) {
       StringBuffer sb = new StringBuffer();
-      utils.getText(sb, testDOMs[i]);
+      utils.getText(sb, testDOMs[i], null);
       String text = sb.toString();
       Assert.assertTrue(
           "expecting text: " + answerText[i]
@@ -292,7 +325,24 @@ public void testGetTitle() {
   }
 
+  /** Verifies the text extracted from each test page when its excluded elements are left out. */
   @Test
+  public void testGetContent() {
+    if (testDOMs[0] == null)
+      setup();
+    for (int i = 0; i < testPages.length; i++) {
+      StringBuffer sb = new StringBuffer();
+      utils.getText(sb, testDOMs[i], contentRemoveTags[i]);
+      String text = sb.toString();
+      Assert.assertTrue(
+          "expecting text: " + answerContent[i]
+              + System.getProperty("line.separator")
+              + System.getProperty("line.separator") + "got text: " + text,
+          equalsIgnoreWhitespace(answerContent[i], text));
+    }
+  }
+
+  @Test
   public void testGetOutlinks() {
     if (testDOMs[0] == null)
       setup();
     for (int i = 0; i < testPages.length; i++) {

diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
@@ -19,11 +19,7 @@
 
 import java.net.MalformedURLException;
 import java.net.URL;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
+import java.util.*;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.MapWritable;
@@ -49,10 +45,10 @@ public class DOMContentUtils {
   private boolean keepNodenames;
 
   private static class LinkParams {
+
     private String elName;
     private String attrName;
     private int childLen;
-
     private LinkParams(String elName, String attrName, int childLen) {
       this.elName = elName;
       this.attrName = attrName;
@@ -62,12 +58,12 @@ private LinkParams(String elName, String attrName, int childLen) {
     public String toString() {
       return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
     }
-  }
 
+  }
   private HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>();
+
   private HashSet<String> ignoredTags = new HashSet<String>();
   private Configuration conf;
-
   public DOMContentUtils(Configuration conf) {
     setConf(conf);
   }
@@ -107,50 +103,75 @@ public void setConf(Configuration conf) {
   }
 
   /**
-   * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
+   * This method takes a {@link StringBuffer}, a DOM {@link Node}
+   * and an excluded element {@link Set}, and will
    * append all the content text found beneath the DOM node to the
-   * <code>StringBuffer</code>.
-   * 
+   * <code>StringBuffer</code>, skipping the content of the elements named in the <code>Set</code>.
+   *
    * <p>
-   * 
+   *
    * If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted
    * and the <code>StringBuffer</code> will not contain any text encountered
    * after a nested anchor is found.
-   * 
+   *
    * <p>
-   * 
+   *
    * @return true if nested anchors were found
    */
   private boolean getText(StringBuffer sb, Node node,
-      boolean abortOnNestedAnchors) {
-    if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
+                          boolean abortOnNestedAnchors, Set<String> excludedElementNames) {
+    if (getTextHelper(sb, node, abortOnNestedAnchors, 0, excludedElementNames)) {
       return true;
     }
     return false;
   }
 
   /**
-   * This is a convinience method, equivalent to
-   * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
-   * 
+   * Appends all content text found beneath the DOM node to the
+   * <code>StringBuffer</code>, leaving out the content of every element whose
+   * name is contained in <code>excludedElementNames</code>. Traversal is
+   * performed with <code>abortOnNestedAnchors</code> set to <code>false</code>.
+   *
+   * @param sb buffer the extracted text is appended to
+   * @param node root of the DOM subtree to extract text from
+   * @param excludedElementNames element names, matched case-insensitively,
+   *          whose content is skipped; <code>null</code> excludes nothing
+   *
    */
-  public void getText(StringBuffer sb, Node node) {
-    getText(sb, node, false);
+  public void getText(StringBuffer sb, Node node, Set<String> excludedElementNames) {
+    getText(sb, node, false, excludedElementNames);
   }
 
   // returns true if abortOnNestedAnchors is true and we find nested
   // anchors
   private boolean getTextHelper(StringBuffer sb, Node node,
-      boolean abortOnNestedAnchors, int anchorDepth) {
+                                boolean abortOnNestedAnchors, int anchorDepth, Set<String> excludedElementNames) {
     boolean abort = false;
     NodeWalker walker = new NodeWalker(node);
+    // Normalize the excluded names once per call. Locale.ROOT keeps the
+    // lower-casing independent of the JVM default locale (e.g. Turkish).
+    Set<String> lcExcludedElementNames = new HashSet<>();
+    if (excludedElementNames != null) {
+      for (String excludedElementName : excludedElementNames) {
+        if (excludedElementName != null) {
+          lcExcludedElementNames.add(excludedElementName.toLowerCase(Locale.ROOT));
+        }
+      }
+    }
 
     while (walker.hasNext()) {
 
       Node currentNode = walker.nextNode();
       String nodeName = currentNode.getNodeName();
       short nodeType = currentNode.getNodeType();
 
+      if (nodeName != null) {
+        // do not descend into excluded elements
+        if (lcExcludedElementNames.contains(nodeName.toLowerCase(Locale.ROOT))) {
+          walker.skipChildren();
+        }
+      }
+
       if ("script".equalsIgnoreCase(nodeName)) {
         walker.skipChildren();
       }
@@ -230,7 +241,7 @@ private void appendSpace(StringBuffer buffer) {
    * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
    * append the content text found beneath the first <code>title</code> node to
    * the <code>StringBuffer</code>.
-   * 
+   *
    * @return true if a title node was found, false otherwise
    */
   public boolean getTitle(StringBuffer sb, Node node) {
@@ -249,7 +260,7 @@ public boolean getTitle(StringBuffer sb, Node node) {
 
       if (nodeType == Node.ELEMENT_NODE) {
         if ("title".equalsIgnoreCase(nodeName)) {
-          getText(sb, currentNode);
+          getText(sb, currentNode, null);
           return true;
         }
       }
@@ -385,7 +396,7 @@ public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
           if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
 
             StringBuffer linkText = new StringBuffer();
-            getText(linkText, currentNode, true);
+            getText(linkText, currentNode, true, null);
 
             NamedNodeMap attrs = currentNode.getAttributes();
             String target = null;