From c4c059b93cb75d365b3c964f2c95447b682a8526 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 10 Mar 2016 23:19:14 +0100 Subject: [PATCH] NUTCH-1870 XSL parse filter - apply patch contributed by @albinscode - load configuration files from classpath - address thread-safety --- build.xml | 12 + conf/nutch-default.xml | 16 + conf/parse-xsl-rules.xml.template | 18 ++ conf/parse-xsl-transform.xsl.template | 26 ++ default.properties | 3 +- src/plugin/build.xml | 3 + src/plugin/parse-xsl/build.xml | 65 +++++ src/plugin/parse-xsl/conf/documents.xsd | 27 ++ src/plugin/parse-xsl/conf/rules.xsd | 29 ++ src/plugin/parse-xsl/ivy.xml | 46 +++ src/plugin/parse-xsl/plugin.xml | 49 ++++ .../parse-xsl/sample/sample1/book1.html | 38 +++ src/plugin/parse-xsl/sample/sample1/rules.xml | 26 ++ .../sample/sample1/transformer_book.xsl | 66 +++++ .../apache/nutch/parse/xsl/RulesManager.java | 235 +++++++++++++++ .../nutch/parse/xsl/XslIndexFilter.java | 185 ++++++++++++ .../nutch/parse/xsl/XslParseFilter.java | 272 +++++++++++++++++ .../apache/nutch/parse/xsl/package-info.java | 22 ++ .../nutch/parse/xsl/AbstractCrawlTest.java | 274 ++++++++++++++++++ .../nutch/parse/xsl/TestParseTechnical.java | 87 ++++++ .../apache/nutch/parse/xsl/TestSample1.java | 78 +++++ .../nutch/parse/xsl/TestXslIndexFilter.java | 50 ++++ 22 files changed, 1626 insertions(+), 1 deletion(-) create mode 100644 conf/parse-xsl-rules.xml.template create mode 100644 conf/parse-xsl-transform.xsl.template create mode 100644 src/plugin/parse-xsl/build.xml create mode 100644 src/plugin/parse-xsl/conf/documents.xsd create mode 100644 src/plugin/parse-xsl/conf/rules.xsd create mode 100644 src/plugin/parse-xsl/ivy.xml create mode 100644 src/plugin/parse-xsl/plugin.xml create mode 100644 src/plugin/parse-xsl/sample/sample1/book1.html create mode 100644 src/plugin/parse-xsl/sample/sample1/rules.xml create mode 100644 src/plugin/parse-xsl/sample/sample1/transformer_book.xsl create mode 100644 
src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/RulesManager.java create mode 100644 src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslIndexFilter.java create mode 100644 src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslParseFilter.java create mode 100644 src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/package-info.java create mode 100644 src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/AbstractCrawlTest.java create mode 100644 src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestParseTechnical.java create mode 100644 src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestSample1.java create mode 100644 src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestXslIndexFilter.java diff --git a/build.xml b/build.xml index 65e8f3fcec..d51cd7aaae 100644 --- a/build.xml +++ b/build.xml @@ -208,6 +208,11 @@ + + + + + @@ -704,6 +709,11 @@ + + + + + @@ -1140,6 +1150,8 @@ + + diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index a42e6a9b80..b0384dc1e4 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1933,6 +1933,22 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> + + + + parser.xsl.rulesFile + parse-xsl-rules.xml + + Rule file for plugin parse-xsl: it may contain multiple + rules. Every rule assigns a XSL transformer to all documents + matched by a given URL pattern. Transformers are specified + in separate XML files referenced from the rule file. + A transformer can specify multiple index fields to be filled + by XSL statements from the DOM tree of the parsed document. 
+ + + + diff --git a/conf/parse-xsl-rules.xml.template b/conf/parse-xsl-rules.xml.template new file mode 100644 index 0000000000..490896febe --- /dev/null +++ b/conf/parse-xsl-rules.xml.template @@ -0,0 +1,18 @@ + + + + + + + + + diff --git a/conf/parse-xsl-transform.xsl.template b/conf/parse-xsl-transform.xsl.template new file mode 100644 index 0000000000..0f70131e77 --- /dev/null +++ b/conf/parse-xsl-transform.xsl.template @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + diff --git a/default.properties b/default.properties index bb987d9666..cb48974585 100644 --- a/default.properties +++ b/default.properties @@ -213,4 +213,5 @@ plugins.misc=\ org.creativecommons.nutch*:\ org.apache.nutch.microformats.reltag*:\ org.apache.nutch.any23* - + org.apache.nutch.parse.xsl* + diff --git a/src/plugin/build.xml b/src/plugin/build.xml index d8826e88d9..4a303a96f3 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -66,6 +66,7 @@ + @@ -134,6 +135,7 @@ + @@ -210,6 +212,7 @@ + diff --git a/src/plugin/parse-xsl/build.xml b/src/plugin/parse-xsl/build.xml new file mode 100644 index 0000000000..8ab8e2b061 --- /dev/null +++ b/src/plugin/parse-xsl/build.xml @@ -0,0 +1,65 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/parse-xsl/conf/documents.xsd b/src/plugin/parse-xsl/conf/documents.xsd new file mode 100644 index 0000000000..601672f3ec --- /dev/null +++ b/src/plugin/parse-xsl/conf/documents.xsd @@ -0,0 +1,27 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/plugin/parse-xsl/conf/rules.xsd b/src/plugin/parse-xsl/conf/rules.xsd new file mode 100644 index 0000000000..e0a1c5e5a5 --- /dev/null +++ b/src/plugin/parse-xsl/conf/rules.xsd @@ -0,0 +1,29 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/plugin/parse-xsl/ivy.xml b/src/plugin/parse-xsl/ivy.xml new file mode 100644 index 
0000000000..e85e8bcf0c --- /dev/null +++ b/src/plugin/parse-xsl/ivy.xml @@ -0,0 +1,46 @@ + + + + + + + + + + Apache Nutch + + + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/parse-xsl/plugin.xml b/src/plugin/parse-xsl/plugin.xml new file mode 100644 index 0000000000..ff14c0a5f0 --- /dev/null +++ b/src/plugin/parse-xsl/plugin.xml @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/parse-xsl/sample/sample1/book1.html b/src/plugin/parse-xsl/sample/sample1/book1.html new file mode 100644 index 0000000000..fb8a491d9a --- /dev/null +++ b/src/plugin/parse-xsl/sample/sample1/book1.html @@ -0,0 +1,38 @@ + + + +Buy Nutch for dummies! + + + + +

Nutch for dummies

+ + +
The ultimate book to master all nutch powerful mechanisms !
+ + +
Isbn: 123654987789
+ + +
    Authors +
  • Mr Allan A. +
  • Mrs Mulan B. +
+Price: free + +
Collection from nowhere
+ + +
    Other related books +
  • Lucene explained to your grandmother +
  • How I met Solr? +
  • Feels better with Elastic Search +
+ + + + + \ No newline at end of file diff --git a/src/plugin/parse-xsl/sample/sample1/rules.xml b/src/plugin/parse-xsl/sample/sample1/rules.xml new file mode 100644 index 0000000000..4142533eb5 --- /dev/null +++ b/src/plugin/parse-xsl/sample/sample1/rules.xml @@ -0,0 +1,26 @@ + + + + + + + + + + diff --git a/src/plugin/parse-xsl/sample/sample1/transformer_book.xsl b/src/plugin/parse-xsl/sample/sample1/transformer_book.xsl new file mode 100644 index 0000000000..b13b1b64d2 --- /dev/null +++ b/src/plugin/parse-xsl/sample/sample1/transformer_book.xsl @@ -0,0 +1,66 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/RulesManager.java b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/RulesManager.java new file mode 100644 index 0000000000..63a68db42f --- /dev/null +++ b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/RulesManager.java @@ -0,0 +1,235 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.parse.xsl; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.util.HashMap; +import java.util.Map; + +import javax.xml.bind.JAXB; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.stream.StreamSource; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.StringUtils; +import org.apache.nutch.parse.xsl.xml.rule.Rules; +import org.apache.nutch.parse.xsl.xml.rule.TRule; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Manage a set of Transformers. It allows to avoid having several instances of + * Transformers with XSL to load each time for performance matter. The decision + * to use a given Transformer is determined by a set DO NOT make this class a + * singleton otherwise it will produce thread safety problems related to Xsl + * transformers not thread safe. + * + * @see Transformer + * + */ +public class RulesManager { + + /** All the rules used to determine which xsl parser to use */ + protected Rules rules = null; + + /** + * Transformer factory. Thread-local because {@link TransformerFactory} + * "is NOT guaranteed to be thread safe". + */ + protected ThreadLocal factory = new ThreadLocal() { + @Override + protected TransformerFactory initialValue() { + return TransformerFactory.newInstance(); + } + }; + + /** A RuleTransformer holds transformations defined in one XSLT file. 
*/ + protected class RuleTransformer { + + String xslFile; + byte[] xslSource; + ThreadLocal transformer = new ThreadLocal() { + @Override + protected Transformer initialValue() { + ByteArrayInputStream input = new ByteArrayInputStream(xslSource); + StreamSource streamSource = new StreamSource(input); + Transformer t = null; + try { + t = factory.get().newTransformer(streamSource); + } catch (TransformerConfigurationException e) { + LOG.warn("Failed to create transformer for xsl file {}: {}", xslFile, + StringUtils.stringifyException(e)); + } + return t; + } + }; + + public RuleTransformer(Configuration conf, String xslFile) + throws IOException { + this.xslFile = xslFile; + InputStream stream = conf.getConfResourceAsInputStream(xslFile); + xslSource = IOUtils.toByteArray(stream); + } + + public Transformer getTransformer() { + return transformer.get(); + } + + } + + /** A map containing all transformers given their file name as key */ + protected Map transformers = new HashMap(); + + /** The XSLT file to use for transformation */ + public static final String CONF_XML_RULES = "parser.xsl.rulesFile"; + + private static final Logger LOG = LoggerFactory.getLogger(RulesManager.class); + + /** + * Default constructor forbidden. + */ + @SuppressWarnings("unused") + private RulesManager() { + } + + /** + * Instantiates an object using the Nutch/Hadoop {@link Configuration} + * containing the property defining the rules. All rules and transformation + * files are load from the class path. 
+ * + * @param conf + * configuration + */ + public RulesManager(Configuration conf) { + + String rulesFile = conf.get(RulesManager.CONF_XML_RULES); + if (rulesFile != null) { + Reader rulesXmlReader = conf.getConfResourceAsReader(rulesFile); + + if (rulesXmlReader != null) { + LOG.debug("Reading parse-xsl rules file `{}'", rulesFile); + rules = JAXB.unmarshal(rulesXmlReader, Rules.class); + + // load transformation files + for (TRule rule : rules.getRule()) { + final String xslFile = rule.getTransformer().getFile(); + + if (xslFile != null) { + LOG.debug("Reading parse-xsl transformation file `{}'", xslFile); + try { + RuleTransformer rt = new RuleTransformer(conf, xslFile); + transformers.put(xslFile, rt); + } catch (IOException e) { + LOG.error("Failed to read parse-xsl transformation file {}: {}", + xslFile, StringUtils.stringifyException(e)); + } + } + } + + } else { + LOG.error( + "Failed to open parse-xsl rules file `{}' defined by property {}", + rulesFile, RulesManager.CONF_XML_RULES); + LOG.error(System.getProperty("java.class.path")); + } + + } else { + LOG.warn("Plugin parse-xsl active but no rules file defined!"); + } + } + + /** + * Match URL against regular expressions to assign it to a transformer file. 
+ * + * @param url + * the URL to filter + * @return the transformer file path that matches the rules or null if no rule + * does match + */ + public String getTransformerFilePath(String url) { + + String xslFile = null; + + if (rules == null) { + // no rules defined + return xslFile; + } + + // Search for a matching rule by applying defined regex + // The first matching rule will be applied + for (TRule rule : rules.getRule()) { + if (url.matches(rule.getMatches())) { + if (LOG.isDebugEnabled()) { + LOG.debug(String.format("Url %s is matching regex rule %s", url, + rule.getMatches())); + } + xslFile = rule.getTransformer().getFile(); + + break; + } + } + if (xslFile == null) { + LOG.debug("No filter found for url: {}", url); + } + + return xslFile; + } + + /** + * Get the first transformer matching a URL. + * + * @param url + * the url to filter + * @return the transformer that suits the rules + * @throws Exception + */ + public Transformer getTransformer(String url) { + Transformer transformer = null; + String xslFile = getTransformerFilePath(url); + if (xslFile != null) { + return transformers.get(xslFile).getTransformer(); + } + return transformer; + } + + /** + * Check whether a URL matches any rule. + * + * @param url + * the URL to test match in rules file + * @return true if the URL is matching any rule. 
+ * @throws Exception + */ + public boolean matches(String url) throws Exception { + return this.getTransformerFilePath(url) != null; + } + + /** + * @return the current set of rules defined in the xml file + */ + public Rules getRules() { + return rules; + } + +} diff --git a/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslIndexFilter.java b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslIndexFilter.java new file mode 100644 index 0000000000..f1cc663b2e --- /dev/null +++ b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslIndexFilter.java @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.parse.xsl; + +import java.io.File; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; + +import javax.xml.parsers.DocumentBuilderFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.parse.Parse; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.NodeList; + +import com.sun.org.apache.xpath.internal.XPathAPI; + +/** + * This class allows to: + *
    + *
  • automatically index fields defined in the rules file. + 
  • exclude urls that are not declared in the rules file. + */ +public class XslIndexFilter implements IndexingFilter { + + private static final String NAME_ATTRIBUTE = "name"; + + private static final String FIELD_TAG = "//field"; + + private Configuration conf; + + private static final Logger LOG = LoggerFactory + .getLogger(XslParseFilter.class); + + private static HashMap> transformers = new HashMap>(); + + // Rules file to use + private String rulesFile; + + // The XXX + private RulesManager manager; + + /** + * @return the current configuration. + */ + @Override + public Configuration getConf() { + return this.conf; + } + + /** + * Sets the current configuration. + */ + @Override + public void setConf(Configuration conf) { + this.conf = conf; + + // Getting rules file + this.rulesFile = this.conf.get(RulesManager.CONF_XML_RULES); + + // create rules manager and load all configuration files + manager = new RulesManager(conf); + } + + @Override + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + NutchDocument result = null; + if (doc == null) + return result; + + try { + + // Getting transformer file path associated to rule if exists + String xsltFilePath = null; + try { + xsltFilePath = manager.getTransformerFilePath(url.toString()); + } catch (Exception e) { + LOG.info("Xslt not found"); + } + + // The url matches a rule, we keep it + if (xsltFilePath != null) { + // We keep the document + result = doc; + List fields = XslIndexFilter.transformers.get(xsltFilePath); + // List was never loaded + if (fields == null) { + fields = this.extractFields(xsltFilePath); + } + + // All the fields defined in the xsl file will be put directly + // into the Nutch document + // Fields defined by the xsl plugin are only stored in parse + // meta. 
+ if (parse != null && parse.getData() != null + && parse.getData().getParseMeta() != null) { + for (String field : fields) { + for (String value : parse.getData().getParseMeta().getValues(field)) { + doc.add(field, value); + } + } + } + + } + // The document is indexed anyway because explicitly decided + else if (!manager.getRules().isFilterUrlsWithNoRule()) { + result = doc; + LOG.info("The url " + + url.toString() + + " has been kept because it has been explicitly specified in the rules"); + } + // The document is not indexed + else { + LOG.info("The url " + url.toString() + + " has been filtered because no xsl file fits the defined rules"); + } + + } catch (Exception e) { + String message = "Cannot index data"; + if (url != null && url.toString() != null) { + message += " from " + url.toString(); + } + LOG.error(message, e); + } + + return result; + } + + /** + * + * @param xsltFilePath + * the path of the xsl file + * @return the list of fields defined in xsl file + * @throws Exception + */ + protected List extractFields(String xsltFilePath) throws Exception { + List fields = new ArrayList(); + // Creating xsl DOM document + Document document = DocumentBuilderFactory.newInstance() + .newDocumentBuilder().parse(new File(xsltFilePath)); + NodeList list = XPathAPI.selectNodeList(document, FIELD_TAG); + HashSet hashedFields = new HashSet(); + // Populating list + for (int i = 0; i < list.getLength(); i++) { + NamedNodeMap attributes = list.item(i).getAttributes(); + if (attributes != null && attributes.getNamedItem(NAME_ATTRIBUTE) != null) { + hashedFields + .add(attributes.getNamedItem(NAME_ATTRIBUTE).getNodeValue()); + } + } + // Keeps list + fields.addAll(hashedFields); + XslIndexFilter.transformers.put(xsltFilePath, fields); + + return fields; + } + +} diff --git a/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslParseFilter.java b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslParseFilter.java new file mode 100644 index 
0000000000..0cb252f599 --- /dev/null +++ b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslParseFilter.java @@ -0,0 +1,272 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.xsl; + +import java.io.File; +import java.io.FileOutputStream; + +import javax.xml.bind.JAXB; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMResult; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.parse.HTMLMetaTags; +import org.apache.nutch.parse.HtmlParseFilter; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.xsl.xml.document.Documents; +import org.apache.nutch.parse.xsl.xml.document.TDocument; +import org.apache.nutch.parse.xsl.xml.document.TField; +import org.apache.nutch.protocol.Content; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.DocumentFragment; +import org.w3c.dom.Node; + +import com.sun.org.apache.xpath.internal.XPathAPI; + +/** + * This is a parse filter plugin (@see HtmlParseFilter) A class to 
apply an xsl + * transformation on an html page. Instead of coding java, a simple xpath can be + * used. + * + */ +public class XslParseFilter implements HtmlParseFilter { + + /** Specifies whether to use html parse TagSoup or NekoHtml */ + public enum PARSER { + /** TagSoup parser */ + TAGSOUP { + @Override + public String toString() { + return "tagsoup"; + } + }, + /** Neko parser */ + NEKO { + @Override + public String toString() { + return "neko"; + } + } + } + + /** + * The output of the transformation for debug purpose (log level "DEBUG" shall + * be activated) + */ + public static final String CONF_XSLT_OUTPUT_DEBUG_FILE = "parser.xsl.output.debug.file"; + + /** Whether to use Saxon or Standard JVM XSLT parser */ + public static final String CONF_XSLT_USE_SAXON = "parser.xsl.useSaxon"; + + /** + * Whether to use Neko or Tagsoup. + * + * @Warning this configuration property is set by Nutch and not by the current + * plugin. see HtmlParser + */ + public static final String CONF_HTML_PARSER = "parser.html.impl"; + + private static final Logger LOG = LoggerFactory + .getLogger(XslParseFilter.class); + + private Configuration conf; + + // The html parser to use (default is neko. Otherwise Tag Soup) + private String parser; + // The xsl parser to use (default from jvm or Saxon) + private boolean ifSaxonParser; + // Debug file to use + private String debugFile; + + // The XXX + private RulesManager manager; + + /** + * Default constructor forbidden. 
+ */ + public XslParseFilter() { + super(); + } + + /** + * @param content + * full content to parse + * @param parseResult + * result of the parse process + * @param metaTags + * metatags set in the document + * @param document + * the DOM document to parse + * @return the resulting {@link ParseResult} + */ + @Override + public ParseResult filter(Content content, ParseResult parseResult, + HTMLMetaTags metaTags, DocumentFragment document) { + + if (manager == null) { + // no RulesManager, nothing to do + return parseResult; + } + + Transformer transformer = manager.getTransformer(content.getUrl()); + if (transformer == null) { + return parseResult; + } + + try { + // We are selecting the HTML tag with a XPath to convert the + // DocumentFragment to a more natural + // HTML document that can be further processed with XSL. + // TODO applying an "html" xpath is a dirty trick to change. + String xpath = "html"; + + // For neko, all tags are UPPER CASE. + // For tagsoup, it is in lower case. + // This is decided by the html parser plugin + if (this.parser.equals(PARSER.NEKO.toString())) { + xpath = xpath.toUpperCase(); + } else { + // TODO Tag soup is not working. To be investigated. + throw new Exception("tag soup parser not implemented."); + } + + Node doc = XPathAPI.selectSingleNode(document, xpath); + + Parse parse = parseResult.get(content.getUrl()); + + DOMResult result = new DOMResult(); + // At this state, thanks to the HtmlParser that is using + // HtmlParseFilter interface, we got + // a DOM object properly built (with Neko or TagSoup). + transformer.transform(new DOMSource(doc), result); + + // Storing the xml output for debug purpose + if (LOG.isDebugEnabled() && this.debugFile != null) { + XslParseFilter.saveDOMOutput(doc, new File(debugFile)); + // XslParseFilter.saveDOMOutput(result.getNode(), new File(debugFile)); + } + + XslParseFilter.updateMetadata(result.getNode(), parse); + + } catch (Exception e) { + LOG.warn("Cannot extract HTML tags. 
The XSL processing will not be run.", + e); + } + + return parseResult; + } + + /** + * @param node + * the node that is used to provide metadata information. + * @param data + * the data to update This is a simple format like the following: + * Check the documents.xsd to figure out the structure. + */ + protected static void updateMetadata(Node node, Parse data) { + + Documents documents = JAXB.unmarshal(new DOMSource(node), Documents.class); + + // No document unmarshalled + if (documents == null) { + LOG.debug("No metadata to update"); + return; + } + + // Browsing documents + for (TDocument document : documents.getDocument()) { + + // There are metadata to process + for (TField field : document.getField()) { + String value = field.getValue(); + // Trim values by default + if (value != null) { + value = value.trim(); + // Do not keep string with 0 size + if (value.length() != 0) { + // Adds the meta to the parse meta list + data.getData().getParseMeta().add(field.getName(), value); + } + if (LOG.isDebugEnabled()) + LOG.debug("Content " + field.getName() + " has value: '" + value + + "'"); + } + } + } + + } + + /** + * + * @param node + * the DOM node to save. + * @param file + * the file where to write the DOM. 
+ */ + private static void saveDOMOutput(Node node, File file) { + FileOutputStream fos = null; + + try { + fos = new FileOutputStream(file); + + TransformerFactory.newInstance().newTransformer() + .transform(new DOMSource(node), new StreamResult(fos)); + } catch (Exception e) { + LOG.warn("Cannot store DOM node to file: " + file.getAbsolutePath(), e); + } finally { + if (fos != null) + try { + fos.close(); + } catch (Exception e) { + LOG.warn("Cannot close xml file stream.", e); + } + } + } + + @Override + public Configuration getConf() { + return conf; + } + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + + // Setting the parser from conf + parser = this.conf.get(CONF_HTML_PARSER, PARSER.NEKO.toString()); + // Setting the parser to use from conf + ifSaxonParser = this.conf.getBoolean(CONF_XSLT_USE_SAXON, false); + // Debug file to use + debugFile = this.conf.get(CONF_XSLT_OUTPUT_DEBUG_FILE); + + // TODO: use saxon for xslt 2.0 compliancy + if (this.ifSaxonParser) { + System.setProperty("javax.xml.transform.TransformerFactory", + "net.sf.saxon.TransformerFactoryImpl"); + } + + // create rules manager and load all configuration files + manager = new RulesManager(conf); + } + +} diff --git a/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/package-info.java b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/package-info.java new file mode 100644 index 0000000000..08d2442aca --- /dev/null +++ b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Parse and index filter to extract field content via XSL statements. + */ +package org.apache.nutch.parse.xsl; + diff --git a/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/AbstractCrawlTest.java b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/AbstractCrawlTest.java new file mode 100644 index 0000000000..4886c32cdd --- /dev/null +++ b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/AbstractCrawlTest.java @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.parse.xsl; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.text.NumberFormat; +import java.util.Date; + +import org.junit.Test; + +import org.apache.hadoop.conf.Configuration; +import org.apache.html.dom.HTMLDocumentImpl; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.HtmlParseFilter; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.html.DOMBuilder; +import org.apache.nutch.parse.xsl.XslParseFilter.PARSER; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NutchConfiguration; +import org.cyberneko.html.parsers.DOMFragmentParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.DocumentFragment; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +/** + * A class to group all classic methods to simulate a crawl without running + * Nutch like setting a configuration, providing a DocumentFragment, etc... All + * your tests related to parse-xsl shall extend this test. + * + * + */ +public abstract class AbstractCrawlTest { + + /** The logger used for current and derived classes */ + protected static final Logger LOG = LoggerFactory + .getLogger(AbstractCrawlTest.class); + + /** + * the configuration to use with current crawler Never access this property. 
@see + * AbstractCrawlTest#getConfiguration() + */ + private Configuration configuration = null; + + protected String sampleDir = System.getProperty("test.data", "."); + + private long startDate; + + /** + * @param parseFilter + * the filter to use + * @param filePath + * the file to crawl + * @param url + * the url that identifies the file to crawl (only used to set the + * unique key) + * @return the resulting content after the crawl + * @throws Exception + */ + protected ParseResult simulateCrawl(PARSER parseFilter, String filePath, + String url) throws Exception { + ParseResult result = null; + FileInputStream is = null; + try { + // Opening test file + File file = new File(filePath); + is = new FileInputStream(file); + byte[] bytes = new byte[0]; + + // Setting the void content + Content content = new Content(url, "", bytes, "text/html", + new Metadata(), this.getConfiguration()); + + // Parse document with related parser + DocumentFragment document = null; + if (parseFilter == PARSER.NEKO) { + document = parseNeko(new InputSource(is)); + + } else { + document = parseTagSoup(new InputSource(is)); + } + + // Creates a parser with dedicated method + HtmlParseFilter filter = new XslParseFilter(); + // Setting configuration + filter.setConf(this.getConfiguration()); + + ParseData data = new ParseData(); + + // Initializing the parse result + ParseResult parseResult = ParseResult.createParseResult(url, + new ParseImpl("no text", data)); + + // Extracting metadata + result = filter.filter(content, parseResult, null, document); + } catch (Exception e) { + throw new Exception("Cannot simulate crawl", e); + } finally { + if (is != null) { + try { + is.close(); + } catch (IOException e) { + LOG.error("Cannot close input stream", e); + } + } + } + return result; + } + + /** + * Constructs a an html DOM structure. + * + * @param input + * the html/xml input stream + * @return DocumentFragment the document that has been created. 
+ * @throws Exception + */ + protected static DocumentFragment parseTagSoup(InputSource input) + throws Exception { + HTMLDocumentImpl doc = new HTMLDocumentImpl(); + DocumentFragment frag = doc.createDocumentFragment(); + DOMBuilder builder = new DOMBuilder(doc, frag); + org.ccil.cowan.tagsoup.Parser reader = new org.ccil.cowan.tagsoup.Parser(); + reader.setContentHandler(builder); + reader.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true); + reader.setFeature(org.ccil.cowan.tagsoup.Parser.bogonsEmptyFeature, false); + reader + .setProperty("http://xml.org/sax/properties/lexical-handler", builder); + reader.parse(input); + return frag; + } + + /** + * Constructs a an html DOM structure. + * + * @param input + * the html/xml input stream + * @return DocumentFragment the document that has been created. + * @throws Exception + */ + protected static DocumentFragment parseNeko(InputSource input) + throws Exception { + DOMFragmentParser parser = new DOMFragmentParser(); + try { + parser + .setFeature( + "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", + true); + parser.setFeature("http://cyberneko.org/html/features/augmentations", + true); + parser.setProperty( + "http://cyberneko.org/html/properties/default-encoding", "UTF-8"); + parser + .setFeature( + "http://cyberneko.org/html/features/scanner/ignore-specified-charset", + true); + parser + .setFeature( + "http://cyberneko.org/html/features/balance-tags/ignore-outside-content", + false); + parser.setFeature( + "http://cyberneko.org/html/features/balance-tags/document-fragment", + true); + parser + .setFeature("http://cyberneko.org/html/features/balance-tags", true); + parser.setFeature("http://cyberneko.org/html/features/report-errors", + true); + parser.setProperty("http://cyberneko.org/html/properties/names/elems", + "lower"); + + System.out.println(LOG.isTraceEnabled()); + + } catch (SAXException e) { + LOG.error("Cannot set parser features", e); + } + // convert Document to 
DocumentFragment + HTMLDocumentImpl doc = new HTMLDocumentImpl(); + doc.setErrorChecking(false); + DocumentFragment res = doc.createDocumentFragment(); + DocumentFragment frag = doc.createDocumentFragment(); + parser.parse(input, frag); + res.appendChild(frag); + + try { + while (true) { + frag = doc.createDocumentFragment(); + parser.parse(input, frag); + if (!frag.hasChildNodes()) + break; + // if (LOG.isInfoEnabled()) { + LOG.info(" - new frag, " + frag.getChildNodes().getLength() + " nodes."); + System.out.println(" - new frag, " + frag.getChildNodes().getLength() + + " nodes."); + // } + res.appendChild(frag); + } + } catch (Exception e) { + LOG.error("Error: ", e); + System.out.println(e); + } + + return res; + } + + /** + * + * @return the current configuration. + */ + public Configuration getConfiguration() { + if (this.configuration == null) { + this.configuration = NutchConfiguration.create(); + } + return this.configuration; + } + + /** + * To display some memory related information. Can be used for benchmark test + */ + private void displayMemoryUsage() { + Runtime runtime = Runtime.getRuntime(); + + NumberFormat format = NumberFormat.getInstance(); + + long maxMemory = runtime.maxMemory(); + long allocatedMemory = runtime.totalMemory(); + long freeMemory = runtime.freeMemory(); + + System.out.println("free memory: " + format.format(freeMemory / 1024)); + System.out.println("allocated memory: " + + format.format(allocatedMemory / 1024)); + System.out.println("max memory: " + format.format(maxMemory / 1024)); + System.out.println("total free memory: " + + format.format((freeMemory + (maxMemory - allocatedMemory)) / 1024)); + } + + /** + * Can be called before each test to get the run test date. + */ + protected void startTest() { + System.out.println("Starting test..."); + this.displayMemoryUsage(); + this.startDate = new Date().getTime(); + } + + /** + * Can be called at the end of a test to evaluate the elapsed time. 
+ */ + private void endTest() { + this.displayMemoryUsage(); + System.out.println("Test took " + (new Date().getTime() - this.startDate) + + " ms"); + System.out.println("Test ended."); + } + +} diff --git a/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestParseTechnical.java b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestParseTechnical.java new file mode 100644 index 0000000000..3e76c01c3d --- /dev/null +++ b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestParseTechnical.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.parse.xsl; + +import static org.junit.Assert.*; + +import java.io.File; +import java.io.FileReader; + +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathFactory; + +import org.junit.Test; +import org.w3c.dom.DocumentFragment; +import org.w3c.dom.NodeList; +import org.xml.sax.InputSource; + +/** + * + */ +public class TestParseTechnical extends AbstractCrawlTest { + + /** + * Executes some xpath on neko parsed document + */ + @Test + public void testXpathNeko() { + try { + DocumentFragment doc = parseNeko(new InputSource( + new FileReader(new File(sampleDir, "sample1/book1.html")))); + XPath xpath = XPathFactory.newInstance().newXPath(); + NodeList result = (NodeList) xpath.compile("//DIV").evaluate(doc, + XPathConstants.NODESET); + assertNotNull(result); + assertEquals(3, result.getLength()); + System.out.println(result.getLength()); + result = (NodeList) xpath.compile("//HTML").evaluate(doc, + XPathConstants.NODESET); + assertNotNull(result); + System.out.println(result.getLength()); + assertEquals(1, result.getLength()); + } catch (Exception e) { + e.printStackTrace(); + } + } + + /** + * Executes some xpath on TagSoup parsed document + * TODO not working with TagSoup. Investigate why. 
+ */ + @Test + public void testXpathTagSoup() { + try { + DocumentFragment doc = parseTagSoup(new InputSource( + new FileReader(new File(sampleDir, "sample1/book1.html")))); + XPath xpath = XPathFactory.newInstance().newXPath(); + NodeList result = (NodeList) xpath.compile("//div").evaluate(doc, + XPathConstants.NODESET); + assertNotNull(result); + assertEquals(3, result.getLength()); + System.out.println(result.getLength()); + result = (NodeList) xpath.compile("//html").evaluate(doc, + XPathConstants.NODESET); + assertNotNull(result); + System.out.println(result.getLength()); + assertEquals(1, result.getLength()); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} diff --git a/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestSample1.java b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestSample1.java new file mode 100644 index 0000000000..b3a9396f03 --- /dev/null +++ b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestSample1.java @@ -0,0 +1,78 @@ +package org.apache.nutch.parse.xsl; + +import static org.junit.Assert.*; + +import java.io.File; + +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.ParseResult; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +import org.apache.nutch.parse.xsl.XslParseFilter.PARSER; +import org.junit.Test; + +/** + * + * This sample test will show you how to test the crawling of a page by + * simulating a crawl. All the thing that you have to do is to inherit from + * AbstractCrawlTest. + * + */ +public class TestSample1 extends AbstractCrawlTest { + + /** + * Loads the rules xml file that will route your transformers from urls. + */ + public TestSample1() { + this.getConfiguration().set(RulesManager.CONF_XML_RULES, "sample1/rules.xml"); + } + + @Test + public void testBook1() { + String url = "http://www.sample1.com/book?1245"; + + try { + ParseResult parseResult = simulateCrawl(PARSER.NEKO, + new File(sampleDir, "sample1/book1.html").toString(), url); + assertNotNull(parseResult); + + Metadata parsedMetadata = parseResult.get(url).getData().getParseMeta(); + // Asserts we have metadata + assertNotNull(parsedMetadata); + // Title check + assertEquals("Nutch for dummies", parsedMetadata.get("title")); + // Description check + assertEquals( + "The ultimate book to master all nutch powerful mechanisms !", + parsedMetadata.get("description")); + // Isbn check + assertEquals("123654987789", parsedMetadata.get("isbn")); + // Authors check + assertEquals("Mr Allan A.", parsedMetadata.getValues("author")[0]); + assertEquals("Mrs Mulan B.", parsedMetadata.getValues("author")[1]); + // Price check + assertEquals("free", parsedMetadata.get("price")); + // Collection check + assertEquals("Collection from nowhere", parsedMetadata.get("collection")); + + } catch (Exception e) { + e.printStackTrace(); + fail("testBook1 exception"); + } + } + +} diff --git a/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestXslIndexFilter.java b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestXslIndexFilter.java new file mode 100644 index 0000000000..3285770418 --- /dev/null +++ 
b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestXslIndexFilter.java
@@ -0,0 +1,50 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.parse.xsl;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;

import java.io.File;
import java.util.List;

import org.junit.Test;

/**
 * Testing the filter that will auto import fields defined in the xsl file.
 */
public class TestXslIndexFilter extends AbstractCrawlTest {

  /**
   * Tests the fields fetched from the sample xsl file.
   *
   * @throws Exception
   *           propagated so JUnit reports the cause — the original caught
   *           the exception and called a bare fail(), losing all detail
   */
  @Test
  public void testFields() throws Exception {
    XslIndexFilter filter = new XslIndexFilter();
    // Wildcard replaces the original raw List type without assuming the
    // element type declared by extractFields — TODO confirm against
    // XslIndexFilter and tighten if it is List<String>
    List<?> list = filter.extractFields(
        new File(sampleDir, "sample1/transformer_book.xsl").toString());
    assertNotNull(list);
    assertEquals(6, list.size());
  }
}