diff --git a/conf/parse-xsl-rules.xml.template b/conf/parse-xsl-rules.xml.template
new file mode 100644
index 0000000000..490896febe
--- /dev/null
+++ b/conf/parse-xsl-rules.xml.template
@@ -0,0 +1,18 @@
+
+
+
+
+
+
+
+
+
diff --git a/conf/parse-xsl-transform.xsl.template b/conf/parse-xsl-transform.xsl.template
new file mode 100644
index 0000000000..0f70131e77
--- /dev/null
+++ b/conf/parse-xsl-transform.xsl.template
@@ -0,0 +1,26 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/default.properties b/default.properties
index bb987d9666..cb48974585 100644
--- a/default.properties
+++ b/default.properties
@@ -213,4 +213,5 @@ plugins.misc=\
org.creativecommons.nutch*:\
org.apache.nutch.microformats.reltag*:\
-org.apache.nutch.any23*
-
+org.apache.nutch.any23*:\
+org.apache.nutch.parse.xsl*
+
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index d8826e88d9..4a303a96f3 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -66,6 +66,7 @@
+
@@ -134,6 +135,7 @@
+
@@ -210,6 +212,7 @@
+
diff --git a/src/plugin/parse-xsl/build.xml b/src/plugin/parse-xsl/build.xml
new file mode 100644
index 0000000000..8ab8e2b061
--- /dev/null
+++ b/src/plugin/parse-xsl/build.xml
@@ -0,0 +1,65 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/plugin/parse-xsl/conf/documents.xsd b/src/plugin/parse-xsl/conf/documents.xsd
new file mode 100644
index 0000000000..601672f3ec
--- /dev/null
+++ b/src/plugin/parse-xsl/conf/documents.xsd
@@ -0,0 +1,27 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/plugin/parse-xsl/conf/rules.xsd b/src/plugin/parse-xsl/conf/rules.xsd
new file mode 100644
index 0000000000..e0a1c5e5a5
--- /dev/null
+++ b/src/plugin/parse-xsl/conf/rules.xsd
@@ -0,0 +1,29 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/plugin/parse-xsl/ivy.xml b/src/plugin/parse-xsl/ivy.xml
new file mode 100644
index 0000000000..e85e8bcf0c
--- /dev/null
+++ b/src/plugin/parse-xsl/ivy.xml
@@ -0,0 +1,46 @@
+
+
+
+
+
+
+
+
+
+ Apache Nutch
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/plugin/parse-xsl/plugin.xml b/src/plugin/parse-xsl/plugin.xml
new file mode 100644
index 0000000000..ff14c0a5f0
--- /dev/null
+++ b/src/plugin/parse-xsl/plugin.xml
@@ -0,0 +1,49 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/plugin/parse-xsl/sample/sample1/book1.html b/src/plugin/parse-xsl/sample/sample1/book1.html
new file mode 100644
index 0000000000..fb8a491d9a
--- /dev/null
+++ b/src/plugin/parse-xsl/sample/sample1/book1.html
@@ -0,0 +1,38 @@
+
+
+
+Buy Nutch for dummies!
+
+
+
+
+Nutch for dummies
+
+
+The ultimate book to master all nutch powerful mechanisms !
+
+
+Isbn: 123654987789
+
+
+Authors
+- Mr Allan A.
+
- Mrs Mulan B.
+
+Price: free
+
+Collection from nowhere
+
+
+Other related books
+- Lucene explained to your grandmother
+
- How I met Solr?
+
- Feels better with Elastic Search
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/plugin/parse-xsl/sample/sample1/rules.xml b/src/plugin/parse-xsl/sample/sample1/rules.xml
new file mode 100644
index 0000000000..4142533eb5
--- /dev/null
+++ b/src/plugin/parse-xsl/sample/sample1/rules.xml
@@ -0,0 +1,26 @@
+
+
+
+
+
+
+
+
+
+
diff --git a/src/plugin/parse-xsl/sample/sample1/transformer_book.xsl b/src/plugin/parse-xsl/sample/sample1/transformer_book.xsl
new file mode 100644
index 0000000000..b13b1b64d2
--- /dev/null
+++ b/src/plugin/parse-xsl/sample/sample1/transformer_book.xsl
@@ -0,0 +1,66 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/RulesManager.java b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/RulesManager.java
new file mode 100644
index 0000000000..63a68db42f
--- /dev/null
+++ b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/RulesManager.java
@@ -0,0 +1,235 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.xsl;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;
+
+import javax.xml.bind.JAXB;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.stream.StreamSource;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.parse.xsl.xml.rule.Rules;
+import org.apache.nutch.parse.xsl.xml.rule.TRule;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Manage a set of Transformers. It allows to avoid having several instances of
+ * Transformers with XSL to load each time for performance matter. The decision
+ * to use a given Transformer is determined by a set DO NOT make this class a
+ * singleton otherwise it will produce thread safety problems related to Xsl
+ * transformers not thread safe.
+ *
+ * @see Transformer
+ *
+ */
+public class RulesManager {
+
+ /** All the rules used to determine which xsl parser to use */
+ protected Rules rules = null;
+
+ /**
+ * Transformer factory. Thread-local because {@link TransformerFactory}
+ * "is NOT guaranteed to be thread safe".
+ */
+ protected ThreadLocal factory = new ThreadLocal() {
+ @Override
+ protected TransformerFactory initialValue() {
+ return TransformerFactory.newInstance();
+ }
+ };
+
+ /** A RuleTransformer holds transformations defined in one XSLT file. */
+ protected class RuleTransformer {
+
+ String xslFile;
+ byte[] xslSource;
+ ThreadLocal transformer = new ThreadLocal() {
+ @Override
+ protected Transformer initialValue() {
+ ByteArrayInputStream input = new ByteArrayInputStream(xslSource);
+ StreamSource streamSource = new StreamSource(input);
+ Transformer t = null;
+ try {
+ t = factory.get().newTransformer(streamSource);
+ } catch (TransformerConfigurationException e) {
+ LOG.warn("Failed to create transformer for xsl file {}: {}", xslFile,
+ StringUtils.stringifyException(e));
+ }
+ return t;
+ }
+ };
+
+ public RuleTransformer(Configuration conf, String xslFile)
+ throws IOException {
+ this.xslFile = xslFile;
+ InputStream stream = conf.getConfResourceAsInputStream(xslFile);
+ xslSource = IOUtils.toByteArray(stream);
+ }
+
+ public Transformer getTransformer() {
+ return transformer.get();
+ }
+
+ }
+
+ /** A map containing all transformers given their file name as key */
+ protected Map transformers = new HashMap();
+
+ /** The XSLT file to use for transformation */
+ public static final String CONF_XML_RULES = "parser.xsl.rulesFile";
+
+ private static final Logger LOG = LoggerFactory.getLogger(RulesManager.class);
+
+ /**
+ * Default constructor forbidden.
+ */
+ @SuppressWarnings("unused")
+ private RulesManager() {
+ }
+
+ /**
+ * Instantiates an object using the Nutch/Hadoop {@link Configuration}
+ * containing the property defining the rules. All rules and transformation
+ * files are load from the class path.
+ *
+ * @param conf
+ * configuration
+ */
+ public RulesManager(Configuration conf) {
+
+ String rulesFile = conf.get(RulesManager.CONF_XML_RULES);
+ if (rulesFile != null) {
+ Reader rulesXmlReader = conf.getConfResourceAsReader(rulesFile);
+
+ if (rulesXmlReader != null) {
+ LOG.debug("Reading parse-xsl rules file `{}'", rulesFile);
+ rules = JAXB.unmarshal(rulesXmlReader, Rules.class);
+
+ // load transformation files
+ for (TRule rule : rules.getRule()) {
+ final String xslFile = rule.getTransformer().getFile();
+
+ if (xslFile != null) {
+ LOG.debug("Reading parse-xsl transformation file `{}'", xslFile);
+ try {
+ RuleTransformer rt = new RuleTransformer(conf, xslFile);
+ transformers.put(xslFile, rt);
+ } catch (IOException e) {
+ LOG.error("Failed to read parse-xsl transformation file {}: {}",
+ xslFile, StringUtils.stringifyException(e));
+ }
+ }
+ }
+
+ } else {
+ LOG.error(
+ "Failed to open parse-xsl rules file `{}' defined by property {}",
+ rulesFile, RulesManager.CONF_XML_RULES);
+ LOG.error(System.getProperty("java.class.path"));
+ }
+
+ } else {
+ LOG.warn("Plugin parse-xsl active but no rules file defined!");
+ }
+ }
+
+ /**
+ * Match URL against regular expressions to assign it to a transformer file.
+ *
+ * @param url
+ * the URL to filter
+ * @return the transformer file path that matches the rules or null if no rule
+ * does match
+ */
+ public String getTransformerFilePath(String url) {
+
+ String xslFile = null;
+
+ if (rules == null) {
+ // no rules defined
+ return xslFile;
+ }
+
+ // Search for a matching rule by applying defined regex
+ // The first matching rule will be applied
+ for (TRule rule : rules.getRule()) {
+ if (url.matches(rule.getMatches())) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(String.format("Url %s is matching regex rule %s", url,
+ rule.getMatches()));
+ }
+ xslFile = rule.getTransformer().getFile();
+
+ break;
+ }
+ }
+ if (xslFile == null) {
+ LOG.debug("No filter found for url: {}", url);
+ }
+
+ return xslFile;
+ }
+
+ /**
+ * Get the first transformer matching a URL.
+ *
+ * @param url
+ * the url to filter
+ * @return the transformer that suits the rules
+ * @throws Exception
+ */
+ public Transformer getTransformer(String url) {
+ Transformer transformer = null;
+ String xslFile = getTransformerFilePath(url);
+ if (xslFile != null) {
+ return transformers.get(xslFile).getTransformer();
+ }
+ return transformer;
+ }
+
+ /**
+ * Check whether a URL matches any rule.
+ *
+ * @param url
+ * the URL to test match in rules file
+ * @return true if the URL is matching any rule.
+ * @throws Exception
+ */
+ public boolean matches(String url) throws Exception {
+ return this.getTransformerFilePath(url) != null;
+ }
+
+ /**
+ * @return the current set of rules defined in the xml file
+ */
+ public Rules getRules() {
+ return rules;
+ }
+
+}
diff --git a/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslIndexFilter.java b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslIndexFilter.java
new file mode 100644
index 0000000000..f1cc663b2e
--- /dev/null
+++ b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslIndexFilter.java
@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.xsl;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.NodeList;
+
+import com.sun.org.apache.xpath.internal.XPathAPI;
+
+/**
+ * This class allows to:
+ *
+ * - index automatically fields defined in rules file.
+ *
- exclude urls that are not declared in the rules file.
+ */
+public class XslIndexFilter implements IndexingFilter {
+
+ private static final String NAME_ATTRIBUTE = "name";
+
+ private static final String FIELD_TAG = "//field";
+
+ private Configuration conf;
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(XslParseFilter.class);
+
+ private static HashMap> transformers = new HashMap>();
+
+ // Rules file to use
+ private String rulesFile;
+
+ // The XXX
+ private RulesManager manager;
+
+ /**
+ * @return the current configuration.
+ */
+ @Override
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+ /**
+ * Sets the current configuration.
+ */
+ @Override
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+
+ // Getting rules file
+ this.rulesFile = this.conf.get(RulesManager.CONF_XML_RULES);
+
+ // create rules manager and load all configuration files
+ manager = new RulesManager(conf);
+ }
+
+ @Override
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+ CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+ NutchDocument result = null;
+ if (doc == null)
+ return result;
+
+ try {
+
+ // Getting transformer file path associated to rule if exists
+ String xsltFilePath = null;
+ try {
+ xsltFilePath = manager.getTransformerFilePath(url.toString());
+ } catch (Exception e) {
+ LOG.info("Xslt not found");
+ }
+
+ // The url matches a rule, we keep it
+ if (xsltFilePath != null) {
+ // We keep the document
+ result = doc;
+ List fields = XslIndexFilter.transformers.get(xsltFilePath);
+ // List was never loaded
+ if (fields == null) {
+ fields = this.extractFields(xsltFilePath);
+ }
+
+ // All the fields defined in the xsl file will be put directly
+ // into the Nutch document
+ // Fields defined by the xsl plugin are only stored in parse
+ // meta.
+ if (parse != null && parse.getData() != null
+ && parse.getData().getParseMeta() != null) {
+ for (String field : fields) {
+ for (String value : parse.getData().getParseMeta().getValues(field)) {
+ doc.add(field, value);
+ }
+ }
+ }
+
+ }
+ // The document is indexed anyway because explicitly decided
+ else if (!manager.getRules().isFilterUrlsWithNoRule()) {
+ result = doc;
+ LOG.info("The url "
+ + url.toString()
+ + " has been kept because it has been explicitly specified in the rules");
+ }
+ // The document is not indexed
+ else {
+ LOG.info("The url " + url.toString()
+ + " has been filtered because no xsl file fits the defined rules");
+ }
+
+ } catch (Exception e) {
+ String message = "Cannot index data";
+ if (url != null && url.toString() != null) {
+ message += " from " + url.toString();
+ }
+ LOG.error(message, e);
+ }
+
+ return result;
+ }
+
+ /**
+ *
+ * @param xsltFilePath
+ * the path of the xsl file
+ * @return the list of fields defined in xsl file
+ * @throws Exception
+ */
+ protected List extractFields(String xsltFilePath) throws Exception {
+ List fields = new ArrayList();
+ // Creating xsl DOM document
+ Document document = DocumentBuilderFactory.newInstance()
+ .newDocumentBuilder().parse(new File(xsltFilePath));
+ NodeList list = XPathAPI.selectNodeList(document, FIELD_TAG);
+ HashSet hashedFields = new HashSet();
+ // Populating list
+ for (int i = 0; i < list.getLength(); i++) {
+ NamedNodeMap attributes = list.item(i).getAttributes();
+ if (attributes != null && attributes.getNamedItem(NAME_ATTRIBUTE) != null) {
+ hashedFields
+ .add(attributes.getNamedItem(NAME_ATTRIBUTE).getNodeValue());
+ }
+ }
+ // Keeps list
+ fields.addAll(hashedFields);
+ XslIndexFilter.transformers.put(xsltFilePath, fields);
+
+ return fields;
+ }
+
+}
diff --git a/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslParseFilter.java b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslParseFilter.java
new file mode 100644
index 0000000000..0cb252f599
--- /dev/null
+++ b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslParseFilter.java
@@ -0,0 +1,272 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.xsl;
+
+import java.io.File;
+import java.io.FileOutputStream;
+
+import javax.xml.bind.JAXB;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMResult;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.xsl.xml.document.Documents;
+import org.apache.nutch.parse.xsl.xml.document.TDocument;
+import org.apache.nutch.parse.xsl.xml.document.TField;
+import org.apache.nutch.protocol.Content;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Node;
+
+import com.sun.org.apache.xpath.internal.XPathAPI;
+
+/**
+ * This is a parse filter plugin (@see HtmlParseFilter) A class to apply an xsl
+ * transformation on an html page. Instead of coding java, a simple xpath can be
+ * used.
+ *
+ */
public class XslParseFilter implements HtmlParseFilter {

  /** Specifies whether to use html parse TagSoup or NekoHtml */
  public enum PARSER {
    /** TagSoup parser */
    TAGSOUP {
      @Override
      public String toString() {
        return "tagsoup";
      }
    },
    /** Neko parser */
    NEKO {
      @Override
      public String toString() {
        return "neko";
      }
    }
  }

  /**
   * The output of the transformation for debug purpose (log level "DEBUG" shall
   * be activated)
   */
  public static final String CONF_XSLT_OUTPUT_DEBUG_FILE = "parser.xsl.output.debug.file";

  /** Whether to use Saxon or Standard JVM XSLT parser */
  public static final String CONF_XSLT_USE_SAXON = "parser.xsl.useSaxon";

  /**
   * Whether to use Neko or Tagsoup.
   *
   * NOTE: this configuration property is set by Nutch and not by the current
   * plugin. See HtmlParser.
   */
  public static final String CONF_HTML_PARSER = "parser.html.impl";

  private static final Logger LOG = LoggerFactory
      .getLogger(XslParseFilter.class);

  private Configuration conf;

  // The html parser to use (default is neko. Otherwise Tag Soup)
  private String parser;
  // The xsl parser to use (default from jvm or Saxon)
  private boolean ifSaxonParser;
  // Debug file to use (null disables the debug DOM dump)
  private String debugFile;

  // Resolves which XSL transformer applies to a given URL; created in setConf
  private RulesManager manager;

  /**
   * Default constructor (required for plugin instantiation; configuration is
   * provided later via {@link #setConf(Configuration)}).
   */
  public XslParseFilter() {
    super();
  }

  /**
   * Applies the XSL transformation matching the content URL (if any) to the
   * parsed DOM and stores the extracted fields in the parse metadata. Returns
   * the parse result unchanged when no rules manager is configured or no rule
   * matches the URL.
   *
   * @param content
   *          full content to parse
   * @param parseResult
   *          result of the parse process
   * @param metaTags
   *          metatags set in the document
   * @param document
   *          the DOM document to parse
   * @return the resulting {@link ParseResult}
   */
  @Override
  public ParseResult filter(Content content, ParseResult parseResult,
      HTMLMetaTags metaTags, DocumentFragment document) {

    if (manager == null) {
      // no RulesManager, nothing to do
      return parseResult;
    }

    Transformer transformer = manager.getTransformer(content.getUrl());
    if (transformer == null) {
      // no rule matches this URL (or the XSLT could not be loaded)
      return parseResult;
    }

    try {
      // We are selecting the HTML tag with a XPath to convert the
      // DocumentFragment to a more natural
      // HTML document that can be further processed with XSL.
      // TODO applying an "html" xpath is a dirty trick to change.
      String xpath = "html";

      // For neko, all tags are UPPER CASE.
      // For tagsoup, it is in lower case.
      // This is decided by the html parser plugin
      if (this.parser.equals(PARSER.NEKO.toString())) {
        xpath = xpath.toUpperCase();
      } else {
        // TODO Tag soup is not working. To be investigated.
        throw new Exception("tag soup parser not implemented.");
      }

      Node doc = XPathAPI.selectSingleNode(document, xpath);

      Parse parse = parseResult.get(content.getUrl());

      DOMResult result = new DOMResult();
      // At this state, thanks to the HtmlParser that is using
      // HtmlParseFilter interface, we got
      // a DOM object properly built (with Neko or TagSoup).
      transformer.transform(new DOMSource(doc), result);

      // Storing the xml output for debug purpose
      if (LOG.isDebugEnabled() && this.debugFile != null) {
        XslParseFilter.saveDOMOutput(doc, new File(debugFile));
        // XslParseFilter.saveDOMOutput(result.getNode(), new File(debugFile));
      }

      // copy the fields produced by the stylesheet into the parse metadata
      XslParseFilter.updateMetadata(result.getNode(), parse);

    } catch (Exception e) {
      LOG.warn("Cannot extract HTML tags. The XSL processing will not be run.",
          e);
    }

    return parseResult;
  }

  /**
   * Unmarshals the XSL transformation output (a simple documents/fields XML,
   * see documents.xsd) and adds each non-empty, trimmed field value to the
   * parse metadata.
   *
   * @param node
   *          the node that is used to provide metadata information.
   * @param data
   *          the data to update. This is a simple format like the following:
   *          Check the documents.xsd to figure out the structure.
   */
  protected static void updateMetadata(Node node, Parse data) {

    Documents documents = JAXB.unmarshal(new DOMSource(node), Documents.class);

    // No document unmarshalled
    if (documents == null) {
      LOG.debug("No metadata to update");
      return;
    }

    // Browsing documents
    for (TDocument document : documents.getDocument()) {

      // There are metadata to process
      for (TField field : document.getField()) {
        String value = field.getValue();
        // Trim values by default
        if (value != null) {
          value = value.trim();
          // Do not keep string with 0 size
          if (value.length() != 0) {
            // Adds the meta to the parse meta list
            data.getData().getParseMeta().add(field.getName(), value);
          }
          if (LOG.isDebugEnabled())
            LOG.debug("Content " + field.getName() + " has value: '" + value
                + "'");
        }
      }
    }

  }

  /**
   * Serializes a DOM node to a file (debug helper; failures are logged, not
   * propagated).
   *
   * @param node
   *          the DOM node to save.
   * @param file
   *          the file where to write the DOM.
   */
  private static void saveDOMOutput(Node node, File file) {
    FileOutputStream fos = null;

    try {
      fos = new FileOutputStream(file);

      // identity transform: DOM in, serialized XML out
      TransformerFactory.newInstance().newTransformer()
          .transform(new DOMSource(node), new StreamResult(fos));
    } catch (Exception e) {
      LOG.warn("Cannot store DOM node to file: " + file.getAbsolutePath(), e);
    } finally {
      if (fos != null)
        try {
          fos.close();
        } catch (Exception e) {
          LOG.warn("Cannot close xml file stream.", e);
        }
    }
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  /**
   * Stores the configuration, reads the plugin properties and creates the
   * rules manager (which loads all rule and transformation files).
   */
  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;

    // Setting the parser from conf
    parser = this.conf.get(CONF_HTML_PARSER, PARSER.NEKO.toString());
    // Setting the parser to use from conf
    ifSaxonParser = this.conf.getBoolean(CONF_XSLT_USE_SAXON, false);
    // Debug file to use
    debugFile = this.conf.get(CONF_XSLT_OUTPUT_DEBUG_FILE);

    // TODO: use saxon for xslt 2.0 compliancy
    if (this.ifSaxonParser) {
      // global JVM-wide switch: affects every TransformerFactory.newInstance()
      System.setProperty("javax.xml.transform.TransformerFactory",
          "net.sf.saxon.TransformerFactoryImpl");
    }

    // create rules manager and load all configuration files
    manager = new RulesManager(conf);
  }

}
diff --git a/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/package-info.java b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/package-info.java
new file mode 100644
index 0000000000..08d2442aca
--- /dev/null
+++ b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parse and index filter to extract field content via XSL statements.
+ */
+package org.apache.nutch.parse.xsl;
+
diff --git a/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/AbstractCrawlTest.java b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/AbstractCrawlTest.java
new file mode 100644
index 0000000000..4886c32cdd
--- /dev/null
+++ b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/AbstractCrawlTest.java
@@ -0,0 +1,274 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.xsl;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.text.NumberFormat;
+import java.util.Date;
+
+import org.junit.Test;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.html.dom.HTMLDocumentImpl;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.html.DOMBuilder;
+import org.apache.nutch.parse.xsl.XslParseFilter.PARSER;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+import org.cyberneko.html.parsers.DOMFragmentParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.DocumentFragment;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+/**
+ * A class to group all classic methods to simulate a crawl without running
+ * Nutch like setting a configuration, providing a DocumentFragment, etc... All
+ * your tests related to parse-xsl shall extend this test.
+ *
+ *
+ */
+public abstract class AbstractCrawlTest {
+
+ /** The logger used for current and derived classes */
+ protected static final Logger LOG = LoggerFactory
+ .getLogger(AbstractCrawlTest.class);
+
+ /**
+ * the configuration to use with current crawler Never access this property. @see
+ * AbstractCrawlTest#getConfiguration()
+ */
+ private Configuration configuration = null;
+
+ protected String sampleDir = System.getProperty("test.data", ".");
+
+ private long startDate;
+
+ /**
+ * @param parseFilter
+ * the filter to use
+ * @param filePath
+ * the file to crawl
+ * @param url
+ * the url that identifies the file to crawl (only used to set the
+ * unique key)
+ * @return the resulting content after the crawl
+ * @throws Exception
+ */
+ protected ParseResult simulateCrawl(PARSER parseFilter, String filePath,
+ String url) throws Exception {
+ ParseResult result = null;
+ FileInputStream is = null;
+ try {
+ // Opening test file
+ File file = new File(filePath);
+ is = new FileInputStream(file);
+ byte[] bytes = new byte[0];
+
+ // Setting the void content
+ Content content = new Content(url, "", bytes, "text/html",
+ new Metadata(), this.getConfiguration());
+
+ // Parse document with related parser
+ DocumentFragment document = null;
+ if (parseFilter == PARSER.NEKO) {
+ document = parseNeko(new InputSource(is));
+
+ } else {
+ document = parseTagSoup(new InputSource(is));
+ }
+
+ // Creates a parser with dedicated method
+ HtmlParseFilter filter = new XslParseFilter();
+ // Setting configuration
+ filter.setConf(this.getConfiguration());
+
+ ParseData data = new ParseData();
+
+ // Initializing the parse result
+ ParseResult parseResult = ParseResult.createParseResult(url,
+ new ParseImpl("no text", data));
+
+ // Extracting metadata
+ result = filter.filter(content, parseResult, null, document);
+ } catch (Exception e) {
+ throw new Exception("Cannot simulate crawl", e);
+ } finally {
+ if (is != null) {
+ try {
+ is.close();
+ } catch (IOException e) {
+ LOG.error("Cannot close input stream", e);
+ }
+ }
+ }
+ return result;
+ }
+
+ /**
+ * Constructs a an html DOM structure.
+ *
+ * @param input
+ * the html/xml input stream
+ * @return DocumentFragment the document that has been created.
+ * @throws Exception
+ */
+ protected static DocumentFragment parseTagSoup(InputSource input)
+ throws Exception {
+ HTMLDocumentImpl doc = new HTMLDocumentImpl();
+ DocumentFragment frag = doc.createDocumentFragment();
+ DOMBuilder builder = new DOMBuilder(doc, frag);
+ org.ccil.cowan.tagsoup.Parser reader = new org.ccil.cowan.tagsoup.Parser();
+ reader.setContentHandler(builder);
+ reader.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
+ reader.setFeature(org.ccil.cowan.tagsoup.Parser.bogonsEmptyFeature, false);
+ reader
+ .setProperty("http://xml.org/sax/properties/lexical-handler", builder);
+ reader.parse(input);
+ return frag;
+ }
+
+ /**
+ * Constructs a an html DOM structure.
+ *
+ * @param input
+ * the html/xml input stream
+ * @return DocumentFragment the document that has been created.
+ * @throws Exception
+ */
+ protected static DocumentFragment parseNeko(InputSource input)
+ throws Exception {
+ DOMFragmentParser parser = new DOMFragmentParser();
+ try {
+ parser
+ .setFeature(
+ "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+ true);
+ parser.setFeature("http://cyberneko.org/html/features/augmentations",
+ true);
+ parser.setProperty(
+ "http://cyberneko.org/html/properties/default-encoding", "UTF-8");
+ parser
+ .setFeature(
+ "http://cyberneko.org/html/features/scanner/ignore-specified-charset",
+ true);
+ parser
+ .setFeature(
+ "http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
+ false);
+ parser.setFeature(
+ "http://cyberneko.org/html/features/balance-tags/document-fragment",
+ true);
+ parser
+ .setFeature("http://cyberneko.org/html/features/balance-tags", true);
+ parser.setFeature("http://cyberneko.org/html/features/report-errors",
+ true);
+ parser.setProperty("http://cyberneko.org/html/properties/names/elems",
+ "lower");
+
+ System.out.println(LOG.isTraceEnabled());
+
+ } catch (SAXException e) {
+ LOG.error("Cannot set parser features", e);
+ }
+ // convert Document to DocumentFragment
+ HTMLDocumentImpl doc = new HTMLDocumentImpl();
+ doc.setErrorChecking(false);
+ DocumentFragment res = doc.createDocumentFragment();
+ DocumentFragment frag = doc.createDocumentFragment();
+ parser.parse(input, frag);
+ res.appendChild(frag);
+
+ try {
+ while (true) {
+ frag = doc.createDocumentFragment();
+ parser.parse(input, frag);
+ if (!frag.hasChildNodes())
+ break;
+ // if (LOG.isInfoEnabled()) {
+ LOG.info(" - new frag, " + frag.getChildNodes().getLength() + " nodes.");
+ System.out.println(" - new frag, " + frag.getChildNodes().getLength()
+ + " nodes.");
+ // }
+ res.appendChild(frag);
+ }
+ } catch (Exception e) {
+ LOG.error("Error: ", e);
+ System.out.println(e);
+ }
+
+ return res;
+ }
+
+ /**
+ *
+ * @return the current configuration.
+ */
+ public Configuration getConfiguration() {
+ if (this.configuration == null) {
+ this.configuration = NutchConfiguration.create();
+ }
+ return this.configuration;
+ }
+
+ /**
+ * To display some memory related information. Can be used for benchmark test
+ */
+ private void displayMemoryUsage() {
+ Runtime runtime = Runtime.getRuntime();
+
+ NumberFormat format = NumberFormat.getInstance();
+
+ long maxMemory = runtime.maxMemory();
+ long allocatedMemory = runtime.totalMemory();
+ long freeMemory = runtime.freeMemory();
+
+ System.out.println("free memory: " + format.format(freeMemory / 1024));
+ System.out.println("allocated memory: "
+ + format.format(allocatedMemory / 1024));
+ System.out.println("max memory: " + format.format(maxMemory / 1024));
+ System.out.println("total free memory: "
+ + format.format((freeMemory + (maxMemory - allocatedMemory)) / 1024));
+ }
+
+ /**
+ * Can be called before each test to get the run test date.
+ */
+ protected void startTest() {
+ System.out.println("Starting test...");
+ this.displayMemoryUsage();
+ this.startDate = new Date().getTime();
+ }
+
+ /**
+ * Can be called at the end of a test to evaluate the elapsed time.
+ */
+ private void endTest() {
+ this.displayMemoryUsage();
+ System.out.println("Test took " + (new Date().getTime() - this.startDate)
+ + " ms");
+ System.out.println("Test ended.");
+ }
+
+}
diff --git a/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestParseTechnical.java b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestParseTechnical.java
new file mode 100644
index 0000000000..3e76c01c3d
--- /dev/null
+++ b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestParseTechnical.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.xsl;
+
+import static org.junit.Assert.*;
+
+import java.io.File;
+import java.io.FileReader;
+
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathFactory;
+
+import org.junit.Test;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+
+/**
+ *
+ */
+public class TestParseTechnical extends AbstractCrawlTest {
+
+ /**
+ * Executes some xpath on neko parsed document
+ */
+ @Test
+ public void testXpathNeko() {
+ try {
+ DocumentFragment doc = parseNeko(new InputSource(
+ new FileReader(new File(sampleDir, "sample1/book1.html"))));
+ XPath xpath = XPathFactory.newInstance().newXPath();
+ NodeList result = (NodeList) xpath.compile("//DIV").evaluate(doc,
+ XPathConstants.NODESET);
+ assertNotNull(result);
+ assertEquals(3, result.getLength());
+ System.out.println(result.getLength());
+ result = (NodeList) xpath.compile("//HTML").evaluate(doc,
+ XPathConstants.NODESET);
+ assertNotNull(result);
+ System.out.println(result.getLength());
+ assertEquals(1, result.getLength());
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+
+ /**
+ * Executes some xpath on TagSoup parsed document
+ * TODO not working with TagSoup. Investigate why.
+ */
+ @Test
+ public void testXpathTagSoup() {
+ try {
+ DocumentFragment doc = parseTagSoup(new InputSource(
+ new FileReader(new File(sampleDir, "sample1/book1.html"))));
+ XPath xpath = XPathFactory.newInstance().newXPath();
+ NodeList result = (NodeList) xpath.compile("//div").evaluate(doc,
+ XPathConstants.NODESET);
+ assertNotNull(result);
+ assertEquals(3, result.getLength());
+ System.out.println(result.getLength());
+ result = (NodeList) xpath.compile("//html").evaluate(doc,
+ XPathConstants.NODESET);
+ assertNotNull(result);
+ System.out.println(result.getLength());
+ assertEquals(1, result.getLength());
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+
+}
diff --git a/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestSample1.java b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestSample1.java
new file mode 100644
index 0000000000..b3a9396f03
--- /dev/null
+++ b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestSample1.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.xsl;
+
+import static org.junit.Assert.*;
+
+import java.io.File;
+
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.xsl.XslParseFilter.PARSER;
+import org.junit.Test;
+
+/**
+ *
+ * This sample test will show you how to test the crawling of a page by
+ * simulating a crawl. All the thing that you have to do is to inherit from
+ * AbstractCrawlTest.
+ *
+ */
+public class TestSample1 extends AbstractCrawlTest {
+
+  /**
+   * Loads the rules xml file that will route your transformers from urls.
+   */
+  public TestSample1() {
+    this.getConfiguration().set(RulesManager.CONF_XML_RULES, "sample1/rules.xml");
+  }
+
+  /**
+   * Simulates a crawl of a sample book page and checks every metadata field
+   * produced by the XSL transformer.
+   */
+  @Test
+  public void testBook1() {
+    String url = "http://www.sample1.com/book?1245";
+
+    try {
+      ParseResult parseResult = simulateCrawl(PARSER.NEKO,
+          new File(sampleDir, "sample1/book1.html").toString(), url);
+      assertNotNull(parseResult);
+
+      Metadata parsedMetadata = parseResult.get(url).getData().getParseMeta();
+      // Asserts we have metadata
+      assertNotNull(parsedMetadata);
+      // Title check
+      assertEquals("Nutch for dummies", parsedMetadata.get("title"));
+      // Description check
+      assertEquals(
+          "The ultimate book to master all nutch powerful mechanisms !",
+          parsedMetadata.get("description"));
+      // Isbn check
+      assertEquals("123654987789", parsedMetadata.get("isbn"));
+      // Authors check (multi-valued field, order matters)
+      assertEquals("Mr Allan A.", parsedMetadata.getValues("author")[0]);
+      assertEquals("Mrs Mulan B.", parsedMetadata.getValues("author")[1]);
+      // Price check
+      assertEquals("free", parsedMetadata.get("price"));
+      // Collection check
+      assertEquals("Collection from nowhere", parsedMetadata.get("collection"));
+
+    } catch (Exception e) {
+      // Include the cause in the failure message so the build log shows why.
+      e.printStackTrace();
+      fail("testBook1 exception: " + e);
+    }
+  }
+
+}
diff --git a/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestXslIndexFilter.java b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestXslIndexFilter.java
new file mode 100644
index 0000000000..3285770418
--- /dev/null
+++ b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestXslIndexFilter.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.xsl;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.fail;
+
+import java.io.File;
+import java.util.List;
+
+import org.junit.Test;
+
+/**
+ *
+ * Testing the filter that will auto import fields defined in the xsl file.
+ *
+ */
+public class TestXslIndexFilter extends AbstractCrawlTest {
+
+ /**
+ * Test the fields fetch from xsl file.
+ */
+ @Test
+ public void testFields() {
+ XslIndexFilter filter = new XslIndexFilter();
+ try {
+ List list = filter.extractFields(
+ new File(sampleDir, "sample1/transformer_book.xsl").toString());
+ assertNotNull(list);
+ assertEquals(6, list.size());
+ } catch (Exception e) {
+ fail();
+ }
+ }
+}