From c4c059b93cb75d365b3c964f2c95447b682a8526 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 10 Mar 2016 23:19:14 +0100 Subject: [PATCH] NUTCH-1870 XSL parse filter - apply patch contributed by @albinscode - load configuration files from classpath - address thread-safety --- build.xml | 12 + conf/nutch-default.xml | 16 + conf/parse-xsl-rules.xml.template | 18 ++ conf/parse-xsl-transform.xsl.template | 26 ++ default.properties | 3 +- src/plugin/build.xml | 3 + src/plugin/parse-xsl/build.xml | 65 +++++ src/plugin/parse-xsl/conf/documents.xsd | 27 ++ src/plugin/parse-xsl/conf/rules.xsd | 29 ++ src/plugin/parse-xsl/ivy.xml | 46 +++ src/plugin/parse-xsl/plugin.xml | 49 ++++ .../parse-xsl/sample/sample1/book1.html | 38 +++ src/plugin/parse-xsl/sample/sample1/rules.xml | 26 ++ .../sample/sample1/transformer_book.xsl | 66 +++++ .../apache/nutch/parse/xsl/RulesManager.java | 235 +++++++++++++++ .../nutch/parse/xsl/XslIndexFilter.java | 185 ++++++++++++ .../nutch/parse/xsl/XslParseFilter.java | 272 +++++++++++++++++ .../apache/nutch/parse/xsl/package-info.java | 22 ++ .../nutch/parse/xsl/AbstractCrawlTest.java | 274 ++++++++++++++++++ .../nutch/parse/xsl/TestParseTechnical.java | 87 ++++++ .../apache/nutch/parse/xsl/TestSample1.java | 78 +++++ .../nutch/parse/xsl/TestXslIndexFilter.java | 50 ++++ 22 files changed, 1626 insertions(+), 1 deletion(-) create mode 100644 conf/parse-xsl-rules.xml.template create mode 100644 conf/parse-xsl-transform.xsl.template create mode 100644 src/plugin/parse-xsl/build.xml create mode 100644 src/plugin/parse-xsl/conf/documents.xsd create mode 100644 src/plugin/parse-xsl/conf/rules.xsd create mode 100644 src/plugin/parse-xsl/ivy.xml create mode 100644 src/plugin/parse-xsl/plugin.xml create mode 100644 src/plugin/parse-xsl/sample/sample1/book1.html create mode 100644 src/plugin/parse-xsl/sample/sample1/rules.xml create mode 100644 src/plugin/parse-xsl/sample/sample1/transformer_book.xsl create mode 100644 
src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/RulesManager.java create mode 100644 src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslIndexFilter.java create mode 100644 src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslParseFilter.java create mode 100644 src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/package-info.java create mode 100644 src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/AbstractCrawlTest.java create mode 100644 src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestParseTechnical.java create mode 100644 src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestSample1.java create mode 100644 src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestXslIndexFilter.java diff --git a/build.xml b/build.xml index 65e8f3fcec..d51cd7aaae 100644 --- a/build.xml +++ b/build.xml @@ -208,6 +208,11 @@ + + + + + @@ -704,6 +709,11 @@ + + + + + @@ -1140,6 +1150,8 @@ + + diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index a42e6a9b80..b0384dc1e4 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1933,6 +1933,22 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> + + + + parser.xsl.rulesFile + parse-xsl-rules.xml + + Rule file for plugin parse-xsl: it may contain multiple + rules. Every rule assigns a XSL transformer to all documents + matched by a given URL pattern. Transformers are specified + in separate XML files referenced from the rule file. + A transformer can specify multiple index fields to be filled + by XSL statements from the DOM tree of the parsed document. 
+ + + + diff --git a/conf/parse-xsl-rules.xml.template b/conf/parse-xsl-rules.xml.template new file mode 100644 index 0000000000..490896febe --- /dev/null +++ b/conf/parse-xsl-rules.xml.template @@ -0,0 +1,18 @@ + + + + + + + + + diff --git a/conf/parse-xsl-transform.xsl.template b/conf/parse-xsl-transform.xsl.template new file mode 100644 index 0000000000..0f70131e77 --- /dev/null +++ b/conf/parse-xsl-transform.xsl.template @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + diff --git a/default.properties b/default.properties index bb987d9666..cb48974585 100644 --- a/default.properties +++ b/default.properties @@ -213,4 +213,5 @@ plugins.misc=\ org.creativecommons.nutch*:\ org.apache.nutch.microformats.reltag*:\ org.apache.nutch.any23* - + org.apache.nutch.parse.xsl* + diff --git a/src/plugin/build.xml b/src/plugin/build.xml index d8826e88d9..4a303a96f3 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -66,6 +66,7 @@ + @@ -134,6 +135,7 @@ + @@ -210,6 +212,7 @@ + diff --git a/src/plugin/parse-xsl/build.xml b/src/plugin/parse-xsl/build.xml new file mode 100644 index 0000000000..8ab8e2b061 --- /dev/null +++ b/src/plugin/parse-xsl/build.xml @@ -0,0 +1,65 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/parse-xsl/conf/documents.xsd b/src/plugin/parse-xsl/conf/documents.xsd new file mode 100644 index 0000000000..601672f3ec --- /dev/null +++ b/src/plugin/parse-xsl/conf/documents.xsd @@ -0,0 +1,27 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/plugin/parse-xsl/conf/rules.xsd b/src/plugin/parse-xsl/conf/rules.xsd new file mode 100644 index 0000000000..e0a1c5e5a5 --- /dev/null +++ b/src/plugin/parse-xsl/conf/rules.xsd @@ -0,0 +1,29 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/plugin/parse-xsl/ivy.xml b/src/plugin/parse-xsl/ivy.xml new file mode 100644 index 
0000000000..e85e8bcf0c --- /dev/null +++ b/src/plugin/parse-xsl/ivy.xml @@ -0,0 +1,46 @@ + + + + + + + + + + Apache Nutch + + + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/parse-xsl/plugin.xml b/src/plugin/parse-xsl/plugin.xml new file mode 100644 index 0000000000..ff14c0a5f0 --- /dev/null +++ b/src/plugin/parse-xsl/plugin.xml @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/parse-xsl/sample/sample1/book1.html b/src/plugin/parse-xsl/sample/sample1/book1.html new file mode 100644 index 0000000000..fb8a491d9a --- /dev/null +++ b/src/plugin/parse-xsl/sample/sample1/book1.html @@ -0,0 +1,38 @@ + + + +Buy Nutch for dummies! + + + + +

Nutch for dummies

+ + +
The ultimate book to master all nutch powerful mechanisms !
+ + +
Isbn: 123654987789
+ + +
    Authors +
  • Mr Allan A. +
  • Mrs Mulan B. +
+Price: free + +
Collection from nowhere
+ + +
    Other related books +
  • Lucene explained to your grandmother +
  • How I met Solr? +
  • Feels better with Elastic Search +
+ + + + + \ No newline at end of file diff --git a/src/plugin/parse-xsl/sample/sample1/rules.xml b/src/plugin/parse-xsl/sample/sample1/rules.xml new file mode 100644 index 0000000000..4142533eb5 --- /dev/null +++ b/src/plugin/parse-xsl/sample/sample1/rules.xml @@ -0,0 +1,26 @@ + + + + + + + + + + diff --git a/src/plugin/parse-xsl/sample/sample1/transformer_book.xsl b/src/plugin/parse-xsl/sample/sample1/transformer_book.xsl new file mode 100644 index 0000000000..b13b1b64d2 --- /dev/null +++ b/src/plugin/parse-xsl/sample/sample1/transformer_book.xsl @@ -0,0 +1,66 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/RulesManager.java b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/RulesManager.java new file mode 100644 index 0000000000..63a68db42f --- /dev/null +++ b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/RulesManager.java @@ -0,0 +1,235 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.parse.xsl; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.util.HashMap; +import java.util.Map; + +import javax.xml.bind.JAXB; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.stream.StreamSource; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.StringUtils; +import org.apache.nutch.parse.xsl.xml.rule.Rules; +import org.apache.nutch.parse.xsl.xml.rule.TRule; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Manage a set of Transformers. It allows to avoid having several instances of + * Transformers with XSL to load each time for performance matter. The decision + * to use a given Transformer is determined by a set DO NOT make this class a + * singleton otherwise it will produce thread safety problems related to Xsl + * transformers not thread safe. + * + * @see Transformer + * + */ +public class RulesManager { + + /** All the rules used to determine which xsl parser to use */ + protected Rules rules = null; + + /** + * Transformer factory. Thread-local because {@link TransformerFactory} + * "is NOT guaranteed to be thread safe". + */ + protected ThreadLocal factory = new ThreadLocal() { + @Override + protected TransformerFactory initialValue() { + return TransformerFactory.newInstance(); + } + }; + + /** A RuleTransformer holds transformations defined in one XSLT file. 
*/ + protected class RuleTransformer { + + String xslFile; + byte[] xslSource; + ThreadLocal transformer = new ThreadLocal() { + @Override + protected Transformer initialValue() { + ByteArrayInputStream input = new ByteArrayInputStream(xslSource); + StreamSource streamSource = new StreamSource(input); + Transformer t = null; + try { + t = factory.get().newTransformer(streamSource); + } catch (TransformerConfigurationException e) { + LOG.warn("Failed to create transformer for xsl file {}: {}", xslFile, + StringUtils.stringifyException(e)); + } + return t; + } + }; + + public RuleTransformer(Configuration conf, String xslFile) + throws IOException { + this.xslFile = xslFile; + InputStream stream = conf.getConfResourceAsInputStream(xslFile); + xslSource = IOUtils.toByteArray(stream); + } + + public Transformer getTransformer() { + return transformer.get(); + } + + } + + /** A map containing all transformers given their file name as key */ + protected Map transformers = new HashMap(); + + /** The XSLT file to use for transformation */ + public static final String CONF_XML_RULES = "parser.xsl.rulesFile"; + + private static final Logger LOG = LoggerFactory.getLogger(RulesManager.class); + + /** + * Default constructor forbidden. + */ + @SuppressWarnings("unused") + private RulesManager() { + } + + /** + * Instantiates an object using the Nutch/Hadoop {@link Configuration} + * containing the property defining the rules. All rules and transformation + * files are load from the class path. 
+ * + * @param conf + * configuration + */ + public RulesManager(Configuration conf) { + + String rulesFile = conf.get(RulesManager.CONF_XML_RULES); + if (rulesFile != null) { + Reader rulesXmlReader = conf.getConfResourceAsReader(rulesFile); + + if (rulesXmlReader != null) { + LOG.debug("Reading parse-xsl rules file `{}'", rulesFile); + rules = JAXB.unmarshal(rulesXmlReader, Rules.class); + + // load transformation files + for (TRule rule : rules.getRule()) { + final String xslFile = rule.getTransformer().getFile(); + + if (xslFile != null) { + LOG.debug("Reading parse-xsl transformation file `{}'", xslFile); + try { + RuleTransformer rt = new RuleTransformer(conf, xslFile); + transformers.put(xslFile, rt); + } catch (IOException e) { + LOG.error("Failed to read parse-xsl transformation file {}: {}", + xslFile, StringUtils.stringifyException(e)); + } + } + } + + } else { + LOG.error( + "Failed to open parse-xsl rules file `{}' defined by property {}", + rulesFile, RulesManager.CONF_XML_RULES); + LOG.error(System.getProperty("java.class.path")); + } + + } else { + LOG.warn("Plugin parse-xsl active but no rules file defined!"); + } + } + + /** + * Match URL against regular expressions to assign it to a transformer file. 
+ * + * @param url + * the URL to filter + * @return the transformer file path that matches the rules or null if no rule + * does match + */ + public String getTransformerFilePath(String url) { + + String xslFile = null; + + if (rules == null) { + // no rules defined + return xslFile; + } + + // Search for a matching rule by applying defined regex + // The first matching rule will be applied + for (TRule rule : rules.getRule()) { + if (url.matches(rule.getMatches())) { + if (LOG.isDebugEnabled()) { + LOG.debug(String.format("Url %s is matching regex rule %s", url, + rule.getMatches())); + } + xslFile = rule.getTransformer().getFile(); + + break; + } + } + if (xslFile == null) { + LOG.debug("No filter found for url: {}", url); + } + + return xslFile; + } + + /** + * Get the first transformer matching a URL. + * + * @param url + * the url to filter + * @return the transformer that suits the rules + * @throws Exception + */ + public Transformer getTransformer(String url) { + Transformer transformer = null; + String xslFile = getTransformerFilePath(url); + if (xslFile != null) { + return transformers.get(xslFile).getTransformer(); + } + return transformer; + } + + /** + * Check whether a URL matches any rule. + * + * @param url + * the URL to test match in rules file + * @return true if the URL is matching any rule. 
+ * @throws Exception + */ + public boolean matches(String url) throws Exception { + return this.getTransformerFilePath(url) != null; + } + + /** + * @return the current set of rules defined in the xml file + */ + public Rules getRules() { + return rules; + } + +} diff --git a/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslIndexFilter.java b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslIndexFilter.java new file mode 100644 index 0000000000..f1cc663b2e --- /dev/null +++ b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslIndexFilter.java @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.parse.xsl; + +import java.io.File; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; + +import javax.xml.parsers.DocumentBuilderFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.parse.Parse; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.NodeList; + +import com.sun.org.apache.xpath.internal.XPathAPI; + +/** + * This class allows to: + *
    + *
  • automatically index fields defined in the rules file. + 
  • exclude urls that are not declared in the rules file. + */ +public class XslIndexFilter implements IndexingFilter { + + private static final String NAME_ATTRIBUTE = "name"; + + private static final String FIELD_TAG = "//field"; + + private Configuration conf; + + private static final Logger LOG = LoggerFactory + .getLogger(XslParseFilter.class); + + private static HashMap> transformers = new HashMap>(); + + // Rules file to use + private String rulesFile; + + // The XXX + private RulesManager manager; + + /** + * @return the current configuration. + */ + @Override + public Configuration getConf() { + return this.conf; + } + + /** + * Sets the current configuration. + */ + @Override + public void setConf(Configuration conf) { + this.conf = conf; + + // Getting rules file + this.rulesFile = this.conf.get(RulesManager.CONF_XML_RULES); + + // create rules manager and load all configuration files + manager = new RulesManager(conf); + } + + @Override + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + NutchDocument result = null; + if (doc == null) + return result; + + try { + + // Getting transformer file path associated to rule if exists + String xsltFilePath = null; + try { + xsltFilePath = manager.getTransformerFilePath(url.toString()); + } catch (Exception e) { + LOG.info("Xslt not found"); + } + + // The url matches a rule, we keep it + if (xsltFilePath != null) { + // We keep the document + result = doc; + List fields = XslIndexFilter.transformers.get(xsltFilePath); + // List was never loaded + if (fields == null) { + fields = this.extractFields(xsltFilePath); + } + + // All the fields defined in the xsl file will be put directly + // into the Nutch document + // Fields defined by the xsl plugin are only stored in parse + // meta. 
+ if (parse != null && parse.getData() != null + && parse.getData().getParseMeta() != null) { + for (String field : fields) { + for (String value : parse.getData().getParseMeta().getValues(field)) { + doc.add(field, value); + } + } + } + + } + // The document is indexed anyway because explicitly decided + else if (!manager.getRules().isFilterUrlsWithNoRule()) { + result = doc; + LOG.info("The url " + + url.toString() + + " has been kept because it has been explicitly specified in the rules"); + } + // The document is not indexed + else { + LOG.info("The url " + url.toString() + + " has been filtered because no xsl file fits the defined rules"); + } + + } catch (Exception e) { + String message = "Cannot index data"; + if (url != null && url.toString() != null) { + message += " from " + url.toString(); + } + LOG.error(message, e); + } + + return result; + } + + /** + * + * @param xsltFilePath + * the path of the xsl file + * @return the list of fields defined in xsl file + * @throws Exception + */ + protected List extractFields(String xsltFilePath) throws Exception { + List fields = new ArrayList(); + // Creating xsl DOM document + Document document = DocumentBuilderFactory.newInstance() + .newDocumentBuilder().parse(new File(xsltFilePath)); + NodeList list = XPathAPI.selectNodeList(document, FIELD_TAG); + HashSet hashedFields = new HashSet(); + // Populating list + for (int i = 0; i < list.getLength(); i++) { + NamedNodeMap attributes = list.item(i).getAttributes(); + if (attributes != null && attributes.getNamedItem(NAME_ATTRIBUTE) != null) { + hashedFields + .add(attributes.getNamedItem(NAME_ATTRIBUTE).getNodeValue()); + } + } + // Keeps list + fields.addAll(hashedFields); + XslIndexFilter.transformers.put(xsltFilePath, fields); + + return fields; + } + +} diff --git a/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslParseFilter.java b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslParseFilter.java new file mode 100644 index 
0000000000..0cb252f599 --- /dev/null +++ b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/XslParseFilter.java @@ -0,0 +1,272 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.xsl; + +import java.io.File; +import java.io.FileOutputStream; + +import javax.xml.bind.JAXB; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMResult; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.parse.HTMLMetaTags; +import org.apache.nutch.parse.HtmlParseFilter; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.xsl.xml.document.Documents; +import org.apache.nutch.parse.xsl.xml.document.TDocument; +import org.apache.nutch.parse.xsl.xml.document.TField; +import org.apache.nutch.protocol.Content; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.DocumentFragment; +import org.w3c.dom.Node; + +import com.sun.org.apache.xpath.internal.XPathAPI; + +/** + * This is a parse filter plugin (@see HtmlParseFilter) A class to 
apply an xsl + * transformation on an html page. Instead of coding java, a simple xpath can be + * used. + * + */ +public class XslParseFilter implements HtmlParseFilter { + + /** Specifies whether to use html parse TagSoup or NekoHtml */ + public enum PARSER { + /** TagSoup parser */ + TAGSOUP { + @Override + public String toString() { + return "tagsoup"; + } + }, + /** Neko parser */ + NEKO { + @Override + public String toString() { + return "neko"; + } + } + } + + /** + * The output of the transformation for debug purpose (log level "DEBUG" shall + * be activated) + */ + public static final String CONF_XSLT_OUTPUT_DEBUG_FILE = "parser.xsl.output.debug.file"; + + /** Whether to use Saxon or Standard JVM XSLT parser */ + public static final String CONF_XSLT_USE_SAXON = "parser.xsl.useSaxon"; + + /** + * Whether to use Neko or Tagsoup. + * + * @Warning this configuration property is set by Nutch and not by the current + * plugin. see HtmlParser + */ + public static final String CONF_HTML_PARSER = "parser.html.impl"; + + private static final Logger LOG = LoggerFactory + .getLogger(XslParseFilter.class); + + private Configuration conf; + + // The html parser to use (default is neko. Otherwise Tag Soup) + private String parser; + // The xsl parser to use (default from jvm or Saxon) + private boolean ifSaxonParser; + // Debug file to use + private String debugFile; + + // The XXX + private RulesManager manager; + + /** + * Default constructor forbidden. 
+ */ + public XslParseFilter() { + super(); + } + + /** + * @param content + * full content to parse + * @param parseResult + * result of the parse process + * @param metaTags + * metatags set in the document + * @param document + * the DOM document to parse + * @return the resulting {@link ParseResult} + */ + @Override + public ParseResult filter(Content content, ParseResult parseResult, + HTMLMetaTags metaTags, DocumentFragment document) { + + if (manager == null) { + // no RulesManager, nothing to do + return parseResult; + } + + Transformer transformer = manager.getTransformer(content.getUrl()); + if (transformer == null) { + return parseResult; + } + + try { + // We are selecting the HTML tag with a XPath to convert the + // DocumentFragment to a more natural + // HTML document that can be further processed with XSL. + // TODO applying an "html" xpath is a dirty trick to change. + String xpath = "html"; + + // For neko, all tags are UPPER CASE. + // For tagsoup, it is in lower case. + // This is decided by the html parser plugin + if (this.parser.equals(PARSER.NEKO.toString())) { + xpath = xpath.toUpperCase(); + } else { + // TODO Tag soup is not working. To be investigated. + throw new Exception("tag soup parser not implemented."); + } + + Node doc = XPathAPI.selectSingleNode(document, xpath); + + Parse parse = parseResult.get(content.getUrl()); + + DOMResult result = new DOMResult(); + // At this state, thanks to the HtmlParser that is using + // HtmlParseFilter interface, we got + // a DOM object properly built (with Neko or TagSoup). + transformer.transform(new DOMSource(doc), result); + + // Storing the xml output for debug purpose + if (LOG.isDebugEnabled() && this.debugFile != null) { + XslParseFilter.saveDOMOutput(doc, new File(debugFile)); + // XslParseFilter.saveDOMOutput(result.getNode(), new File(debugFile)); + } + + XslParseFilter.updateMetadata(result.getNode(), parse); + + } catch (Exception e) { + LOG.warn("Cannot extract HTML tags. 
The XSL processing will not be run.", + e); + } + + return parseResult; + } + + /** + * @param node + * the node that is used to provide metadata information. + * @param data + * the data to update This is a simple format like the following: + * Check the documents.xsd to figure out the structure. + */ + protected static void updateMetadata(Node node, Parse data) { + + Documents documents = JAXB.unmarshal(new DOMSource(node), Documents.class); + + // No document unmarshalled + if (documents == null) { + LOG.debug("No metadata to update"); + return; + } + + // Browsing documents + for (TDocument document : documents.getDocument()) { + + // There are metadata to process + for (TField field : document.getField()) { + String value = field.getValue(); + // Trim values by default + if (value != null) { + value = value.trim(); + // Do not keep string with 0 size + if (value.length() != 0) { + // Adds the meta to the parse meta list + data.getData().getParseMeta().add(field.getName(), value); + } + if (LOG.isDebugEnabled()) + LOG.debug("Content " + field.getName() + " has value: '" + value + + "'"); + } + } + } + + } + + /** + * + * @param node + * the DOM node to save. + * @param file + * the file where to write the DOM. 
+ */ + private static void saveDOMOutput(Node node, File file) { + FileOutputStream fos = null; + + try { + fos = new FileOutputStream(file); + + TransformerFactory.newInstance().newTransformer() + .transform(new DOMSource(node), new StreamResult(fos)); + } catch (Exception e) { + LOG.warn("Cannot store DOM node to file: " + file.getAbsolutePath(), e); + } finally { + if (fos != null) + try { + fos.close(); + } catch (Exception e) { + LOG.warn("Cannot close xml file stream.", e); + } + } + } + + @Override + public Configuration getConf() { + return conf; + } + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + + // Setting the parser from conf + parser = this.conf.get(CONF_HTML_PARSER, PARSER.NEKO.toString()); + // Setting the parser to use from conf + ifSaxonParser = this.conf.getBoolean(CONF_XSLT_USE_SAXON, false); + // Debug file to use + debugFile = this.conf.get(CONF_XSLT_OUTPUT_DEBUG_FILE); + + // TODO: use saxon for xslt 2.0 compliancy + if (this.ifSaxonParser) { + System.setProperty("javax.xml.transform.TransformerFactory", + "net.sf.saxon.TransformerFactoryImpl"); + } + + // create rules manager and load all configuration files + manager = new RulesManager(conf); + } + +} diff --git a/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/package-info.java b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/package-info.java new file mode 100644 index 0000000000..08d2442aca --- /dev/null +++ b/src/plugin/parse-xsl/src/java/org/apache/nutch/parse/xsl/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Parse and index filter to extract field content via XSL statements. + */ +package org.apache.nutch.parse.xsl; + diff --git a/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/AbstractCrawlTest.java b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/AbstractCrawlTest.java new file mode 100644 index 0000000000..4886c32cdd --- /dev/null +++ b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/AbstractCrawlTest.java @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.parse.xsl; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.text.NumberFormat; +import java.util.Date; + +import org.junit.Test; + +import org.apache.hadoop.conf.Configuration; +import org.apache.html.dom.HTMLDocumentImpl; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.HtmlParseFilter; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.html.DOMBuilder; +import org.apache.nutch.parse.xsl.XslParseFilter.PARSER; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NutchConfiguration; +import org.cyberneko.html.parsers.DOMFragmentParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.DocumentFragment; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +/** + * A class to group all classic methods to simulate a crawl without running + * Nutch like setting a configuration, providing a DocumentFragment, etc... All + * your tests related to parse-xsl shall extend this test. + * + * + */ +public abstract class AbstractCrawlTest { + + /** The logger used for current and derived classes */ + protected static final Logger LOG = LoggerFactory + .getLogger(AbstractCrawlTest.class); + + /** + * the configuration to use with current crawler Never access this property. 
@see + * AbstractCrawlTest#getConfiguration() + */ + private Configuration configuration = null; + + protected String sampleDir = System.getProperty("test.data", "."); + + private long startDate; + + /** + * @param parseFilter + * the filter to use + * @param filePath + * the file to crawl + * @param url + * the url that identifies the file to crawl (only used to set the + * unique key) + * @return the resulting content after the crawl + * @throws Exception + */ + protected ParseResult simulateCrawl(PARSER parseFilter, String filePath, + String url) throws Exception { + ParseResult result = null; + FileInputStream is = null; + try { + // Opening test file + File file = new File(filePath); + is = new FileInputStream(file); + byte[] bytes = new byte[0]; + + // Setting the void content + Content content = new Content(url, "", bytes, "text/html", + new Metadata(), this.getConfiguration()); + + // Parse document with related parser + DocumentFragment document = null; + if (parseFilter == PARSER.NEKO) { + document = parseNeko(new InputSource(is)); + + } else { + document = parseTagSoup(new InputSource(is)); + } + + // Creates a parser with dedicated method + HtmlParseFilter filter = new XslParseFilter(); + // Setting configuration + filter.setConf(this.getConfiguration()); + + ParseData data = new ParseData(); + + // Initializing the parse result + ParseResult parseResult = ParseResult.createParseResult(url, + new ParseImpl("no text", data)); + + // Extracting metadata + result = filter.filter(content, parseResult, null, document); + } catch (Exception e) { + throw new Exception("Cannot simulate crawl", e); + } finally { + if (is != null) { + try { + is.close(); + } catch (IOException e) { + LOG.error("Cannot close input stream", e); + } + } + } + return result; + } + + /** + * Constructs a an html DOM structure. + * + * @param input + * the html/xml input stream + * @return DocumentFragment the document that has been created. 
+ * @throws Exception + */ + protected static DocumentFragment parseTagSoup(InputSource input) + throws Exception { + HTMLDocumentImpl doc = new HTMLDocumentImpl(); + DocumentFragment frag = doc.createDocumentFragment(); + DOMBuilder builder = new DOMBuilder(doc, frag); + org.ccil.cowan.tagsoup.Parser reader = new org.ccil.cowan.tagsoup.Parser(); + reader.setContentHandler(builder); + reader.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true); + reader.setFeature(org.ccil.cowan.tagsoup.Parser.bogonsEmptyFeature, false); + reader + .setProperty("http://xml.org/sax/properties/lexical-handler", builder); + reader.parse(input); + return frag; + } + + /** + * Constructs a an html DOM structure. + * + * @param input + * the html/xml input stream + * @return DocumentFragment the document that has been created. + * @throws Exception + */ + protected static DocumentFragment parseNeko(InputSource input) + throws Exception { + DOMFragmentParser parser = new DOMFragmentParser(); + try { + parser + .setFeature( + "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", + true); + parser.setFeature("http://cyberneko.org/html/features/augmentations", + true); + parser.setProperty( + "http://cyberneko.org/html/properties/default-encoding", "UTF-8"); + parser + .setFeature( + "http://cyberneko.org/html/features/scanner/ignore-specified-charset", + true); + parser + .setFeature( + "http://cyberneko.org/html/features/balance-tags/ignore-outside-content", + false); + parser.setFeature( + "http://cyberneko.org/html/features/balance-tags/document-fragment", + true); + parser + .setFeature("http://cyberneko.org/html/features/balance-tags", true); + parser.setFeature("http://cyberneko.org/html/features/report-errors", + true); + parser.setProperty("http://cyberneko.org/html/properties/names/elems", + "lower"); + + System.out.println(LOG.isTraceEnabled()); + + } catch (SAXException e) { + LOG.error("Cannot set parser features", e); + } + // convert Document to 
DocumentFragment + HTMLDocumentImpl doc = new HTMLDocumentImpl(); + doc.setErrorChecking(false); + DocumentFragment res = doc.createDocumentFragment(); + DocumentFragment frag = doc.createDocumentFragment(); + parser.parse(input, frag); + res.appendChild(frag); + + try { + while (true) { + frag = doc.createDocumentFragment(); + parser.parse(input, frag); + if (!frag.hasChildNodes()) + break; + // if (LOG.isInfoEnabled()) { + LOG.info(" - new frag, " + frag.getChildNodes().getLength() + " nodes."); + System.out.println(" - new frag, " + frag.getChildNodes().getLength() + + " nodes."); + // } + res.appendChild(frag); + } + } catch (Exception e) { + LOG.error("Error: ", e); + System.out.println(e); + } + + return res; + } + + /** + * + * @return the current configuration. + */ + public Configuration getConfiguration() { + if (this.configuration == null) { + this.configuration = NutchConfiguration.create(); + } + return this.configuration; + } + + /** + * To display some memory related information. Can be used for benchmark test + */ + private void displayMemoryUsage() { + Runtime runtime = Runtime.getRuntime(); + + NumberFormat format = NumberFormat.getInstance(); + + long maxMemory = runtime.maxMemory(); + long allocatedMemory = runtime.totalMemory(); + long freeMemory = runtime.freeMemory(); + + System.out.println("free memory: " + format.format(freeMemory / 1024)); + System.out.println("allocated memory: " + + format.format(allocatedMemory / 1024)); + System.out.println("max memory: " + format.format(maxMemory / 1024)); + System.out.println("total free memory: " + + format.format((freeMemory + (maxMemory - allocatedMemory)) / 1024)); + } + + /** + * Can be called before each test to get the run test date. + */ + protected void startTest() { + System.out.println("Starting test..."); + this.displayMemoryUsage(); + this.startDate = new Date().getTime(); + } + + /** + * Can be called at the end of a test to evaluate the elapsed time. 
+ */ + private void endTest() { + this.displayMemoryUsage(); + System.out.println("Test took " + (new Date().getTime() - this.startDate) + + " ms"); + System.out.println("Test ended."); + } + +} diff --git a/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestParseTechnical.java b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestParseTechnical.java new file mode 100644 index 0000000000..3e76c01c3d --- /dev/null +++ b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestParseTechnical.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.parse.xsl; + +import static org.junit.Assert.*; + +import java.io.File; +import java.io.FileReader; + +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathFactory; + +import org.junit.Test; +import org.w3c.dom.DocumentFragment; +import org.w3c.dom.NodeList; +import org.xml.sax.InputSource; + +/** + * + */ +public class TestParseTechnical extends AbstractCrawlTest { + + /** + * Executes some xpath on neko parsed document + */ + @Test + public void testXpathNeko() { + try { + DocumentFragment doc = parseNeko(new InputSource( + new FileReader(new File(sampleDir, "sample1/book1.html")))); + XPath xpath = XPathFactory.newInstance().newXPath(); + NodeList result = (NodeList) xpath.compile("//DIV").evaluate(doc, + XPathConstants.NODESET); + assertNotNull(result); + assertEquals(3, result.getLength()); + System.out.println(result.getLength()); + result = (NodeList) xpath.compile("//HTML").evaluate(doc, + XPathConstants.NODESET); + assertNotNull(result); + System.out.println(result.getLength()); + assertEquals(1, result.getLength()); + } catch (Exception e) { + e.printStackTrace(); + } + } + + /** + * Executes some xpath on TagSoup parsed document + * TODO not working with TagSoup. Investigate why. 
+ */ + @Test + public void testXpathTagSoup() { + try { + DocumentFragment doc = parseTagSoup(new InputSource( + new FileReader(new File(sampleDir, "sample1/book1.html")))); + XPath xpath = XPathFactory.newInstance().newXPath(); + NodeList result = (NodeList) xpath.compile("//div").evaluate(doc, + XPathConstants.NODESET); + assertNotNull(result); + assertEquals(3, result.getLength()); + System.out.println(result.getLength()); + result = (NodeList) xpath.compile("//html").evaluate(doc, + XPathConstants.NODESET); + assertNotNull(result); + System.out.println(result.getLength()); + assertEquals(1, result.getLength()); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} diff --git a/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestSample1.java b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestSample1.java new file mode 100644 index 0000000000..b3a9396f03 --- /dev/null +++ b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestSample1.java @@ -0,0 +1,78 @@ +package org.apache.nutch.parse.xsl; + +import static org.junit.Assert.*; + +import java.io.File; + +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.ParseResult; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +import org.apache.nutch.parse.xsl.XslParseFilter.PARSER; +import org.junit.Test; + +/** + * + * This sample test will show you how to test the crawling of a page by + * simulating a crawl. All the thing that you have to do is to inherit from + * AbstractCrawlTest. + * + */ +public class TestSample1 extends AbstractCrawlTest { + + /** + * Loads the rules xml file that will route your transformers from urls. + */ + public TestSample1() { + this.getConfiguration().set(RulesManager.CONF_XML_RULES, "sample1/rules.xml"); + } + + @Test + public void testBook1() { + String url = "http://www.sample1.com/book?1245"; + + try { + ParseResult parseResult = simulateCrawl(PARSER.NEKO, + new File(sampleDir, "sample1/book1.html").toString(), url); + assertNotNull(parseResult); + + Metadata parsedMetadata = parseResult.get(url).getData().getParseMeta(); + // Asserts we have metadata + assertNotNull(parsedMetadata); + // Title check + assertEquals("Nutch for dummies", parsedMetadata.get("title")); + // Description check + assertEquals( + "The ultimate book to master all nutch powerful mechanisms !", + parsedMetadata.get("description")); + // Isbn check + assertEquals("123654987789", parsedMetadata.get("isbn")); + // Authors check + assertEquals("Mr Allan A.", parsedMetadata.getValues("author")[0]); + assertEquals("Mrs Mulan B.", parsedMetadata.getValues("author")[1]); + // Price check + assertEquals("free", parsedMetadata.get("price")); + // Collection check + assertEquals("Collection from nowhere", parsedMetadata.get("collection")); + + } catch (Exception e) { + e.printStackTrace(); + fail("testBook1 exception"); + } + } + +} diff --git a/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestXslIndexFilter.java b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestXslIndexFilter.java new file mode 100644 index 0000000000..3285770418 --- /dev/null +++ 
b/src/plugin/parse-xsl/src/test/org/apache/nutch/parse/xsl/TestXslIndexFilter.java
@@ -0,0 +1,50 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.parse.xsl;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;

import java.io.File;
import java.util.List;

import org.junit.Test;

/**
 * Testing the filter that will auto import fields defined in the xsl file.
 */
public class TestXslIndexFilter extends AbstractCrawlTest {

  /**
   * Tests the fields fetched from the sample xsl file.
   *
   * @throws Exception
   *           propagated so JUnit reports the cause — the original caught
   *           the exception and called a bare fail(), losing all detail
   */
  @Test
  public void testFields() throws Exception {
    XslIndexFilter filter = new XslIndexFilter();
    // Wildcard replaces the original raw List type without assuming the
    // element type declared by extractFields — TODO confirm against
    // XslIndexFilter and tighten if it is List<String>
    List<?> list = filter.extractFields(
        new File(sampleDir, "sample1/transformer_book.xsl").toString());
    assertNotNull(list);
    assertEquals(6, list.size());
  }
}