Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NUTCH-1870 XSL parse filter #439

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,11 @@
<packageset dir="${plugins.dir}/parse-metatags/src/java"/>
<packageset dir="${plugins.dir}/parse-swf/src/java"/>
<packageset dir="${plugins.dir}/parse-tika/src/java"/>
<packageset dir="${plugins.dir}/parse-xsl/src/java">
<!-- parse-xsl plugin: leave the JAXB-generated binding packages
     (xml/document and xml/rule) out of this package set -->
<exclude name="org/apache/nutch/parse/xsl/xml/document"/>
<exclude name="org/apache/nutch/parse/xsl/xml/rule"/>
</packageset>
<packageset dir="${plugins.dir}/parse-zip/src/java"/>
<packageset dir="${plugins.dir}/parsefilter-naivebayes/src/java"/>
<packageset dir="${plugins.dir}/parsefilter-regex/src/java"/>
Expand Down Expand Up @@ -704,6 +709,11 @@
<packageset dir="${plugins.dir}/parse-metatags/src/java"/>
<packageset dir="${plugins.dir}/parse-swf/src/java"/>
<packageset dir="${plugins.dir}/parse-tika/src/java"/>
<packageset dir="${plugins.dir}/parse-xsl/src/java">
<!-- parse-xsl plugin: leave the JAXB-generated binding packages
     (xml/document and xml/rule) out of this package set -->
<exclude name="org/apache/nutch/parse/xsl/xml/document"/>
<exclude name="org/apache/nutch/parse/xsl/xml/rule"/>
</packageset>
<packageset dir="${plugins.dir}/parse-zip/src/java"/>
<packageset dir="${plugins.dir}/parsefilter-naivebayes/src/java"/>
<packageset dir="${plugins.dir}/parsefilter-regex/src/java"/>
Expand Down Expand Up @@ -1140,6 +1150,8 @@
<source path="${plugins.dir}/parse-swf/src/test/" />
<source path="${plugins.dir}/parse-tika/src/java/" />
<source path="${plugins.dir}/parse-tika/src/test/" />
<source path="${plugins.dir}/parse-xsl/src/java/" />
<source path="${plugins.dir}/parse-xsl/src/test/" />
<source path="${plugins.dir}/parse-zip/src/java/" />
<source path="${plugins.dir}/parse-zip/src/test/" />
<source path="${plugins.dir}/parsefilter-naivebayes/src/java/" />
Expand Down
16 changes: 16 additions & 0 deletions conf/nutch-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1933,6 +1933,22 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
</description>
</property>


<!-- parse-xsl plugin properties -->
<property>
<name>parser.xsl.rulesFile</name>
<value>parse-xsl-rules.xml</value>
<description>
Rules file for the parse-xsl plugin. It may contain multiple
rules. Every rule assigns an XSL transformer to all documents
whose URL matches the rule's pattern. Transformers are specified
in separate XSL files referenced from the rules file.
A transformer can define multiple index fields, each filled
by XSL statements evaluated on the DOM tree of the parsed document.
</description>
</property>


<!-- Temporary Hadoop 0.17.x workaround. -->

<property>
Expand Down
18 changes: 18 additions & 0 deletions conf/parse-xsl-rules.xml.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- parse-xsl-rules.xml - Configuration file for plugin parse-xsl -->
<rules filterUrlsWithNoRule="false">

  <!--
    Each rule maps a URL pattern (attribute "matches") to a transformer.
    If a URL matches the pattern of a rule, the associated transformer
    is used to extract fields via XSL statements from the document
    behind this URL.

    Documents which are not matched by any of the rules are handled
    according to the attribute filterUrlsWithNoRule:
      - true:  they are skipped from indexing
      - false: they are kept, without any fields filled by XSL
               transformations
  -->
  <rule matches=".*">
    <transformer file="parse-xsl-transform.xsl" />
  </rule>

</rules>
26 changes: 26 additions & 0 deletions conf/parse-xsl-transform.xsl.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<?xml version="1.0" encoding="UTF-8"?>

<!--
  Sample transformer for the parse-xsl plugin: emits a "documents"
  result whose "field" elements are filled from the parsed page.

  NOTE(review): element names in the XPath expressions are written in
  upper case, presumably matching how the HTML parser builds the DOM;
  confirm against the parser in use.
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                version="1.0">

  <xsl:template match="/">
    <documents>
      <document>

        <!-- Text of the H1 heading, with whitespace normalized -->
        <field name="headline">
          <xsl:value-of select="normalize-space(/HTML/BODY/H1)" />
        </field>

        <!-- Text of the DIV whose id attribute is "description" -->
        <field name="description">
          <xsl:value-of select="//DIV[@id='description']" />
        </field>

      </document>
    </documents>
  </xsl:template>

</xsl:stylesheet>
3 changes: 2 additions & 1 deletion default.properties
Original file line number Diff line number Diff line change
Expand Up @@ -213,4 +213,5 @@ plugins.misc=\
org.creativecommons.nutch*:\
org.apache.nutch.microformats.reltag*:\
org.apache.nutch.any23*:\
org.apache.nutch.parse.xsl*

3 changes: 3 additions & 0 deletions src/plugin/build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
<ant dir="parse-metatags" target="deploy"/>
<ant dir="parse-swf" target="deploy"/>
<ant dir="parse-tika" target="deploy"/>
<ant dir="parse-xsl" target="deploy"/>
<ant dir="parse-zip" target="deploy"/>
<ant dir="parsefilter-naivebayes" target="deploy"/>
<ant dir="parsefilter-regex" target="deploy"/>
Expand Down Expand Up @@ -134,6 +135,7 @@
<ant dir="parse-metatags" target="test"/>
<ant dir="parse-swf" target="test"/>
<ant dir="parse-tika" target="test"/>
<ant dir="parse-xsl" target="test"/>
<ant dir="parse-zip" target="test"/>
<ant dir="parsefilter-regex" target="test"/>
<ant dir="protocol-file" target="test"/>
Expand Down Expand Up @@ -210,6 +212,7 @@
<ant dir="parse-metatags" target="clean"/>
<ant dir="parse-swf" target="clean"/>
<ant dir="parse-tika" target="clean"/>
<ant dir="parse-xsl" target="clean"/>
<ant dir="parse-zip" target="clean"/>
<ant dir="parsefilter-naivebayes" target="clean" />
<ant dir="parsefilter-regex" target="clean"/>
Expand Down
65 changes: 65 additions & 0 deletions src/plugin/parse-xsl/build.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--
  Build file of the parse-xsl plugin. Shared targets are imported from
  ../build-plugin.xml; this file adds the jars the plugin compiles
  against, the generation of the JAXB binding classes and the copying
  of the sample files used by the JUnit tests.
-->
<project name="parse-xsl" default="jar-core">

<import file="../build-plugin.xml"/>

<!-- Build compilation dependencies: the jars of lib-nekohtml and parse-html -->
<target name="deps-jar">
<ant target="jar" inheritall="false" dir="../lib-nekohtml"/>
<ant target="jar" inheritall="false" dir="../parse-html"/>
</target>

<!-- Add compilation dependencies to classpath -->
<path id="plugin.deps">
<fileset dir="${nutch.root}/build">
<include name="**/lib-nekohtml/*.jar" />
<include name="**/parse-html/*.jar" />
</fileset>
<!-- config files are loaded from test/data -->
<pathelement path="${build.test}/data" />
</path>

<!--
  Defines the xjc task and generates the JAXB binding classes from the
  two schemas in conf/ into src/java, one Java package per schema.
  NOTE(review): presumably build-plugin.xml calls init-plugin before
  compiling; confirm there.
-->
<target name="init-plugin" depends="deps-jar,resolve-default"
description="Plugin-specific initialization">

<taskdef name="xjc" classname="com.sun.tools.xjc.XJCTask"
description="Compile XML bindings (xjc)">
<classpath>
<path refid="classpath"/>
</classpath>
</taskdef>

<xjc schema="conf/documents.xsd" destdir="src/java"
package="org.apache.nutch.parse.xsl.xml.document"/>
<xjc schema="conf/rules.xsd" destdir="src/java"
package="org.apache.nutch.parse.xsl.xml.rule"/>

</target>

<!--
  For the JUnit tests: copy the sample HTML, XML and XSL files to the
  test data directory. These tasks are not inside a target, so Ant
  runs them whenever this build file is executed.
-->
<mkdir dir="${build.test}/data"/>
<copy todir="${build.test}/data">
<fileset dir="sample">
<include name="**/*.html"/>
<include name="**/*.xml"/>
<include name="**/*.xsl"/>
</fileset>
</copy>

</project>
27 changes: 27 additions & 0 deletions src/plugin/parse-xsl/conf/documents.xsd
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  XML schema of a "documents" result: a list of documents, each one
  holding a list of named text fields.
-->
<schema xmlns="http://www.w3.org/2001/XMLSchema"
        xmlns:documents="http://www.example.org/documents/"
        targetNamespace="http://www.example.org/documents/">

  <!-- Root element: any number of "document" children, possibly none -->
  <element name="documents">
    <complexType>
      <sequence minOccurs="0" maxOccurs="unbounded">
        <element name="document" type="documents:TDocument"/>
      </sequence>
    </complexType>
  </element>

  <!-- A document: any number of "field" children, possibly none -->
  <complexType name="TDocument">
    <sequence minOccurs="0" maxOccurs="unbounded">
      <element name="field" type="documents:TField"/>
    </sequence>
  </complexType>

  <!-- A field: string content plus an optional "name" attribute -->
  <complexType name="TField">
    <simpleContent>
      <extension base="string">
        <attribute name="name" type="string"/>
      </extension>
    </simpleContent>
  </complexType>
</schema>
29 changes: 29 additions & 0 deletions src/plugin/parse-xsl/conf/rules.xsd
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  XML schema of a rules file: a list of rules, each one naming the
  transformer file to use for whatever its "matches" attribute selects.
-->
<schema xmlns="http://www.w3.org/2001/XMLSchema"
        xmlns:rules="http://www.example.org/rules/"
        targetNamespace="http://www.example.org/rules/">

  <!--
    Root element: one or more "rule" children.
    The optional boolean attribute filterUrlsWithNoRule defaults to true.
  -->
  <element name="rules">
    <complexType>
      <sequence minOccurs="1" maxOccurs="unbounded">
        <element name="rule" type="rules:TRule"/>
      </sequence>
      <attribute name="filterUrlsWithNoRule"
                 type="boolean"
                 use="optional"
                 default="true"/>
    </complexType>
  </element>

  <!-- A rule: exactly one "transformer" child and a required "matches" attribute -->
  <complexType name="TRule">
    <sequence minOccurs="1" maxOccurs="1">
      <element name="transformer" type="rules:TTransformer"/>
    </sequence>
    <attribute name="matches" type="string" use="required"/>
  </complexType>

  <!-- A transformer: string content plus a required "file" attribute -->
  <complexType name="TTransformer">
    <simpleContent>
      <extension base="string">
        <attribute name="file" type="string" use="required"/>
      </extension>
    </simpleContent>
  </complexType>
</schema>
46 changes: 46 additions & 0 deletions src/plugin/parse-xsl/ivy.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
<?xml version="1.0" ?>

<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<!-- Ivy module descriptor of the parse-xsl plugin; the module name is taken from the Ant project name -->
<ivy-module version="1.0">
<info organisation="org.apache.nutch" module="${ant.project.name}">
<license name="Apache 2.0"/>
<ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
<description>
Apache Nutch
</description>
</info>

<!-- Configurations are not declared here: they are included from the top-level ivy directory -->
<configurations>
<include file="../../../ivy/ivy-configurations.xml"/>
</configurations>

<publications>
<!-- get the artifact from our module name -->
<artifact conf="master"/>
</publications>

<!--
  Third-party libraries of this plugin: TagSoup and four JAXB 2.2.7 artifacts.
  NOTE(review): jaxb-xjc presumably provides the com.sun.tools.xjc.XJCTask
  referenced by this plugin's build.xml; confirm.
-->
<dependencies>
<dependency org="org.ccil.cowan.tagsoup" name="tagsoup" rev="1.2.1"/>
<dependency org="com.sun.xml.bind" name="jaxb-xjc" rev="2.2.7"/>
<dependency org="com.sun.xml.bind" name="jaxb-impl" rev="2.2.7"/>
<dependency org="com.sun.xml.bind" name="jaxb-jxc" rev="2.2.7"/>
<dependency org="com.sun.xml.bind" name="jaxb-core" rev="2.2.7"/>
</dependencies>

</ivy-module>
49 changes: 49 additions & 0 deletions src/plugin/parse-xsl/plugin.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Descriptor of the parse-xsl plugin: one parse filter and one indexing filter -->
<plugin id="parse-xsl"
        name="XSL parser"
        version="1.0.0"
        provider-name="nutch.org">

  <!-- Plugin code: everything in parse-xsl.jar is exported -->
  <runtime>
    <library name="parse-xsl.jar">
      <export name="*"/>
    </library>
  </runtime>

  <!-- Plugins this one imports -->
  <requires>
    <import plugin="nutch-extensionpoints"/>
    <import plugin="parse-html"/>
  </requires>

  <!-- Registers XslParseFilter at the HtmlParseFilter extension point -->
  <extension id="org.apache.nutch.parse"
             name="Parse XSL Filter"
             point="org.apache.nutch.parse.HtmlParseFilter">
    <implementation id="XslParseFilter"
                    class="org.apache.nutch.parse.xsl.XslParseFilter"/>
  </extension>

  <!-- Registers XslIndexFilter at the IndexingFilter extension point -->
  <extension id="org.apache.nutch.indexer"
             name="Index XSL Filter"
             point="org.apache.nutch.indexer.IndexingFilter">
    <implementation id="XslIndexFilter"
                    class="org.apache.nutch.parse.xsl.XslIndexFilter"/>
  </extension>

</plugin>
38 changes: 38 additions & 0 deletions src/plugin/parse-xsl/sample/sample1/book1.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
<html>
<!-- This is a fake page built to show how to extract metadata with the parse-xsl plugin.
To see how the job is done, please refer to transformer_book.xsl.
-->

<title>Buy Nutch for dummies!</title>

<body>

<!-- Easy data to extract (full data located between tags) -->
<h1>Nutch for dummies</h1>

<!-- Easy data to extract given a unique attribute value -->
<div id="description">The ultimate book to master all nutch powerful mechanisms !</div>

<!-- Data to extract after a label string -->
<div>Isbn: 123654987789</div>

<!-- Etc. -->
<ul>Authors
<li>Mr Allan A.
<li>Mrs Mulan B.
</ul>
<span>Price: free</span>

<div class=".collection">Collection from nowhere</div>

<!-- A second list, so that the authors cannot be extracted by blindly selecting every list item ;) -->
<ul>Other related books
<li>Lucene explained to your grandmother
<li>How I met Solr?
<li>Feels better with Elastic Search
</ul>

</body>


</html>
Loading