diff --git a/README.md b/README.md
index a579cd8..296ffd4 100644
--- a/README.md
+++ b/README.md
@@ -45,9 +45,12 @@ public class Example {
RakeParams params = new RakeParams(stopWords, stopPOS, minWordChar, shouldStem, phraseDelims);
// Create a RakeAlgorithm object
+ // You can use the RakeAlgorithm(RakeParams, POSTaggerME, SentenceDetectorME)
+ // constructor instead of the one shown below if you want to pass in
+ // pre-initialized opennlp models.
String POStaggerURL = "model-bin/en-pos-maxent.bin"; // The path to your POS tagging model
- String SentDetecURL = "model-bin/en-sent.bin"; // The path to your sentence detection model
- RakeAlgorithm rakeAlg = new RakeAlgorithm(params, POStaggerURL, SentDetecURL);
+ String SentDetectURL = "model-bin/en-sent.bin"; // The path to your sentence detection model
+ RakeAlgorithm rakeAlg = new RakeAlgorithm(params, POStaggerURL, SentDetectURL);
// Call the rake method
String txt = "dogs are great, don't you agree? I love dogs, especially big dogs";
diff --git a/pom.xml b/pom.xml
index ecc8ed0..b25dc19 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
jar
rapidrake
${project.groupId}:${project.artifactId}
- 0.1.4
+ 0.1.5
A fast version of the Rapid Automatic Keyword Extraction (RAKE) algorithm
https://github.com/crew102/rapidrake-java
diff --git a/src/main/java/io/github/crew102/rapidrake/RakeAlgorithm.java b/src/main/java/io/github/crew102/rapidrake/RakeAlgorithm.java
index 96c0c5a..444bdf0 100644
--- a/src/main/java/io/github/crew102/rapidrake/RakeAlgorithm.java
+++ b/src/main/java/io/github/crew102/rapidrake/RakeAlgorithm.java
@@ -64,7 +64,35 @@ public RakeAlgorithm(RakeParams rakeParams, InputStream taggerStream, InputStrea
this.tagger = new Tagger(taggerStream).getPosTagger();
this.sentDetector = new SentDetector(sentDectStream).getSentDetector();
}
-
+
+ /**
+ * Constructor. See below for example usage. Use this constructor if you want
+ * to avoid the overhead of creating the POS tagger and sentence detector
+ * models each time you create a new instance of RakeAlgorithm.
+ *
+ * <pre>
+ * String POStaggerURL = "model-bin/en-pos-maxent.bin";
+ * String SentDetectURL = "model-bin/en-sent.bin";
+ *
+ * // create the required model classes; you can cache these instances in a singleton
+ * POSTaggerME tagger = new Tagger(POStaggerURL).getPosTagger();
+ * SentenceDetectorME sentDetect = new SentDetector(SentDetectURL).getSentDetector();
+ *
+ * // now creating an instance of the RakeAlgorithm is fast
+ * RakeAlgorithm rakeAlg = new RakeAlgorithm(params, tagger, sentDetect);
+ * </pre>
+ *
+ * @param rakeParams the parameters RAKE will use
+ * @param posTaggerME An instance of opennlp.tools.postag.POSTaggerME.
+ * @param sentDetectorME An instance of opennlp.tools.sentdetect.SentenceDetectorME.
+ * @see RakeParams
+ */
+ public RakeAlgorithm(RakeParams rakeParams, POSTaggerME posTaggerME, SentenceDetectorME sentDetectorME) {
+ this.rakeParams = rakeParams;
+ this.tagger = posTaggerME;
+ this.sentDetector = sentDetectorME;
+ }
+
/**
* Run RAKE on a single string.
*
@@ -87,12 +115,23 @@ private String[] getTokens(String txtEl) {
ArrayList tokenList = new ArrayList();
Pattern anyWordChar = Pattern.compile("[a-z]");
- String[] sents = sentDetector.sentDetect(txtPadded);
+ String[] sents;
+ // Make sure that use of the sentence detector is thread safe
+ synchronized(sentDetector){
+ sents = sentDetector.sentDetect(txtPadded);
+ }
+
+ WhitespaceTokenizer wsTokenizer = WhitespaceTokenizer.INSTANCE;
for (String sentence : sents) {
- String[] tokenArray = WhitespaceTokenizer.INSTANCE.tokenize(sentence);
- String[] tags = tagger.tag(tokenArray);
+ String[] tokenArray = wsTokenizer.tokenize(sentence);
+
+ String[] tags;
+ // Make sure that the tagger is thread safe
+ synchronized(tagger){
+ tags = tagger.tag(tokenArray);
+ }
for (int i = 0; i < tokenArray.length; i++) {
diff --git a/src/test/java/io/github/crew102/rapidrake/TestRapidRake.java b/src/test/java/io/github/crew102/rapidrake/TestRapidRake.java
index 423f134..cfc5877 100644
--- a/src/test/java/io/github/crew102/rapidrake/TestRapidRake.java
+++ b/src/test/java/io/github/crew102/rapidrake/TestRapidRake.java
@@ -6,125 +6,159 @@
import org.junit.Test;
import opennlp.tools.stemmer.snowball.SnowballStemmer;
+import opennlp.tools.postag.POSTaggerME;
+import opennlp.tools.sentdetect.SentenceDetectorME;
import io.github.crew102.rapidrake.data.SmartWords;
import io.github.crew102.rapidrake.model.*;
+import io.github.crew102.rapidrake.opennlpUtils.Tagger;
+import io.github.crew102.rapidrake.opennlpUtils.SentDetector;
public class TestRapidRake {
-
+
private static String delims = "[-,.?():;\"!/]";
private static String posUrl = "model-bin/en-pos-maxent.bin";
private static String sentUrl = "model-bin/en-sent.bin";
-
- @Test
- public void testMinAlg() throws java.io.IOException {
+
+ @Test
+ public void testMinAlg() throws java.io.IOException {
+
String[] stopWords = {""};
String[] stopPOS = {""};
RakeParams params = new RakeParams(stopWords, stopPOS, 0, false, delims);
RakeAlgorithm minRakeAlg = new RakeAlgorithm(params, posUrl, sentUrl);
- Result aRes = minRakeAlg.rake("here is some text");
- String[] keys = aRes.getFullKeywords();
- float[] scores = aRes.getScores();
-
- assertEquals("here is some text", keys[0]);
- assertEquals("Incorrect scoring", 16, scores[0], 0);
-
- Result res2 = minRakeAlg.rake("also, here is some more. text.");
- String[] keys2 = res2.getFullKeywords();
- float[] scores2 = res2.getScores();
-
- assertEquals("also", keys2[0]);
- assertEquals("here is some more", keys2[1]);
- assertEquals("text", keys2[2]);
- assertEquals("Incorrect scoring", 1, scores2[0], 0);
- }
+ Result aRes = minRakeAlg.rake("here is some text");
+ String[] keys = aRes.getFullKeywords();
+ float[] scores = aRes.getScores();
+
+ assertEquals("here is some text", keys[0]);
+ assertEquals("Incorrect scoring", 16, scores[0], 0);
+
+ Result res2 = minRakeAlg.rake("also, here is some more. text.");
+ String[] keys2 = res2.getFullKeywords();
+ float[] scores2 = res2.getScores();
+
+ assertEquals("also", keys2[0]);
+ assertEquals("here is some more", keys2[1]);
+ assertEquals("text", keys2[2]);
+ assertEquals("Incorrect scoring", 1, scores2[0], 0);
+ }
- @Test
- public void testStopWordRemoval() throws java.io.IOException {
-
+ @Test
+ public void testStopWordRemoval() throws java.io.IOException {
+
String[] stopWords = {"text"};
String[] stopPOS = {""};
RakeParams params = new RakeParams(stopWords, stopPOS, 0, true, delims);
RakeAlgorithm rakeAlg = new RakeAlgorithm(params, posUrl, sentUrl);
- Result aRes = rakeAlg.rake("here is some. text");
- String[] keys = aRes.getFullKeywords();
-
- assertEquals("Stopword removal", 1, keys.length, 0);
- }
+ Result aRes = rakeAlg.rake("here is some. text");
+ String[] keys = aRes.getFullKeywords();
+
+ assertEquals("Stopword removal", 1, keys.length, 0);
+ }
- @Test
- public void testStopPOSRemoval() throws java.io.IOException {
+ @Test
+ public void testStopPOSRemoval() throws java.io.IOException {
+
String[] stopWords = {""};
String[] stopPOS = {"VBD"};
RakeParams params = new RakeParams(stopWords, stopPOS, 0, true, delims);
RakeAlgorithm rakeAlg = new RakeAlgorithm(params, posUrl, sentUrl);
- Result aRes = rakeAlg.rake("I ran to the store");
- String[] keys = aRes.getFullKeywords();
-
- assertEquals("StopPOS removal", 2, keys.length, 0);
- }
+ Result aRes = rakeAlg.rake("I ran to the store");
+ String[] keys = aRes.getFullKeywords();
+
+ assertEquals("StopPOS removal", 2, keys.length, 0);
+ }
- @Test
- public void testMinWordCharRemoval() throws java.io.IOException {
-
+ @Test
+ public void testMinWordCharRemoval() throws java.io.IOException {
+
String[] stopWords = {""};
String[] stopPOS = {""};
RakeParams params = new RakeParams(stopWords, stopPOS, 2, true, delims);
RakeAlgorithm rakeAlg = new RakeAlgorithm(params, posUrl, sentUrl);
- Result aRes = rakeAlg.rake("I ran to the store");
- String[] keys = aRes.getFullKeywords();
-
- assertEquals("ran to the store", keys[0]);
- }
+ Result aRes = rakeAlg.rake("I ran to the store");
+ String[] keys = aRes.getFullKeywords();
+
+ assertEquals("ran to the store", keys[0]);
+ }
- @Test
- public void testStemScoring() throws java.io.IOException {
-
+ @Test
+ public void testStemScoring() throws java.io.IOException {
+
String[] stopWords = {""};
String[] stopPOS = {""};
RakeParams params = new RakeParams(stopWords, stopPOS, 0, true, delims);
RakeAlgorithm rakeAlg = new RakeAlgorithm(params, posUrl, sentUrl);
- Result aRes = rakeAlg.rake("good dogs. good. dog");
- float[] scores = aRes.getScores();
-
+ Result aRes = rakeAlg.rake("good dogs. good. dog");
+ float[] scores = aRes.getScores();
+
RakeParams params2 = new RakeParams(stopWords, stopPOS, 0, false, delims);
RakeAlgorithm rakeAlg2 = new RakeAlgorithm(params2, posUrl, sentUrl);
- Result aRes2 = rakeAlg2.rake("good dog. good. dog");
- float[] scores2 = aRes2.getScores();
-
- assertArrayEquals(scores, scores2, 0);
- }
+ Result aRes2 = rakeAlg2.rake("good dog. good. dog");
+ float[] scores2 = aRes2.getScores();
+
+ assertArrayEquals(scores, scores2, 0);
+ }
- @Test
- public void testDelims() throws java.io.IOException {
-
+ @Test
+ public void testDelims() throws java.io.IOException {
+
String[] stopWords = {""};
String[] stopPOS = {""};
RakeParams params = new RakeParams(stopWords, stopPOS, 0, true, "[.]"); // no comma
RakeAlgorithm rakeAlg = new RakeAlgorithm(params, posUrl, sentUrl);
- Result aRes = rakeAlg.rake("good dogs, good. dog");
- String[] keys = aRes.getFullKeywords();
+ Result aRes = rakeAlg.rake("good dogs, good. dog");
+ String[] keys = aRes.getFullKeywords();
+
+ assertEquals(2, keys.length, 0);
+ }
- assertEquals(2, keys.length, 0);
- }
+ @Test
+ public void testStemmerLang() throws java.io.IOException {
+
+ String[] stopWords = new SmartWords().getSmartWords();
+ String[] stopPOS = {"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"};
+ String txtEl = "dependent dogs. dependable dogs";
+
+ RakeParams params = new RakeParams(stopWords, stopPOS, 0, true, delims);
+ RakeAlgorithm alg = new RakeAlgorithm(params, posUrl, sentUrl);
+ Result res = alg.rake(txtEl);
+
+ RakeParams frenchStem = new RakeParams(stopWords, stopPOS, 0, true, delims,
+ SnowballStemmer.ALGORITHM.FRENCH);
+ RakeAlgorithm frenchAlg = new RakeAlgorithm(frenchStem, posUrl, sentUrl);
+ Result frenchRes = frenchAlg.rake(txtEl);
+
+ assertFalse(Arrays.equals(res.getStemmedKeywords(), frenchRes.getStemmedKeywords()));
+ }
@Test
- public void testStemmerLang() throws java.io.IOException {
+ public void testSecondaryConstructor() throws java.io.IOException {
+
+ String[] stopWords = {""};
+ String[] stopPOS = {""};
+ RakeParams params = new RakeParams(stopWords, stopPOS, 0, false, delims);
+ // create the required model classes, you can cache these instances in a singleton
+ POSTaggerME tagger = new Tagger(posUrl).getPosTagger();
+ SentenceDetectorME sentDetect = new SentDetector(sentUrl).getSentDetector();
- String[] stopWords = new SmartWords().getSmartWords();
- String[] stopPOS = {"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"};
- String txtEl = "dependent dogs. dependable dogs";
+ RakeAlgorithm minRakeAlg = new RakeAlgorithm(params, tagger, sentDetect);
+ Result aRes = minRakeAlg.rake("here is some text");
+ String[] keys = aRes.getFullKeywords();
+ float[] scores = aRes.getScores();
- RakeParams params = new RakeParams(stopWords, stopPOS, 0, true, delims);
- RakeAlgorithm alg = new RakeAlgorithm(params, posUrl, sentUrl);
- Result res = alg.rake(txtEl);
+ assertEquals("here is some text", keys[0]);
+ assertEquals("Incorrect scoring", 16, scores[0], 0);
- RakeParams frenchStem = new RakeParams(stopWords, stopPOS, 0, true, delims,
- SnowballStemmer.ALGORITHM.FRENCH);
- RakeAlgorithm frenchAlg = new RakeAlgorithm(frenchStem, posUrl, sentUrl);
- Result frenchRes = frenchAlg.rake(txtEl);
+ Result res2 = minRakeAlg.rake("also, here is some more. text.");
+ String[] keys2 = res2.getFullKeywords();
+ float[] scores2 = res2.getScores();
- assertFalse(Arrays.equals(res.getStemmedKeywords(), frenchRes.getStemmedKeywords()));
+ assertEquals("also", keys2[0]);
+ assertEquals("here is some more", keys2[1]);
+ assertEquals("text", keys2[2]);
+ assertEquals("Incorrect scoring", 1, scores2[0], 0);
}
}