Skip to content

Commit

Permalink
Fixed thread safety issues & allow cached OpenNLP models (#7)
Browse files Browse the repository at this point in the history
* Fixed thread safety issues & allow cached POSTaggerME/SentenceDetectorME instances

* Made getTokens() thread safe by synchronizing calls to POSTaggerME & SentenceDetectorME objects
* Added constructors so you can cache instances of POSTaggerME & SentenceDetectorME, which improves performance when creating many instances of the RakeAlgorithm class

* Fixed grammar issue in README

* Make minor refactorings to #7

* Add test covering constructor introduced in #7

* Merged in PR changes

---------

Co-authored-by: chris <[email protected]>
  • Loading branch information
dswitzer and crew102 authored Jun 10, 2024
1 parent 6bfd081 commit d3c787a
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 80 deletions.
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,12 @@ public class Example {
RakeParams params = new RakeParams(stopWords, stopPOS, minWordChar, shouldStem, phraseDelims);

// Create a RakeAlgorithm object
// You can use the RakeAlgorithm(RakeParams, POSTaggerME, SentenceDetectorME)
// constructor instead of the one shown below if you want to pass in
// pre-initialized opennlp models.
String POStaggerURL = "model-bin/en-pos-maxent.bin"; // The path to your POS tagging model
String SentDetecURL = "model-bin/en-sent.bin"; // The path to your sentence detection model
RakeAlgorithm rakeAlg = new RakeAlgorithm(params, POStaggerURL, SentDetecURL);
String SentDetectURL = "model-bin/en-sent.bin"; // The path to your sentence detection model
RakeAlgorithm rakeAlg = new RakeAlgorithm(params, POStaggerURL, SentDetectURL);

// Call the rake method
String txt = "dogs are great, don't you agree? I love dogs, especially big dogs";
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
<packaging>jar</packaging>
<artifactId>rapidrake</artifactId>
<name>${project.groupId}:${project.artifactId}</name>
<version>0.1.4</version>
<version>0.1.5</version>
<description>A fast version of the Rapid Automatic Keyword Extraction (RAKE) algorithm</description>
<url>https://github.com/crew102/rapidrake-java</url>

Expand Down
47 changes: 43 additions & 4 deletions src/main/java/io/github/crew102/rapidrake/RakeAlgorithm.java
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,35 @@ public RakeAlgorithm(RakeParams rakeParams, InputStream taggerStream, InputStrea
this.tagger = new Tagger(taggerStream).getPosTagger();
this.sentDetector = new SentDetector(sentDectStream).getSentDetector();
}


/**
* Constructor that accepts already-initialized OpenNLP objects. Use this
* constructor if you want to avoid the overhead of creating the POS tagger
* and sentence detector models each time you create a new instance of
* RakeAlgorithm.
*
* <p>The supplied objects are stored by reference; nothing is loaded or copied
* and no null check is performed here. Calls this class makes to them while
* tokenizing text are wrapped in {@code synchronized} blocks on the objects
* themselves. NOTE(review): code that uses the same instances outside of
* RakeAlgorithm presumably needs to synchronize on them as well - confirm
* against the OpenNLP documentation.
*
* <p>Example usage:
*
* <pre>
* String POStaggerURL = "model-bin/en-pos-maxent.bin";
* String SentDetectURL = "model-bin/en-sent.bin";
*
* // Create the required model objects once. These instances can be cached
* // (for example in a singleton) and reused.
* POSTaggerME tagger = new Tagger(POStaggerURL).getPosTagger();
* SentenceDetectorME sentDetect = new SentDetector(SentDetectURL).getSentDetector();
*
* // Creating an instance of RakeAlgorithm is now fast.
* RakeAlgorithm rakeAlg = new RakeAlgorithm(params, tagger, sentDetect);
* </pre>
*
* @param rakeParams the parameters RAKE will use
* @param posTaggerME the opennlp.tools.postag.POSTaggerME instance to use for POS tagging
* @param sentDetectorME the opennlp.tools.sentdetect.SentenceDetectorME instance to use for sentence detection
* @see RakeParams
*/
public RakeAlgorithm(RakeParams rakeParams, POSTaggerME posTaggerME, SentenceDetectorME sentDetectorME) {
this.rakeParams = rakeParams;
// Keep the caller's instances as-is so they can be reused across RakeAlgorithm objects.
this.tagger = posTaggerME;
this.sentDetector = sentDetectorME;
}

/**
* Run RAKE on a single string.
*
Expand All @@ -87,12 +115,23 @@ private String[] getTokens(String txtEl) {
ArrayList<String> tokenList = new ArrayList<String>();
Pattern anyWordChar = Pattern.compile("[a-z]");

String[] sents = sentDetector.sentDetect(txtPadded);
String[] sents;
// Synchronize on the sentence detector so a shared instance is never used by two threads at once
synchronized(sentDetector){
sents = sentDetector.sentDetect(txtPadded);
}

WhitespaceTokenizer wsTokenizer = WhitespaceTokenizer.INSTANCE;

for (String sentence : sents) {

String[] tokenArray = WhitespaceTokenizer.INSTANCE.tokenize(sentence);
String[] tags = tagger.tag(tokenArray);
String[] tokenArray = wsTokenizer.tokenize(sentence);

String[] tags;
// Synchronize on the tagger so a shared instance is never used by two threads at once
synchronized(tagger){
tags = tagger.tag(tokenArray);
}

for (int i = 0; i < tokenArray.length; i++) {

Expand Down
180 changes: 107 additions & 73 deletions src/test/java/io/github/crew102/rapidrake/TestRapidRake.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,125 +6,159 @@
import org.junit.Test;

import opennlp.tools.stemmer.snowball.SnowballStemmer;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetectorME;

import io.github.crew102.rapidrake.data.SmartWords;
import io.github.crew102.rapidrake.model.*;
import io.github.crew102.rapidrake.opennlpUtils.Tagger;
import io.github.crew102.rapidrake.opennlpUtils.SentDetector;

public class TestRapidRake {

private static String delims = "[-,.?():;\"!/]";
private static String posUrl = "model-bin/en-pos-maxent.bin";
private static String sentUrl = "model-bin/en-sent.bin";

@Test
public void testMinAlg() throws java.io.IOException {

@Test
public void testMinAlg() throws java.io.IOException {

String[] stopWords = {""};
String[] stopPOS = {""};
RakeParams params = new RakeParams(stopWords, stopPOS, 0, false, delims);
RakeAlgorithm minRakeAlg = new RakeAlgorithm(params, posUrl, sentUrl);
Result aRes = minRakeAlg.rake("here is some text");
String[] keys = aRes.getFullKeywords();
float[] scores = aRes.getScores();

assertEquals("here is some text", keys[0]);
assertEquals("Incorrect scoring", 16, scores[0], 0);

Result res2 = minRakeAlg.rake("also, here is some more. text.");
String[] keys2 = res2.getFullKeywords();
float[] scores2 = res2.getScores();

assertEquals("also", keys2[0]);
assertEquals("here is some more", keys2[1]);
assertEquals("text", keys2[2]);
assertEquals("Incorrect scoring", 1, scores2[0], 0);
}
Result aRes = minRakeAlg.rake("here is some text");
String[] keys = aRes.getFullKeywords();
float[] scores = aRes.getScores();
assertEquals("here is some text", keys[0]);
assertEquals("Incorrect scoring", 16, scores[0], 0);
Result res2 = minRakeAlg.rake("also, here is some more. text.");
String[] keys2 = res2.getFullKeywords();
float[] scores2 = res2.getScores();
assertEquals("also", keys2[0]);
assertEquals("here is some more", keys2[1]);
assertEquals("text", keys2[2]);
assertEquals("Incorrect scoring", 1, scores2[0], 0);
}

@Test
public void testStopWordRemoval() throws java.io.IOException {

@Test
public void testStopWordRemoval() throws java.io.IOException {
String[] stopWords = {"text"};
String[] stopPOS = {""};
RakeParams params = new RakeParams(stopWords, stopPOS, 0, true, delims);
RakeAlgorithm rakeAlg = new RakeAlgorithm(params, posUrl, sentUrl);
Result aRes = rakeAlg.rake("here is some. text");
String[] keys = aRes.getFullKeywords();

assertEquals("Stopword removal", 1, keys.length, 0);
}
Result aRes = rakeAlg.rake("here is some. text");
String[] keys = aRes.getFullKeywords();
assertEquals("Stopword removal", 1, keys.length, 0);
}

@Test
public void testStopPOSRemoval() throws java.io.IOException {
@Test
public void testStopPOSRemoval() throws java.io.IOException {

String[] stopWords = {""};
String[] stopPOS = {"VBD"};
RakeParams params = new RakeParams(stopWords, stopPOS, 0, true, delims);
RakeAlgorithm rakeAlg = new RakeAlgorithm(params, posUrl, sentUrl);
Result aRes = rakeAlg.rake("I ran to the store");
String[] keys = aRes.getFullKeywords();

assertEquals("StopPOS removal", 2, keys.length, 0);
}
Result aRes = rakeAlg.rake("I ran to the store");
String[] keys = aRes.getFullKeywords();
assertEquals("StopPOS removal", 2, keys.length, 0);
}

@Test
public void testMinWordCharRemoval() throws java.io.IOException {

@Test
public void testMinWordCharRemoval() throws java.io.IOException {
String[] stopWords = {""};
String[] stopPOS = {""};
RakeParams params = new RakeParams(stopWords, stopPOS, 2, true, delims);
RakeAlgorithm rakeAlg = new RakeAlgorithm(params, posUrl, sentUrl);
Result aRes = rakeAlg.rake("I ran to the store");
String[] keys = aRes.getFullKeywords();

assertEquals("ran to the store", keys[0]);
}
Result aRes = rakeAlg.rake("I ran to the store");
String[] keys = aRes.getFullKeywords();
assertEquals("ran to the store", keys[0]);
}

@Test
public void testStemScoring() throws java.io.IOException {

@Test
public void testStemScoring() throws java.io.IOException {
String[] stopWords = {""};
String[] stopPOS = {""};
RakeParams params = new RakeParams(stopWords, stopPOS, 0, true, delims);
RakeAlgorithm rakeAlg = new RakeAlgorithm(params, posUrl, sentUrl);
Result aRes = rakeAlg.rake("good dogs. good. dog");
float[] scores = aRes.getScores();

Result aRes = rakeAlg.rake("good dogs. good. dog");
float[] scores = aRes.getScores();
RakeParams params2 = new RakeParams(stopWords, stopPOS, 0, false, delims);
RakeAlgorithm rakeAlg2 = new RakeAlgorithm(params2, posUrl, sentUrl);
Result aRes2 = rakeAlg2.rake("good dog. good. dog");
float[] scores2 = aRes2.getScores();

assertArrayEquals(scores, scores2, 0);
}
Result aRes2 = rakeAlg2.rake("good dog. good. dog");
float[] scores2 = aRes2.getScores();
assertArrayEquals(scores, scores2, 0);
}

@Test
public void testDelims() throws java.io.IOException {

@Test
public void testDelims() throws java.io.IOException {
String[] stopWords = {""};
String[] stopPOS = {""};
RakeParams params = new RakeParams(stopWords, stopPOS, 0, true, "[.]"); // no comma
RakeAlgorithm rakeAlg = new RakeAlgorithm(params, posUrl, sentUrl);
Result aRes = rakeAlg.rake("good dogs, good. dog");
String[] keys = aRes.getFullKeywords();
Result aRes = rakeAlg.rake("good dogs, good. dog");
String[] keys = aRes.getFullKeywords();

assertEquals(2, keys.length, 0);
}

assertEquals(2, keys.length, 0);
}
@Test
public void testStemmerLang() throws java.io.IOException {

String[] stopWords = new SmartWords().getSmartWords();
String[] stopPOS = {"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"};
String txtEl = "dependent dogs. dependable dogs";

RakeParams params = new RakeParams(stopWords, stopPOS, 0, true, delims);
RakeAlgorithm alg = new RakeAlgorithm(params, posUrl, sentUrl);
Result res = alg.rake(txtEl);

RakeParams frenchStem = new RakeParams(stopWords, stopPOS, 0, true, delims,
SnowballStemmer.ALGORITHM.FRENCH);
RakeAlgorithm frenchAlg = new RakeAlgorithm(frenchStem, posUrl, sentUrl);
Result frenchRes = frenchAlg.rake(txtEl);

assertFalse(Arrays.equals(res.getStemmedKeywords(), frenchRes.getStemmedKeywords()));
}

@Test
public void testStemmerLang() throws java.io.IOException {
public void testSecondaryConstructor() throws java.io.IOException {

String[] stopWords = {""};
String[] stopPOS = {""};
RakeParams params = new RakeParams(stopWords, stopPOS, 0, false, delims);
// Create the required model objects up front; these instances can be cached (e.g. in a singleton) and reused
POSTaggerME tagger = new Tagger(posUrl).getPosTagger();
SentenceDetectorME sentDetect = new SentDetector(sentUrl).getSentDetector();

String[] stopWords = new SmartWords().getSmartWords();
String[] stopPOS = {"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"};
String txtEl = "dependent dogs. dependable dogs";
RakeAlgorithm minRakeAlg = new RakeAlgorithm(params, tagger, sentDetect);
Result aRes = minRakeAlg.rake("here is some text");
String[] keys = aRes.getFullKeywords();
float[] scores = aRes.getScores();

RakeParams params = new RakeParams(stopWords, stopPOS, 0, true, delims);
RakeAlgorithm alg = new RakeAlgorithm(params, posUrl, sentUrl);
Result res = alg.rake(txtEl);
assertEquals("here is some text", keys[0]);
assertEquals("Incorrect scoring", 16, scores[0], 0);

RakeParams frenchStem = new RakeParams(stopWords, stopPOS, 0, true, delims,
SnowballStemmer.ALGORITHM.FRENCH);
RakeAlgorithm frenchAlg = new RakeAlgorithm(frenchStem, posUrl, sentUrl);
Result frenchRes = frenchAlg.rake(txtEl);
Result res2 = minRakeAlg.rake("also, here is some more. text.");
String[] keys2 = res2.getFullKeywords();
float[] scores2 = res2.getScores();

assertFalse(Arrays.equals(res.getStemmedKeywords(), frenchRes.getStemmedKeywords()));
assertEquals("also", keys2[0]);
assertEquals("here is some more", keys2[1]);
assertEquals("text", keys2[2]);
assertEquals("Incorrect scoring", 1, scores2[0], 0);
}

}

0 comments on commit d3c787a

Please sign in to comment.