diff --git a/README.md b/README.md index a579cd8..296ffd4 100644 --- a/README.md +++ b/README.md @@ -45,9 +45,12 @@ public class Example { RakeParams params = new RakeParams(stopWords, stopPOS, minWordChar, shouldStem, phraseDelims); // Create a RakeAlgorithm object + // You can use the RakeAlgorithm(RakeParams, POSTaggerME, SentenceDetectorME) + // constructor instead of the one shown below if you want to pass in + // pre-initialized opennlp models. String POStaggerURL = "model-bin/en-pos-maxent.bin"; // The path to your POS tagging model - String SentDetecURL = "model-bin/en-sent.bin"; // The path to your sentence detection model - RakeAlgorithm rakeAlg = new RakeAlgorithm(params, POStaggerURL, SentDetecURL); + String SentDetectURL = "model-bin/en-sent.bin"; // The path to your sentence detection model + RakeAlgorithm rakeAlg = new RakeAlgorithm(params, POStaggerURL, SentDetectURL); // Call the rake method String txt = "dogs are great, don't you agree? I love dogs, especially big dogs"; diff --git a/pom.xml b/pom.xml index ecc8ed0..b25dc19 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ jar rapidrake ${project.groupId}:${project.artifactId} - 0.1.4 + 0.1.5 A fast version of the Rapid Automatic Keyword Extraction (RAKE) algorithm https://github.com/crew102/rapidrake-java diff --git a/src/main/java/io/github/crew102/rapidrake/RakeAlgorithm.java b/src/main/java/io/github/crew102/rapidrake/RakeAlgorithm.java index 96c0c5a..444bdf0 100644 --- a/src/main/java/io/github/crew102/rapidrake/RakeAlgorithm.java +++ b/src/main/java/io/github/crew102/rapidrake/RakeAlgorithm.java @@ -64,7 +64,35 @@ public RakeAlgorithm(RakeParams rakeParams, InputStream taggerStream, InputStrea this.tagger = new Tagger(taggerStream).getPosTagger(); this.sentDetector = new SentDetector(sentDectStream).getSentDetector(); } - + + /** + * Constructor. See below for example usage. 
Use this constructor if you want + * to avoid the overhead of creating the POS tagger and sentence detector + * models each time you create a new instance of RakeAlgorithm. + * + *
+   * String POStaggerURL = "model-bin/en-pos-maxent.bin";
+   * String SentDetectURL = "model-bin/en-sent.bin";
+   *
+   * // Create the required model objects; you can cache these instances in a singleton.
+   * POSTaggerME tagger = new Tagger(POStaggerURL).getPosTagger();
+   * SentenceDetectorME sentDetect = new SentDetector(SentDetectURL).getSentDetector();
+   *
+   * // Creating an instance of RakeAlgorithm is now fast, since the models are already loaded.
+   * RakeAlgorithm rakeAlg = new RakeAlgorithm(params, tagger, sentDetect);
+   * 
+ * + * @param rakeParams the parameters RAKE will use + * @param posTaggerME An instance of opennlp.tools.postag.POSTaggerME. + * @param sentDetectorME An instance of opennlp.tools.sentdetect.SentenceDetectorME. + * @see RakeParams + */ + public RakeAlgorithm(RakeParams rakeParams, POSTaggerME posTaggerME, SentenceDetectorME sentDetectorME) { + this.rakeParams = rakeParams; + this.tagger = posTaggerME; + this.sentDetector = sentDetectorME; + } + /** * Run RAKE on a single string. * @@ -87,12 +115,23 @@ private String[] getTokens(String txtEl) { ArrayList tokenList = new ArrayList(); Pattern anyWordChar = Pattern.compile("[a-z]"); - String[] sents = sentDetector.sentDetect(txtPadded); + String[] sents; + // Make sure that the sentence detector is thread safe + synchronized(sentDetector){ + sents = sentDetector.sentDetect(txtPadded); + } + + WhitespaceTokenizer wsTokenizer = WhitespaceTokenizer.INSTANCE; for (String sentence : sents) { - String[] tokenArray = WhitespaceTokenizer.INSTANCE.tokenize(sentence); - String[] tags = tagger.tag(tokenArray); + String[] tokenArray = wsTokenizer.tokenize(sentence); + + String[] tags; + // Make sure that the tagger is thread safe + synchronized(tagger){ + tags = tagger.tag(tokenArray); + } for (int i = 0; i < tokenArray.length; i++) { diff --git a/src/test/java/io/github/crew102/rapidrake/TestRapidRake.java b/src/test/java/io/github/crew102/rapidrake/TestRapidRake.java index 423f134..cfc5877 100644 --- a/src/test/java/io/github/crew102/rapidrake/TestRapidRake.java +++ b/src/test/java/io/github/crew102/rapidrake/TestRapidRake.java @@ -6,125 +6,159 @@ import org.junit.Test; import opennlp.tools.stemmer.snowball.SnowballStemmer; +import opennlp.tools.postag.POSTaggerME; +import opennlp.tools.sentdetect.SentenceDetectorME; import io.github.crew102.rapidrake.data.SmartWords; import io.github.crew102.rapidrake.model.*; +import io.github.crew102.rapidrake.opennlpUtils.Tagger; +import io.github.crew102.rapidrake.opennlpUtils.SentDetector; public
class TestRapidRake { - + private static String delims = "[-,.?():;\"!/]"; private static String posUrl = "model-bin/en-pos-maxent.bin"; private static String sentUrl = "model-bin/en-sent.bin"; - - @Test - public void testMinAlg() throws java.io.IOException { + + @Test + public void testMinAlg() throws java.io.IOException { + String[] stopWords = {""}; String[] stopPOS = {""}; RakeParams params = new RakeParams(stopWords, stopPOS, 0, false, delims); RakeAlgorithm minRakeAlg = new RakeAlgorithm(params, posUrl, sentUrl); - Result aRes = minRakeAlg.rake("here is some text"); - String[] keys = aRes.getFullKeywords(); - float[] scores = aRes.getScores(); - - assertEquals("here is some text", keys[0]); - assertEquals("Incorrect scoring", 16, scores[0], 0); - - Result res2 = minRakeAlg.rake("also, here is some more. text."); - String[] keys2 = res2.getFullKeywords(); - float[] scores2 = res2.getScores(); - - assertEquals("also", keys2[0]); - assertEquals("here is some more", keys2[1]); - assertEquals("text", keys2[2]); - assertEquals("Incorrect scoring", 1, scores2[0], 0); - } + Result aRes = minRakeAlg.rake("here is some text"); + String[] keys = aRes.getFullKeywords(); + float[] scores = aRes.getScores(); + + assertEquals("here is some text", keys[0]); + assertEquals("Incorrect scoring", 16, scores[0], 0); + + Result res2 = minRakeAlg.rake("also, here is some more. 
text."); + String[] keys2 = res2.getFullKeywords(); + float[] scores2 = res2.getScores(); + + assertEquals("also", keys2[0]); + assertEquals("here is some more", keys2[1]); + assertEquals("text", keys2[2]); + assertEquals("Incorrect scoring", 1, scores2[0], 0); + } - @Test - public void testStopWordRemoval() throws java.io.IOException { - + @Test + public void testStopWordRemoval() throws java.io.IOException { + String[] stopWords = {"text"}; String[] stopPOS = {""}; RakeParams params = new RakeParams(stopWords, stopPOS, 0, true, delims); RakeAlgorithm rakeAlg = new RakeAlgorithm(params, posUrl, sentUrl); - Result aRes = rakeAlg.rake("here is some. text"); - String[] keys = aRes.getFullKeywords(); - - assertEquals("Stopword removal", 1, keys.length, 0); - } + Result aRes = rakeAlg.rake("here is some. text"); + String[] keys = aRes.getFullKeywords(); + + assertEquals("Stopword removal", 1, keys.length, 0); + } - @Test - public void testStopPOSRemoval() throws java.io.IOException { + @Test + public void testStopPOSRemoval() throws java.io.IOException { + String[] stopWords = {""}; String[] stopPOS = {"VBD"}; RakeParams params = new RakeParams(stopWords, stopPOS, 0, true, delims); RakeAlgorithm rakeAlg = new RakeAlgorithm(params, posUrl, sentUrl); - Result aRes = rakeAlg.rake("I ran to the store"); - String[] keys = aRes.getFullKeywords(); - - assertEquals("StopPOS removal", 2, keys.length, 0); - } + Result aRes = rakeAlg.rake("I ran to the store"); + String[] keys = aRes.getFullKeywords(); + + assertEquals("StopPOS removal", 2, keys.length, 0); + } - @Test - public void testMinWordCharRemoval() throws java.io.IOException { - + @Test + public void testMinWordCharRemoval() throws java.io.IOException { + String[] stopWords = {""}; String[] stopPOS = {""}; RakeParams params = new RakeParams(stopWords, stopPOS, 2, true, delims); RakeAlgorithm rakeAlg = new RakeAlgorithm(params, posUrl, sentUrl); - Result aRes = rakeAlg.rake("I ran to the store"); - String[] keys = 
aRes.getFullKeywords(); - - assertEquals("ran to the store", keys[0]); - } + Result aRes = rakeAlg.rake("I ran to the store"); + String[] keys = aRes.getFullKeywords(); + + assertEquals("ran to the store", keys[0]); + } - @Test - public void testStemScoring() throws java.io.IOException { - + @Test + public void testStemScoring() throws java.io.IOException { + String[] stopWords = {""}; String[] stopPOS = {""}; RakeParams params = new RakeParams(stopWords, stopPOS, 0, true, delims); RakeAlgorithm rakeAlg = new RakeAlgorithm(params, posUrl, sentUrl); - Result aRes = rakeAlg.rake("good dogs. good. dog"); - float[] scores = aRes.getScores(); - + Result aRes = rakeAlg.rake("good dogs. good. dog"); + float[] scores = aRes.getScores(); + RakeParams params2 = new RakeParams(stopWords, stopPOS, 0, false, delims); RakeAlgorithm rakeAlg2 = new RakeAlgorithm(params2, posUrl, sentUrl); - Result aRes2 = rakeAlg2.rake("good dog. good. dog"); - float[] scores2 = aRes2.getScores(); - - assertArrayEquals(scores, scores2, 0); - } + Result aRes2 = rakeAlg2.rake("good dog. good. dog"); + float[] scores2 = aRes2.getScores(); + + assertArrayEquals(scores, scores2, 0); + } - @Test - public void testDelims() throws java.io.IOException { - + @Test + public void testDelims() throws java.io.IOException { + String[] stopWords = {""}; String[] stopPOS = {""}; RakeParams params = new RakeParams(stopWords, stopPOS, 0, true, "[.]"); // no comma RakeAlgorithm rakeAlg = new RakeAlgorithm(params, posUrl, sentUrl); - Result aRes = rakeAlg.rake("good dogs, good. dog"); - String[] keys = aRes.getFullKeywords(); + Result aRes = rakeAlg.rake("good dogs, good. 
dog"); + String[] keys = aRes.getFullKeywords(); + + assertEquals(2, keys.length, 0); + } - assertEquals(2, keys.length, 0); - } + @Test + public void testStemmerLang() throws java.io.IOException { + + String[] stopWords = new SmartWords().getSmartWords(); + String[] stopPOS = {"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"}; + String txtEl = "dependent dogs. dependable dogs"; + + RakeParams params = new RakeParams(stopWords, stopPOS, 0, true, delims); + RakeAlgorithm alg = new RakeAlgorithm(params, posUrl, sentUrl); + Result res = alg.rake(txtEl); + + RakeParams frenchStem = new RakeParams(stopWords, stopPOS, 0, true, delims, + SnowballStemmer.ALGORITHM.FRENCH); + RakeAlgorithm frenchAlg = new RakeAlgorithm(frenchStem, posUrl, sentUrl); + Result frenchRes = frenchAlg.rake(txtEl); + + assertFalse(Arrays.equals(res.getStemmedKeywords(), frenchRes.getStemmedKeywords())); + } @Test - public void testStemmerLang() throws java.io.IOException { + public void testSecondaryConstructor() throws java.io.IOException { + + String[] stopWords = {""}; + String[] stopPOS = {""}; + RakeParams params = new RakeParams(stopWords, stopPOS, 0, false, delims); + // create the required model classes, you can cache these instances in a singleton + POSTaggerME tagger = new Tagger(posUrl).getPosTagger(); + SentenceDetectorME sentDetect = new SentDetector(sentUrl).getSentDetector(); - String[] stopWords = new SmartWords().getSmartWords(); - String[] stopPOS = {"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"}; - String txtEl = "dependent dogs. 
dependable dogs"; + RakeAlgorithm minRakeAlg = new RakeAlgorithm(params, tagger, sentDetect); + Result aRes = minRakeAlg.rake("here is some text"); + String[] keys = aRes.getFullKeywords(); + float[] scores = aRes.getScores(); - RakeParams params = new RakeParams(stopWords, stopPOS, 0, true, delims); - RakeAlgorithm alg = new RakeAlgorithm(params, posUrl, sentUrl); - Result res = alg.rake(txtEl); + assertEquals("here is some text", keys[0]); + assertEquals("Incorrect scoring", 16, scores[0], 0); - RakeParams frenchStem = new RakeParams(stopWords, stopPOS, 0, true, delims, - SnowballStemmer.ALGORITHM.FRENCH); - RakeAlgorithm frenchAlg = new RakeAlgorithm(frenchStem, posUrl, sentUrl); - Result frenchRes = frenchAlg.rake(txtEl); + Result res2 = minRakeAlg.rake("also, here is some more. text."); + String[] keys2 = res2.getFullKeywords(); + float[] scores2 = res2.getScores(); - assertFalse(Arrays.equals(res.getStemmedKeywords(), frenchRes.getStemmedKeywords())); + assertEquals("also", keys2[0]); + assertEquals("here is some more", keys2[1]); + assertEquals("text", keys2[2]); + assertEquals("Incorrect scoring", 1, scores2[0], 0); } }