improved text parsing algorithm

berzi · Feb 21, 2018 · 44cb509 · 44cb509
1 parent ecf421e
commit 44cb509
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -22,6 +22,9 @@ You can use the `help` command at any moment to learn about the functionalities
 - Recognise verb conjugations.
 - More statistical data on words.
 
+# Known limitations/bugs
+- Currently, apostrophes are treated as word separators, so a word containing an apostrophe is split at each apostrophe and counted as separate words. Example: `C'thulhu` is counted as `C` and `thulhu`.
+
 # Supported formats:
 - Plain text (`.txt` etc).
 - `.pdf` (including encrypted).

diff --git a/uniqword.py b/uniqword.py
@@ -6,6 +6,7 @@
 import os  # Used for directory-wide operations.
 import time  # Used by the command-line interface for sleep() when bidding farewell to the user.
 import zipfile  # Used to read odt files.
+import re  # Used for text parsing.
 from typing import Optional  # Used for type hinting.
 
 import PyPDF2  # Used to read PDF files.
@@ -18,6 +19,15 @@
 # The default number of elements for frequency lists if unspecified by user input.
 FREQUENCY_TOP = 20
 
+# Symbols to accept within words.
+ACCEPT = ("-", "_")
+
+# Subset of ACCEPT to remove from start/end of words.
+REMOVE = ("-",)
+
+# Characters to count as word separators, written as the contents of a regex character class (wrapped in [...] when used).
+SEPARATORS = r"\s'"
+
 
 class DecryptionError(Exception):
     """Catches the event in which an encrypted file is provided with a wrong password or none at all."""
@@ -140,18 +150,18 @@ def purify_words(contents: str) -> list:
         all_words = []
 
         # Separate words.
-        contents = contents.split(" ")
+        contents = re.split(r"["+SEPARATORS+r"]", contents)
 
         # Filter out "empty" words and filter characters inside words to make sure we only get real(istic) words.
         for word in filter(lambda w: w not in ["", "\n"], contents):
             # Get all alphanumeric characters, plus hyphens and underscores.
-            word = [char for char in word if char.isalnum() or char in ["-", "_"]]
+            word = [char for char in word if char.isalnum() or char in ACCEPT]
             while len(word):  # Ensure we're not working on an empty word.
                 # Remove hyphens at start or end.
-                if word[0] == "-":
+                if word[0] in REMOVE:
                     word.pop(0)
                     continue
-                if word[-1] == "-":
+                if word[-1] in REMOVE:
                     word.pop(-1)
                     continue