diff --git a/README.md b/README.md index cff5895..0c75855 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,9 @@ You can use the `help` command at any moment to learn about the functionalities - Recognise verb conjugations. - More statistical data on words. +# Known limitations/bugs +- Currently, words containing apostrophes will be counted as two separate words. Example: `C'thulhu` = `C`, `thulhu`. + # Supported formats: - Plain text (`.txt` etc). - `.pdf` (including encrypted). diff --git a/uniqword.py b/uniqword.py index 52c057c..ca2cfd0 100644 --- a/uniqword.py +++ b/uniqword.py @@ -6,6 +6,7 @@ import os # Used for directory-wide operations. import time # Used by the command-line interface for sleep() when bidding farewell to the user. import zipfile # Used to read odt files. +import re # Used for text parsing. from typing import Optional # Used for type hinting. import PyPDF2 # Used to read PDF files. @@ -18,6 +19,15 @@ # The default number of elements for frequency lists if unspecified by user input. FREQUENCY_TOP = 20 +# Symbols to accept within words. +ACCEPT = ("-", "_") + +# Subset of ACCEPT to remove from start/end of words. +REMOVE = ("-",) + +# Symbols (regex) to count as word separators. +SEPARATORS = r"\s'" + class DecryptionError(Exception): """Catches the event in which an encrypted file is provided with a wrong password or none at all.""" @@ -140,18 +150,18 @@ def purify_words(contents: str) -> list: all_words = [] # Separate words. - contents = contents.split(" ") + contents = re.split(r"["+SEPARATORS+r"]", contents) # Filter out "empty" words and filter characters inside words to make sure we only get real(istic) words. for word in filter(lambda w: w not in ["", "\n"], contents): # Get all alphanumeric characters, plus hyphens and underscores. - word = [char for char in word if char.isalnum() or char in ["-", "_"]] + word = [char for char in word if char.isalnum() or char in ACCEPT] while len(word): # Ensure we're not working on an empty word. # Remove hyphens at start or end. - if word[0] == "-": + if word[0] in REMOVE: word.pop(0) continue - if word[-1] == "-": + if word[-1] in REMOVE: word.pop(-1) continue