Skip to content
This repository has been archived by the owner on Jul 1, 2018. It is now read-only.

Commit

Permalink
improved text parsing algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
berzi committed Feb 21, 2018
1 parent ecf421e commit 44cb509
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 4 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ You can use the `help` command at any moment to learn about the functionalities
- Recognise verb conjugations.
- More statistical data on words.

# Known limitations/bugs
- Currently, words containing apostrophes are split at each apostrophe and counted as separate words. Example: `C'thulhu` is counted as `C` and `thulhu`.

# Supported formats:
- Plain text (`.txt` etc).
- `.pdf` (including encrypted).
Expand Down
18 changes: 14 additions & 4 deletions uniqword.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import os # Used for directory-wide operations.
import time # Used by the command-line interface for sleep() when bidding farewell to the user.
import zipfile # Used to read odt files.
import re # Used for text parsing.
from typing import Optional # Used for type hinting.

import PyPDF2 # Used to read PDF files.
Expand All @@ -18,6 +19,15 @@
# The default number of elements for frequency lists if unspecified by user input.
FREQUENCY_TOP = 20

# Non-alphanumeric symbols that are kept when they appear inside a word.
ACCEPT = ("-", "_")

# Subset of ACCEPT that is stripped from the start and end of a word.
REMOVE = ("-",)

# Characters that count as word separators, written as the body of a regex
# character class (it is wrapped in "[...]" before use): whitespace and apostrophes.
SEPARATORS = r"\s'"


class DecryptionError(Exception):
"""Catches the event in which an encrypted file is provided with a wrong password or none at all."""
Expand Down Expand Up @@ -140,18 +150,18 @@ def purify_words(contents: str) -> list:
all_words = []

# Separate words.
contents = contents.split(" ")
contents = re.split(r"["+SEPARATORS+r"]", contents)

# Filter out "empty" words and filter characters inside words to make sure we only get real(istic) words.
for word in filter(lambda w: w not in ["", "\n"], contents):
# Get all alphanumeric characters, plus hyphens and underscores.
word = [char for char in word if char.isalnum() or char in ["-", "_"]]
word = [char for char in word if char.isalnum() or char in ACCEPT]
while len(word): # Ensure we're not working on an empty word.
# Remove hyphens at start or end.
if word[0] == "-":
if word[0] in REMOVE:
word.pop(0)
continue
if word[-1] == "-":
if word[-1] in REMOVE:
word.pop(-1)
continue

Expand Down

0 comments on commit 44cb509

Please sign in to comment.