diff --git a/filters/alliteration/README.md b/filters/alliteration/README.md new file mode 100644 index 000000000..995d70cdf --- /dev/null +++ b/filters/alliteration/README.md @@ -0,0 +1,81 @@ +## Alliteration filter + +**Author: Marie Tolkiehn**\ +Center for Data and Computing in Natural Sciences, Universität Hamburg\ +marie.tolkiehn@desy.de + + +## What type of a filter is this? + +This filter returns True if any of the input sentences is an alliteration and False otherwise. +By default, stop words are removed and do not count toward the alliteration. +However, should the sentence consist solely of stop words, they will not be removed. + +A sentence is deemed an alliteration if it contains words starting with the same character or digraph ("ch", "ph", "sh", "th"). +The minimum alliteration length then governs how many words starting with the same first phoneme are required to be deemed a valid alliteration. +The default minimum alliteration length is 3. + +These alliterative words do not need to appear contiguously in the sentence, as long as they fall within a window of `min_alliteration_length + allowed_offwords` words (3 + 2 = 5 by default). +This means that e.g. "Peter Aquarium prepared a pepperoni pizza." is a valid alliteration +as it contains at least the (default) 3 alliterative non-stopword words (despite "Aquarium"). + +## Why is this filter important? +Alliterations attract audiences. +Alliterations are a stylistic device and trope of literature or poetry. +However, alliterations are around us all the time. From newspaper headlines +("Beer Baron Beats Banner" or "Banner Bars Booze (Booze Barred By Banner)" (c) The Simpsons) +to ads ("Taco Tuesdays"), company/brand names ("Coca Cola", "Bed, Bath & Beyond", "PayPal"), +protagonists ("Peter Pevensie", "Peter Pan", "Bilbo Baggins", "Donald Duck") +and even academic publications, writers often use alliterations to catch the reader's (or listener's) attention, +as through sound repetition, they are catchy and easy to remember. 
+Alliterations generally sound pleasing, and different phonemes create different rhythms and vibes. +For example, alliterations starting with S are often connected to snake-like features, +whereas alliterations with plosives such as P create a particular rhythm. + +This filter could check just how prevalent alliterations are in various types of texts and whether there are particular areas in which they are especially prevalent. +A good language model may then be able to generate synonymous alliterations from non-alliterative texts. + +## Robustness Evaluation +### Removing Stopwords (True), minimum alliteration length = 3 +Here is the performance of the model on the filtered set: +* **IMDB**\ + `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/roberta-base-imdb" -d "imdb" -p 20`\ + The accuracy on this subset which has 612 examples = 95.0 + +* **SST-2**\ + `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/roberta-base-SST-2" -d "sst2" -p 20`\ + The accuracy on this subset which has 17 examples = 88.0 + +* **QQP** \ + `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/bert-base-uncased-QQP" -d "qqp" -p 20`\ + The accuracy on this subset which has 31 examples = 97.0 + +* **MNLI**\ + `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "roberta-large-mnli" -d "multi_nli" -p 20`\ + The accuracy on this subset which has 128 examples = 91.0 + + +### Not removing stopwords (False), minimum alliteration length = 3 +* **IMDB**\ + `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/roberta-base-imdb" -d "imdb" -p 20`\ + The accuracy on this subset which has 886 examples = 95.0 +* **SST-2**\ + `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/roberta-base-SST-2" -d "sst2" -p 20`\ + The accuracy on this subset which has 34 examples = 97.0 +* **QQP** \ + `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m 
class Alliteration(SentenceOperation):
    """Filter that accepts text containing at least one alliterative sentence.

    A sentence counts as alliterative when, inside some window of
    ``min_alliteration_length + allowed_offwords`` consecutive words, at
    least ``min_alliteration_length`` words start with the same letter or
    the same digraph ("ch", "ph", "sh", "th"). The matching words therefore
    need not be adjacent, but they must fall within that window.

    The comparison is purely graphematic (spelling-based): words are
    lower-cased and ASCII punctuation is stripped before their first one or
    two characters are compared.
    """

    tasks = [TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TO_TEXT_GENERATION]
    languages = ["en"]
    keywords = ["morphological"]

    # Two-letter onsets compared as a unit, so that e.g. "sand" and "shady"
    # are not treated as alliterating.
    _DIGRAPHS = ("ch", "ph", "sh", "th")
    # Built once: deletes every ASCII punctuation character from a string.
    _PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

    def __init__(
        self,
        stopwords: bool = True,
        min_alliteration_length: int = 3,
        allowed_offwords: int = 2,
    ):
        """Configure the filter.

        Args:
            stopwords: If True, stop words are removed before matching,
                unless a sentence consists of stop words only.
            min_alliteration_length: Number of words sharing an onset that
                is required for a sentence to count as an alliteration.
            allowed_offwords: Number of additional words the matching
                window may contain besides the alliterating ones.
        """
        super().__init__()
        self.stopwords = stopwords
        self.min_alliteration_length = min_alliteration_length
        self.allowed_offwords = allowed_offwords
        # Reuse the shared pipeline when one is provided; otherwise load
        # the small English model.
        self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm")

    def filter(self, sentence: str = None, min_sentence_length=3) -> bool:
        """Return True if any sentence of the input is an alliteration.

        Args:
            sentence: Text to examine; it may contain several sentences.
                ``None`` or an empty string yields False.
            min_sentence_length: Segments whose text is not longer than
                this many characters are ignored.

        Returns:
            True as soon as one sentence qualifies, False otherwise.
        """
        if not sentence:
            # The declared default is None; there is nothing to match.
            return False

        return any(
            self._is_alliterative([self._onset(word) for word in words])
            for words in self._tokenize_sentences(
                sentence, min_sentence_length
            )
        )

    def _onset(self, word: str) -> str:
        """Return the leading digraph of ``word`` if it has one, else its first character."""
        if word.startswith(self._DIGRAPHS):
            return word[:2]
        return word[:1]

    def _tokenize_sentences(self, text: str, min_sentence_length: int) -> list:
        """Split ``text`` into sentences and return each as a list of lower-cased words."""
        stop_words = self.nlp.Defaults.stop_words
        tokenized = []
        for span in self.nlp(text.lstrip()).sents:
            raw = span.text
            # Skip fragments that are too short or contain no cased letter
            # at all (e.g. runs of digits or punctuation).
            if len(raw) <= min_sentence_length or not raw.lower().islower():
                continue

            words = raw.lower().translate(self._PUNCTUATION_TABLE).split()

            if self.stopwords:
                content_words = [w for w in words if w not in stop_words]
                # A sentence made up solely of stop words keeps them;
                # otherwise nothing would be left to compare.
                if content_words:
                    words = content_words

            tokenized.append(words)
        return tokenized

    def _is_alliterative(self, onsets: list) -> bool:
        """Return True if any window of ``onsets`` repeats one onset often enough."""
        # The window shrinks to the sentence length for short sentences.
        window = min(
            self.min_alliteration_length + self.allowed_offwords, len(onsets)
        )
        if window <= 0:
            return False

        for start in range(len(onsets) - window + 1):
            chunk = onsets[start:start + window]
            if any(
                chunk.count(onset) >= self.min_alliteration_length
                for onset in set(chunk)
            ):
                return True
        return False
+ }, + "outputs": true + }, + { + "class": "Alliteration", + "args": { + "stopwords": false + }, + "inputs": { + "sentence": "Andrew always asks Anne about anchovies." + }, + "outputs": true + }, + { + "class": "Alliteration", + "args": { + "stopwords": false + }, + "inputs": { + "sentence": "She showed Shawn shady shandy." + }, + "outputs": true + }, + { + "class": "Alliteration", + "args": { + "stopwords": false + }, + "inputs": { + "sentence": "She showed Shawn some shady shandy." + }, + "outputs": true + }, + { + "class": "Alliteration", + "args": { + "stopwords": false + }, + "inputs": { + "sentence": "Peter Piper picked a peck of pickled peppers." + }, + "outputs": true + }, + { + "class": "Alliteration", + "args": { + "stopwords": true + }, + "inputs": { + "sentence": "4 *((( ::). She showed Aquarium Shawn shady shandy. This is the second sentence Sandy sorted. It is imminent in Iowa." + }, + "outputs": true + }, + { + "class": "Alliteration", + "args": { + "stopwords": false, + "min_alliteration_length": 5 + }, + "inputs": { + "sentence": "4 *((( ::). She offered Shawn super shandy. This is the second sentence Sandy sorted. It is imminent in Iowa." + }, + "outputs": true + }, + { + "class": "Alliteration", + "args": { + "stopwords": true, + "min_alliteration_length": 5 + }, + "inputs": { + "sentence": "4 *((( ::). She offered Shawn super shandy. This is the second sentence Sandy sorted. It is imminent in Iowa." + }, + "outputs": false + } + ] +} +