From 70c5a5c910eca6de36cf314afa8128bdfe172c9b Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Sun, 5 Nov 2017 21:59:24 -0500
Subject: [PATCH 01/13] Add rudimentary web scraping

---
 ebooks.py                 | 20 ++++++++++++++++----
 local_settings_example.py |  3 +++
 requirements.txt          |  3 ++-
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/ebooks.py b/ebooks.py
index 6e99c4c..a859511 100644
--- a/ebooks.py
+++ b/ebooks.py
@@ -3,6 +3,8 @@
 import sys
 import twitter
 import markov
+from bs4 import BeautifulSoup
+from urllib.request import urlopen
 try:
     # Python 3
     from html.entities import name2codepoint as n2c
@@ -50,7 +52,15 @@ def filter_tweet(tweet):
     tweet.text = re.sub(r'\xe9', 'e', tweet.text) #take out accented e
     return tweet.text
-
+def scrape_page(src_url, span_name):
+    print(">>> Generating from {0}".format(src_url))
+    page = urlopen(src_url)
+    soup = BeautifulSoup(page, 'html.parser')
+    spans = soup.find_all('span', attrs={'class': span_name})
+    titles = []
+    for span in spans:
+        titles.append(str(span.string))
+    return(titles)
 
 def grab_tweets(api, max_id=None):
     source_tweets=[]
@@ -72,22 +82,24 @@ def grab_tweets(api, max_id=None):
         guess = 0
 
     if guess == 0:
+        api=connect()
         if STATIC_TEST==True:
             file = TEST_SOURCE
             print(">>> Generating from {0}".format(file))
             string_list = open(file).readlines()
             for item in string_list:
-                source_tweets = item.split(",")
+                source_tweets = item.split(",")
+        elif SCRAPE_URL==True:
+            source_tweets = scrape_page(SRC_URL, SPAN_NAME)
         else:
             source_tweets = []
             for handle in SOURCE_ACCOUNTS:
                 user=handle
-                api=connect()
                 handle_stats = api.GetUser(screen_name=user)
                 status_count = handle_stats.statuses_count
                 max_id=None
                 if status_count<3200:
-                    my_range = (status_count/200) + 1
+                    my_range = int((status_count/200) + 1)
                 else:
                     my_range = 17
                 for x in range(my_range)[1:]:
diff --git a/local_settings_example.py b/local_settings_example.py
index 81e5945..d851a35 100644
--- a/local_settings_example.py
+++ b/local_settings_example.py
@@ -15,4 +15,7 @@
 DEBUG = True #Set this to False to start Tweeting live
 STATIC_TEST = False #Set this to True if you want to test Markov generation from a static file instead of the API.
 TEST_SOURCE = ".txt" #The name of a text file of a string-ified list for testing. To avoid unnecessarily hitting Twitter API. You can use the included testcorpus.txt, if needed.
+SCRAPE_URL = False #Set this to true to scrape a webpage.
+SRC_URL = '' #The URL to scrape
+SPAN_NAME = '' #The class of the <span> that contains the words you are looking for, e.g. "title"
 TWEET_ACCOUNT = "" #The name of the account you're tweeting to.
diff --git a/requirements.txt b/requirements.txt
index 4658fe9..be748d8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
-python-twitter
\ No newline at end of file
+python-twitter
+beautifulsoup4
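
As a reading aid (not part of the patch itself), here is a self-contained sketch of the approach the new `scrape_page()` takes: parse a page with BeautifulSoup and collect the text of every `<span>` of a given class. The HTML below is made up, and the class name `title` stands in for whatever `SPAN_NAME` would hold.

```python
from bs4 import BeautifulSoup

# A stand-in for a fetched page; a real run would get this from urlopen(SRC_URL).
html = """
<html><body>
  <span class="title">First headline</span>
  <span class="title">Second headline</span>
  <span class="byline">Ignored: wrong class</span>
</body></html>
"""

soup = BeautifulSoup(html, 'html.parser')
spans = soup.find_all('span', attrs={'class': 'title'})
titles = [str(span.string) for span in spans]
print(titles)  # ['First headline', 'Second headline']
```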
From c669302006163f12ee148a9748325f86797dc6d9 Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Sun, 5 Nov 2017 22:02:55 -0500
Subject: [PATCH 02/13] Update ebooks.py

---
 ebooks.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ebooks.py b/ebooks.py
index a859511..16de503 100644
--- a/ebooks.py
+++ b/ebooks.py
@@ -4,13 +4,14 @@
 import twitter
 import markov
 from bs4 import BeautifulSoup
-from urllib.request import urlopen
 try:
     # Python 3
     from html.entities import name2codepoint as n2c
+    from urllib.request import urlopen
 except ImportError:
     # Python 2
     from htmlentitydefs import name2codepoint as n2c
+    from urllib2 import urlopen
     chr = unichr
 from local_settings import *

From 40ff672296a3e4eac532a7863c6ea6710bbf9518 Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Tue, 7 Nov 2017 10:25:33 -0500
Subject: [PATCH 03/13] Make web scraping more flexible

---
 ebooks.py                 | 21 +++++++++++----------
 local_settings_example.py |  5 +++--
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/ebooks.py b/ebooks.py
index a859511..df695b0 100644
--- a/ebooks.py
+++ b/ebooks.py
@@ -52,15 +52,16 @@ def filter_tweet(tweet):
     tweet.text = re.sub(r'\xe9', 'e', tweet.text) #take out accented e
     return tweet.text
 
-def scrape_page(src_url, span_name):
-    print(">>> Generating from {0}".format(src_url))
-    page = urlopen(src_url)
-    soup = BeautifulSoup(page, 'html.parser')
-    spans = soup.find_all('span', attrs={'class': span_name})
-    titles = []
-    for span in spans:
-        titles.append(str(span.string))
-    return(titles)
+def scrape_page(src_url, web_context, web_attributes):
+    tweets = []
+    for i in range(len(src_url)):
+        print(">>> Scraping {0}".format(src_url[i]))
+        page = urlopen(src_url[i])
+        soup = BeautifulSoup(page, 'html.parser')
+        hits = soup.find_all(web_context[i], attrs=web_attributes[i])
+        for hit in hits:
+            tweets.append(str(hit.text).strip())
+    return(tweets)
 
 def grab_tweets(api, max_id=None):
     source_tweets=[]
@@ -90,7 +91,7 @@ def grab_tweets(api, max_id=None):
         for item in string_list:
             source_tweets = item.split(",")
     elif SCRAPE_URL==True:
-        source_tweets = scrape_page(SRC_URL, SPAN_NAME)
+        source_tweets = scrape_page(SRC_URL, WEB_CONTEXT, WEB_ATTRIBUTES)
     else:
         source_tweets = []
         for handle in SOURCE_ACCOUNTS:
diff --git a/local_settings_example.py b/local_settings_example.py
index d851a35..26f41c9 100644
--- a/local_settings_example.py
+++ b/local_settings_example.py
@@ -16,6 +16,7 @@
 STATIC_TEST = False #Set this to True if you want to test Markov generation from a static file instead of the API.
 TEST_SOURCE = ".txt" #The name of a text file of a string-ified list for testing. To avoid unnecessarily hitting Twitter API. You can use the included testcorpus.txt, if needed.
 SCRAPE_URL = False #Set this to true to scrape a webpage.
-SRC_URL = '' #The URL to scrape
-SPAN_NAME = '' #The class of the <span> that contains the words you are looking for, e.g. "title"
+SRC_URL =['http://www.example.com/one', 'https://www.example.com/two'] #A comma-separated list of URLs to scrape
+WEB_CONTEXT = ['span', 'h2'] #A comma-separated list of the tag or object to search for in each page above.
+WEB_ATTRIBUTES = [{'class': 'example-text'}, {}] #A list of dictionaries containing the attributes for each page.
 TWEET_ACCOUNT = "" #The name of the account you're tweeting to.
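
To make the new settings concrete (an illustrative `local_settings.py` fragment, not part of the patch): the three lists are parallel, so entry i of `WEB_CONTEXT` and `WEB_ATTRIBUTES` is applied to entry i of `SRC_URL`. The URLs are placeholders.

```python
SCRAPE_URL = True
SRC_URL = ['http://www.example.com/one', 'https://www.example.com/two']
WEB_CONTEXT = ['span', 'h2']                      # <span> tags on page one, <h2> tags on page two
WEB_ATTRIBUTES = [{'class': 'example-text'}, {}]  # filter page one by class; no filter on page two

# scrape_page(SRC_URL, WEB_CONTEXT, WEB_ATTRIBUTES) then runs, per page:
#   soup.find_all('span', attrs={'class': 'example-text'})   # page one
#   soup.find_all('h2', attrs={})                            # page two
# and appends each hit's stripped text to the list of candidate tweets.
```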
From 35c8ee29768e957a0940e9fb02ffb8cc3723556c Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Tue, 7 Nov 2017 10:28:09 -0500
Subject: [PATCH 04/13] Update local_settings_example.py

---
 local_settings_example.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/local_settings_example.py b/local_settings_example.py
index 26f41c9..e5e83b9 100644
--- a/local_settings_example.py
+++ b/local_settings_example.py
@@ -16,7 +16,7 @@
 STATIC_TEST = False #Set this to True if you want to test Markov generation from a static file instead of the API.
 TEST_SOURCE = ".txt" #The name of a text file of a string-ified list for testing. To avoid unnecessarily hitting Twitter API. You can use the included testcorpus.txt, if needed.
 SCRAPE_URL = False #Set this to true to scrape a webpage.
-SRC_URL =['http://www.example.com/one', 'https://www.example.com/two'] #A comma-separated list of URLs to scrape
+SRC_URL = ['http://www.example.com/one', 'https://www.example.com/two'] #A comma-separated list of URLs to scrape
 WEB_CONTEXT = ['span', 'h2'] #A comma-separated list of the tag or object to search for in each page above.
 WEB_ATTRIBUTES = [{'class': 'example-text'}, {}] #A list of dictionaries containing the attributes for each page.
 TWEET_ACCOUNT = "" #The name of the account you're tweeting to.

From d32e3171bd9fd32a459dc465bf409444b8535cb3 Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Tue, 7 Nov 2017 11:47:18 -0500
Subject: [PATCH 05/13] Add minimal error handling.

---
 ebooks.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/ebooks.py b/ebooks.py
index ad40c2e..cbd8009 100644
--- a/ebooks.py
+++ b/ebooks.py
@@ -57,7 +57,13 @@ def scrape_page(src_url, web_context, web_attributes):
     tweets = []
     for i in range(len(src_url)):
         print(">>> Scraping {0}".format(src_url[i]))
-        page = urlopen(src_url[i])
+        try:
+            page = urlopen(src_url[i])
+        except Exception:
+            import traceback
+            print(">>> Error scraping {0}:".format(src_url[i]))
+            print(traceback.format_exc())
+            continue
         soup = BeautifulSoup(page, 'html.parser')
         hits = soup.find_all(web_context[i], attrs=web_attributes[i])
         for hit in hits:

From a9b0988844bcd9821632d66dbb03f26b2f2c2daa Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Tue, 7 Nov 2017 21:34:57 -0500
Subject: [PATCH 06/13] Allow all three sources at once

---
 ebooks.py | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/ebooks.py b/ebooks.py
index cbd8009..5dd841f 100644
--- a/ebooks.py
+++ b/ebooks.py
@@ -67,7 +67,9 @@ def scrape_page(src_url, web_context, web_attributes):
         soup = BeautifulSoup(page, 'html.parser')
         hits = soup.find_all(web_context[i], attrs=web_attributes[i])
         for hit in hits:
-            tweets.append(str(hit.text).strip())
+            tweet = str(hit.text).strip()
+            if len(tweet) >= 0:
+                tweets.append(tweet)
     return(tweets)
 
 def grab_tweets(api, max_id=None):
@@ -91,16 +93,17 @@ def grab_tweets(api, max_id=None):
 
     if guess == 0:
         api=connect()
+        source_tweets = []
         if STATIC_TEST==True:
             file = TEST_SOURCE
             print(">>> Generating from {0}".format(file))
             string_list = open(file).readlines()
             for item in string_list:
-                source_tweets = item.split(",")
+                source_tweets += item.split(",")
-        elif SCRAPE_URL==True:
-            source_tweets = scrape_page(SRC_URL, WEB_CONTEXT, WEB_ATTRIBUTES)
+        if SCRAPE_URL==True:
+            source_tweets += scrape_page(SRC_URL, WEB_CONTEXT, WEB_ATTRIBUTES)
-        else:
-            source_tweets = []
+        if len(SOURCE_ACCOUNTS[0]) > 0:
+            twitter_tweets = []
             for handle in SOURCE_ACCOUNTS:
                 user=handle
                 handle_stats = api.GetUser(screen_name=user)
@@ -111,12 +114,14 @@ def grab_tweets(api, max_id=None):
                 else:
                     my_range = 17
                 for x in range(my_range)[1:]:
-                    source_tweets_iter, max_id = grab_tweets(api,max_id)
-                    source_tweets += source_tweets_iter
-                print("{0} tweets found in {1}".format(len(source_tweets), handle))
-                if len(source_tweets) == 0:
+                    twitter_tweets_iter, max_id = grab_tweets(api,max_id)
+                    twitter_tweets += twitter_tweets_iter
+                print("{0} tweets found in {1}".format(len(twitter_tweets), handle))
+                if len(twitter_tweets) == 0:
                     print("Error fetching tweets from Twitter. Aborting.")
                     sys.exit()
+                else:
+                    source_tweets += twitter_tweets
     mine = markov.MarkovChainer(order)
     for tweet in source_tweets:
         if re.search('([\.\!\?\"\']$)', tweet):
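
One detail worth noting in the new hit filter (a self-contained illustration, not part of the patch): `len(tweet) >= 0` is always true, so empty strings still slip through; PATCH 07 later tightens the test to `if tweet:`.

```python
# What the loose test lets through versus the stricter truthiness test.
hits = ["A real headline", "", "   "]

kept_loose = [t.strip() for t in hits if len(t.strip()) >= 0]
kept_strict = [t.strip() for t in hits if t.strip()]

print(kept_loose)   # ['A real headline', '', ''] -- empty strings survive
print(kept_strict)  # ['A real headline']
```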
From 425732b7eccd89b280b5207cfccd4b919ee0a85e Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Tue, 7 Nov 2017 22:09:17 -0500
Subject: [PATCH 07/13] Implement code-cleanup (replaces #34)

---
 ebooks.py | 143 +++++++++++++++++++++++++++---------------------------
 markov.py |  24 ++++-----
 2 files changed, 84 insertions(+), 83 deletions(-)

diff --git a/ebooks.py b/ebooks.py
index 5dd841f..1d19baa 100644
--- a/ebooks.py
+++ b/ebooks.py
@@ -15,12 +15,13 @@
 chr = unichr
 from local_settings import *
 
+
 def connect():
-    api = twitter.Api(consumer_key=MY_CONSUMER_KEY,
-                      consumer_secret=MY_CONSUMER_SECRET,
-                      access_token_key=MY_ACCESS_TOKEN_KEY,
-                      access_token_secret=MY_ACCESS_TOKEN_SECRET)
-    return api
+    return twitter.Api(consumer_key=MY_CONSUMER_KEY,
+                       consumer_secret=MY_CONSUMER_SECRET,
+                       access_token_key=MY_ACCESS_TOKEN_KEY,
+                       access_token_secret=MY_ACCESS_TOKEN_SECRET)
+
 
 def entity(text):
     if text[:2] == "&#":
@@ -37,141 +38,139 @@ def entity(text):
         try:
             text = chr(numero)
         except KeyError:
-           pass
+            pass
     return text
 
+
 def filter_tweet(tweet):
-    tweet.text = re.sub(r'\b(RT|MT) .+','',tweet.text) #take out anything after RT or MT
-    tweet.text = re.sub(r'(\#|@|(h\/t)|(http))\S+','',tweet.text) #Take out URLs, hashtags, hts, etc.
-    tweet.text = re.sub(r'\n','', tweet.text) #take out new lines.
-    tweet.text = re.sub(r'\"|\(|\)', '', tweet.text) #take out quotes.
-    tweet.text = re.sub(r'\s+\(?(via|says)\s@\w+\)?', '', tweet.text) # remove attribution
-    htmlsents = re.findall(r'&\w+;', tweet.text)
-    if len(htmlsents) > 0 :
-        for item in htmlsents:
-            tweet.text = re.sub(item, entity(item), tweet.text)
-    tweet.text = re.sub(r'\xe9', 'e', tweet.text) #take out accented e
+    tweet.text = re.sub(r'\b(RT|MT) .+', '', tweet.text)  # take out anything after RT or MT
+    tweet.text = re.sub(r'(\#|@|(h\/t)|(http))\S+', '', tweet.text)  # Take out URLs, hashtags, hts, etc.
+    tweet.text = tweet.text.replace('\n', '')  # take out new lines.
+    tweet.text = re.sub(r'\"|\(|\)', '', tweet.text)  # take out quotes.
+    tweet.text = re.sub(r'\s+\(?(via|says)\s@\w+\)?', '', tweet.text)  # remove attribution
+    htmlsents = re.findall(r'&\w+;', tweet.text)
+    for item in htmlsents:
+        tweet.text = tweet.text.replace(item, entity(item))
+    tweet.text = tweet.text.replace('\xe9', 'e')  # take out accented e
     return tweet.text
-
+
+
 def scrape_page(src_url, web_context, web_attributes):
     tweets = []
     for i in range(len(src_url)):
         print(">>> Scraping {0}".format(src_url[i]))
-        try:
-            page = urlopen(src_url[i])
+        try:
+            page = urlopen(src_url[i])
         except Exception:
-            import traceback
-            print(">>> Error scraping {0}:".format(src_url[i]))
-            print(traceback.format_exc())
-            continue
+            import traceback
+            print(">>> Error scraping {0}:".format(src_url[i]))
+            print(traceback.format_exc())
+            continue
         soup = BeautifulSoup(page, 'html.parser')
         hits = soup.find_all(web_context[i], attrs=web_attributes[i])
         for hit in hits:
             tweet = str(hit.text).strip()
-            if len(tweet) >= 0:
+            if tweet:
                 tweets.append(tweet)
     return(tweets)
-
+
+
 def grab_tweets(api, max_id=None):
-    source_tweets=[]
+    source_tweets = []
     user_tweets = api.GetUserTimeline(screen_name=user, count=200, max_id=max_id, include_rts=True, trim_user=True, exclude_replies=True)
-    max_id = user_tweets[len(user_tweets)-1].id-1
+    max_id = user_tweets[-1].id - 1
     for tweet in user_tweets:
         tweet.text = filter_tweet(tweet)
         if re.search(SOURCE_EXCLUDE, tweet.text):
             continue
-        if len(tweet.text) != 0:
+        if tweet.text:
             source_tweets.append(tweet.text)
     return source_tweets, max_id
 
-if __name__=="__main__":
+
+if __name__ == "__main__":
     order = ORDER
-    if DEBUG==False:
-        guess = random.choice(range(ODDS))
-    else:
-        guess = 0
+    guess = 0
+    if ODDS and not DEBUG:
+        guess = random.randint(0, ODDS - 1)
 
-    if guess == 0:
-        api=connect()
+    if guess:
+        print(str(guess) + " No, sorry, not this time.")  # message if the random number fails.
+        sys.exit()
+    else:
+        api = connect()
         source_tweets = []
-        if STATIC_TEST==True:
+        if STATIC_TEST:
             file = TEST_SOURCE
             print(">>> Generating from {0}".format(file))
             string_list = open(file).readlines()
             for item in string_list:
                 source_tweets += item.split(",")
-        if SCRAPE_URL==True:
+        if SCRAPE_URL:
             source_tweets += scrape_page(SRC_URL, WEB_CONTEXT, WEB_ATTRIBUTES)
         if len(SOURCE_ACCOUNTS[0]) > 0:
-            twitter_tweets = []
+            twitter_tweets = []
             for handle in SOURCE_ACCOUNTS:
-                user=handle
+                user = handle
                 handle_stats = api.GetUser(screen_name=user)
                 status_count = handle_stats.statuses_count
-                max_id=None
-                if status_count<3200:
-                    my_range = int((status_count/200) + 1)
-                else:
-                    my_range = 17
-                for x in range(my_range)[1:]:
-                    twitter_tweets_iter, max_id = grab_tweets(api,max_id)
+                max_id = None
+                my_range = min(17, int((status_count/200) + 1))
+                for x in range(1, my_range):
+                    twitter_tweets_iter, max_id = grab_tweets(api, max_id)
                     twitter_tweets += twitter_tweets_iter
                 print("{0} tweets found in {1}".format(len(twitter_tweets), handle))
-                if len(twitter_tweets) == 0:
+                if not twitter_tweets:
                     print("Error fetching tweets from Twitter. Aborting.")
                     sys.exit()
                 else:
                     source_tweets += twitter_tweets
         mine = markov.MarkovChainer(order)
         for tweet in source_tweets:
-            if re.search('([\.\!\?\"\']$)', tweet):
-                pass
-            else:
-                tweet+="."
+            if not re.search('([\.\!\?\"\']$)', tweet):
+                tweet += "."
             mine.add_text(tweet)
-
-        for x in range(0,10):
+
+        for x in range(0, 10):
             ebook_tweet = mine.generate_sentence()
 
-            #randomly drop the last word, as Horse_ebooks appears to do.
-            if random.randint(0,4) == 0 and re.search(r'(in|to|from|for|with|by|our|of|your|around|under|beyond)\s\w+$', ebook_tweet) != None:
-                print("Losing last word randomly")
-                ebook_tweet = re.sub(r'\s\w+.$','',ebook_tweet)
-                print(ebook_tweet)
-
-            #if a tweet is very short, this will randomly add a second sentence to it.
-            if ebook_tweet != None and len(ebook_tweet) < 40:
-                rando = random.randint(0,10)
-                if rando == 0 or rando == 7:
+            # randomly drop the last word, as Horse_ebooks appears to do.
+            if random.randint(0, 4) == 0 and re.search(r'(in|to|from|for|with|by|our|of|your|around|under|beyond)\s\w+$', ebook_tweet) is not None:
+                print("Losing last word randomly")
+                ebook_tweet = re.sub(r'\s\w+.$', '', ebook_tweet)
+                print(ebook_tweet)
+
+            # if a tweet is very short, this will randomly add a second sentence to it.
+            if ebook_tweet is not None and len(ebook_tweet) < 40:
+                rando = random.randint(0, 10)
+                if rando == 0 or rando == 7:
                     print("Short tweet. Adding another sentence randomly")
                     newer_tweet = mine.generate_sentence()
-                    if newer_tweet != None:
+                    if newer_tweet is not None:
                         ebook_tweet += " " + mine.generate_sentence()
                     else:
                         ebook_tweet = ebook_tweet
                 elif rando == 1:
-                    #say something crazy/prophetic in all caps
+                    # say something crazy/prophetic in all caps
                     print("ALL THE THINGS")
                     ebook_tweet = ebook_tweet.upper()
 
-            #throw out tweets that match anything from the source account.
-            if ebook_tweet != None and len(ebook_tweet) < 110:
+            # throw out tweets that match anything from the source account.
+            if ebook_tweet is not None and len(ebook_tweet) < 110:
                 for tweet in source_tweets:
                     if ebook_tweet[:-1] not in tweet:
                         continue
-                    else:
+                    else:
                         print("TOO SIMILAR: " + ebook_tweet)
                         sys.exit()
-
-                if DEBUG == False:
+
+                if not DEBUG:
                     status = api.PostUpdate(ebook_tweet)
                     print(status.text.encode('utf-8'))
                 else:
                     print(ebook_tweet)
 
-            elif ebook_tweet == None:
+            elif not ebook_tweet:
                 print("Tweet is empty, sorry.")
             else:
                 print("TOO LONG: " + ebook_tweet)
-    else:
-        print(str(guess) + " No, sorry, not this time.") #message if the random number fails.
diff --git a/markov.py b/markov.py
index b9f78a0..9a71556 100644
--- a/markov.py
+++ b/markov.py
@@ -1,23 +1,24 @@
 import random
 import re
 
+
 class MarkovChainer(object):
     def __init__(self, order):
-        self.order=order
+        self.order = order
         self.beginnings = []
         self.freq = {}
 
-    #pass a string with a terminator to the function to add it to the markov lists.
+    # pass a string with a terminator to the function to add it to the markov lists.
     def add_sentence(self, string, terminator):
         data = "".join(string)
         words = data.split()
         buf = []
         if len(words) > self.order:
             words.append(terminator)
-            self.beginnings.append(words[0:self.order])
+            self.beginnings.append(words[0:self.order])
         else:
             pass
-
+
         for word in words:
             buf.append(word)
             if len(buf) == self.order + 1:
@@ -44,21 +45,21 @@ def add_text(self, text):
             else:
                 sentence = piece
 
-    #Generate the goofy sentences that become your tweet.
+    # Generate the goofy sentences that become your tweet.
     def generate_sentence(self):
         res = random.choice(self.beginnings)
         res = res[:]
-        if len(res)==self.order:
+        if len(res) == self.order:
             nw = True
-            while nw != None:
+            while nw is not None:
                 restup = (res[-2], res[-1])
                 try:
                     nw = self.next_word_for(restup)
-                    if nw != None:
+                    if nw is not None:
                         res.append(nw)
                     else:
                         continue
-                except:
+                except Exception:
                     nw = False
             new_res = res[0:-2]
             if new_res[0].istitle() or new_res[0].isupper():
@@ -79,8 +80,9 @@ def next_word_for(self, words):
             arr = self.freq[words]
             next_words = random.choice(arr)
             return next_words
-        except:
-            return None
+        except Exception:
+            return None
+
 
 if __name__ == "__main__":
     print("Try running ebooks.py first")
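
For readers who want to poke at the cleaned-up `markov.py` directly, a minimal usage sketch (assuming the default `ORDER = 2`; the corpus is made up and the output varies from run to run):

```python
import markov

chain = markov.MarkovChainer(2)
for sentence in ["The bot scrapes pages.",
                 "The bot posts strange tweets.",
                 "Pages become strange tweets."]:
    chain.add_text(sentence)

print(chain.generate_sentence())  # e.g. "The bot posts strange tweets."
```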
From f091a59ac13d9116173b9fcf4ebfa49c644a032c Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Wed, 8 Nov 2017 09:04:37 -0500
Subject: [PATCH 08/13] Linting for local_settings.py too

---
 local_settings_example.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/local_settings_example.py b/local_settings_example.py
index e5e83b9..2062009 100644
--- a/local_settings_example.py
+++ b/local_settings_example.py
@@ -2,21 +2,21 @@
 Local Settings for a heroku_ebooks account. #fill in the name of the account you're tweeting from here.
 '''
 
-#configuration
+# Configuration
 MY_CONSUMER_KEY = 'Your Twitter API Consumer Key'
 MY_CONSUMER_SECRET = 'Your Consumer Secret Key'
 MY_ACCESS_TOKEN_KEY = 'Your Twitter API Access Token Key'
 MY_ACCESS_TOKEN_SECRET = 'Your Access Token Secret'
 
-SOURCE_ACCOUNTS = [""] #A list of comma-separated, quote-enclosed Twitter handles of account that you'll generate tweets based on. It should look like ["account1", "account2"]. If you want just one account, no comma needed.
-ODDS = 8 #How often do you want this to run? 1/8 times?
-ORDER = 2 #how closely do you want this to hew to sensical? 2 is low and 4 is high.
-SOURCE_EXCLUDE = r'^$' #Source tweets that match this regexp will not be added to the Markov chain. You might want to filter out inappropriate words for example.
-DEBUG = True #Set this to False to start Tweeting live
-STATIC_TEST = False #Set this to True if you want to test Markov generation from a static file instead of the API.
-TEST_SOURCE = ".txt" #The name of a text file of a string-ified list for testing. To avoid unnecessarily hitting Twitter API. You can use the included testcorpus.txt, if needed.
-SCRAPE_URL = False #Set this to true to scrape a webpage.
-SRC_URL = ['http://www.example.com/one', 'https://www.example.com/two'] #A comma-separated list of URLs to scrape
-WEB_CONTEXT = ['span', 'h2'] #A comma-separated list of the tag or object to search for in each page above.
-WEB_ATTRIBUTES = [{'class': 'example-text'}, {}] #A list of dictionaries containing the attributes for each page.
-TWEET_ACCOUNT = "" #The name of the account you're tweeting to.
+SOURCE_ACCOUNTS = [""] # A list of comma-separated, quote-enclosed Twitter handles of accounts that you'll generate tweets based on. It should look like ["account1", "account2"]. If you want just one account, no comma needed.
+ODDS = 8 # How often do you want this to run? 1/8 times?
+ORDER = 2 # How closely do you want this to hew to sensical? 2 is low and 4 is high.
+SOURCE_EXCLUDE = r'^$' # Source tweets that match this regexp will not be added to the Markov chain. You might want to filter out inappropriate words for example.
+DEBUG = True # Set this to False to start Tweeting live
+STATIC_TEST = False #S et this to True if you want to test Markov generation from a static file instead of the API.
+TEST_SOURCE = ".txt" # The name of a text file of a string-ified list for testing. To avoid unnecessarily hitting Twitter API. You can use the included testcorpus.txt, if needed.
+SCRAPE_URL = False # Set this to true to scrape a webpage.
+SRC_URL = ['http://www.example.com/one', 'https://www.example.com/two'] # A comma-separated list of URLs to scrape
+WEB_CONTEXT = ['span', 'h2'] # A comma-separated list of the tag or object to search for in each page above.
+WEB_ATTRIBUTES = [{'class': 'example-text'}, {}] # A list of dictionaries containing the attributes for each page.
+TWEET_ACCOUNT = "" # The name of the account you're tweeting to.

From 48158a604233473e9df133d55aa80424f37831a6 Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Fri, 10 Nov 2017 19:34:57 -0500
Subject: [PATCH 09/13] Document scraping, make last couple of tweaks.

---
 README.md | 22 ++++++++++++++++++++--
 ebooks.py | 21 ++++++++++++---------
 markov.py |  2 +-
 3 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index ac741d7..23795c1 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ This is a basic Python port of [@harrisj's](https://twitter.com/harrisj) [iron_e
 
 ## Configuring
 
-There are several parameters that control the behavior of the bot. You can adjust them by setting them in your `local_settings.py` file. 
+There are several parameters that control the behavior of the bot. You can adjust them by setting them in your `local_settings.py` file.
 
 ```
 ODDS = 8
@@ -40,6 +40,24 @@
 ORDER = 2
 ```
 
 The ORDER variable represents the Markov index, which is a measure of associativity in the generated Markov chains. 2 is generally more incoherent and 3 or 4 is more lucid. I tend to stick with 2.
 
+### Additional sources
+
+This bot was originally designed to pull tweets from a Twitter account; however, it can also process comma-separated text in a text file, or scrape content from the web.
+
+If you wish to use _only_ a textfile or a web resource, make sure that `SOURCE_ACCOUNTS` in your `local_settings.py` file is exactly `[""]`.
+
+#### Static Text
+To use a local text file, set `STATIC_TEST = True` and specify the name of a text file containing comma-separated "tweets" as `TEST_SOURCE`.
+
+#### Web Content
+To scrape content from the web, set `SCRAPE_URL` to `True`. This bot makes use of the [`find_all()` method](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find-all) of Python's BeautifulSoup library. The implementation of this method requires the definition of three inputs in `local_settings.py`.
+
+1. A list of URLs to scrape as `SRC_URL`.
+2. A list, `WEB_CONTEXT`, of the [names](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#id11) of the elements to extract from the corresponding URL. This can be "div", "h1" for level-one headings, "a" for links, etc. If you wish to search for more than one name for a single page, repeat the URL in the `SRC_URL` list for as many names as you wish to extract.
+3. A list, `WEB_ATTRIBUTES`, of dictionaries containing [attributes](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#attrs) to filter by. For instance, to limit the search to divs of class "title", one would pass the dictionary: `{"class": "title"}`. Use an empty dictionary, `{}`, for any page and name for which you don't wish to specify attributes.
+
+__Note:__ Web scraping is experimental and may give you unexpected results. Make sure to test the bot in debugging mode before publishing.
+
 ## Debugging
 
 If you want to test the script or to debug the tweet generation, you can skip the random number generation and not publish the resulting tweets to Twitter.
 
 First, adjust the `DEBUG` variable in `local_settings.py`.
 
 ```
-DEBUG = True 
+DEBUG = True
 ```
 
 After that, commit the change and `git push heroku master`. Then run the command `heroku run worker` on the command line and watch what happens.
diff --git a/ebooks.py b/ebooks.py
index 1d19baa..6d4c235 100644
--- a/ebooks.py
+++ b/ebooks.py
@@ -57,16 +57,19 @@ def filter_tweet(tweet):
 
 def scrape_page(src_url, web_context, web_attributes):
     tweets = []
+    last_url = ""
     for i in range(len(src_url)):
-        print(">>> Scraping {0}".format(src_url[i]))
-        try:
-            page = urlopen(src_url[i])
-        except Exception:
-            import traceback
-            print(">>> Error scraping {0}:".format(src_url[i]))
-            print(traceback.format_exc())
-            continue
-        soup = BeautifulSoup(page, 'html.parser')
+        if src_url[i] != last_url:
+            last_url = src_url[i]
+            print(">>> Scraping {0}".format(src_url[i]))
+            try:
+                page = urlopen(src_url[i])
+            except Exception:
+                import traceback
+                print(">>> Error scraping {0}:".format(src_url[i]))
+                print(traceback.format_exc())
+                continue
+            soup = BeautifulSoup(page, 'html.parser')
         hits = soup.find_all(web_context[i], attrs=web_attributes[i])
         for hit in hits:
             tweet = str(hit.text).strip()
diff --git a/markov.py b/markov.py
index 9a71556..915d14a 100644
--- a/markov.py
+++ b/markov.py
@@ -69,7 +69,7 @@ def generate_sentence(self):
             sentence = ""
             for word in new_res:
                 sentence += word + " "
-            sentence += res[-2] + res[-1]
+            sentence += res[-2] + " " + res[-1]
         else:
             sentence = None
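
One way to read the scraping settings the new README section describes (placeholder values only): repeating a URL lets you pull two different element types from the same page, and the `last_url` check added above means that page is only downloaded once.

```python
SRC_URL = ['https://www.example.com/news', 'https://www.example.com/news']
WEB_CONTEXT = ['h1', 'div']
WEB_ATTRIBUTES = [{}, {'class': 'title'}]
# Entry 0 collects every <h1> on the page; entry 1 collects only <div class="title">.
# Because entry 1 repeats the URL, scrape_page() reuses the soup it already parsed
# instead of calling urlopen() a second time.
```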
From 58ac2006d498bf62d296b18800e04f9e1b4a0f0b Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Fri, 10 Nov 2017 19:40:44 -0500
Subject: [PATCH 10/13] Re-try download for edge case where download fails and
 next URL is the same.

---
 ebooks.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ebooks.py b/ebooks.py
index 6d4c235..a168e20 100644
--- a/ebooks.py
+++ b/ebooks.py
@@ -65,6 +65,7 @@ def scrape_page(src_url, web_context, web_attributes):
             try:
                 page = urlopen(src_url[i])
             except Exception:
+                last_url = "ERROR"
                 import traceback
                 print(">>> Error scraping {0}:".format(src_url[i]))
                 print(traceback.format_exc())

From a00b2796b55ae459fe3c70a654fc3429735ffdb8 Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Fri, 10 Nov 2017 20:03:39 -0500
Subject: [PATCH 11/13] One misplaced space!

---
 local_settings_example.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/local_settings_example.py b/local_settings_example.py
index 2062009..2b39cce 100644
--- a/local_settings_example.py
+++ b/local_settings_example.py
@@ -13,7 +13,7 @@
 ORDER = 2 # How closely do you want this to hew to sensical? 2 is low and 4 is high.
 SOURCE_EXCLUDE = r'^$' # Source tweets that match this regexp will not be added to the Markov chain. You might want to filter out inappropriate words for example.
 DEBUG = True # Set this to False to start Tweeting live
-STATIC_TEST = False #S et this to True if you want to test Markov generation from a static file instead of the API.
+STATIC_TEST = False # Set this to True if you want to test Markov generation from a static file instead of the API.
 TEST_SOURCE = ".txt" # The name of a text file of a string-ified list for testing. To avoid unnecessarily hitting Twitter API. You can use the included testcorpus.txt, if needed.
 SCRAPE_URL = False # Set this to true to scrape a webpage.
 SRC_URL = ['http://www.example.com/one', 'https://www.example.com/two'] # A comma-separated list of URLs to scrape

From c124288e281e77f0d40a3bb0056ded656ccf8791 Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Sat, 11 Nov 2017 11:23:20 -0500
Subject: [PATCH 12/13] Responses to review.

---
 ebooks.py | 25 ++++++++++++++++++-------
 markov.py |  2 +-
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/ebooks.py b/ebooks.py
index a168e20..41b5255 100644
--- a/ebooks.py
+++ b/ebooks.py
@@ -45,13 +45,13 @@ def entity(text):
 def filter_tweet(tweet):
     tweet.text = re.sub(r'\b(RT|MT) .+', '', tweet.text)  # take out anything after RT or MT
     tweet.text = re.sub(r'(\#|@|(h\/t)|(http))\S+', '', tweet.text)  # Take out URLs, hashtags, hts, etc.
-    tweet.text = tweet.text.replace('\n', '')  # take out new lines.
+    tweet.text = re.sub('\s+', ' ', tweet.text)  # collapse consecutive whitespace to single spaces.
     tweet.text = re.sub(r'\"|\(|\)', '', tweet.text)  # take out quotes.
     tweet.text = re.sub(r'\s+\(?(via|says)\s@\w+\)?', '', tweet.text)  # remove attribution
     htmlsents = re.findall(r'&\w+;', tweet.text)
     for item in htmlsents:
         tweet.text = tweet.text.replace(item, entity(item))
-    tweet.text = tweet.text.replace('\xe9', 'e')  # take out accented e
+    tweet.text = re.sub(r'\xe9', 'e', tweet.text)  # take out accented e
     return tweet.text
 
 
@@ -72,10 +72,21 @@ def scrape_page(src_url, web_context, web_attributes):
                 continue
             soup = BeautifulSoup(page, 'html.parser')
         hits = soup.find_all(web_context[i], attrs=web_attributes[i])
-        for hit in hits:
-            tweet = str(hit.text).strip()
-            if tweet:
-                tweets.append(tweet)
+        if not hits:
+            print(">>> No results found!")
+            continue
+        else:
+            errors = 0
+            for hit in hits:
+                try:
+                    tweet = str(hit.text).strip()
+                except (UnicodeEncodeError, UnicodeDecodeError):
+                    errors += 1
+                    continue
+                if tweet:
+                    tweets.append(tweet)
+            if errors > 0:
+                print(">>> We had trouble reading {} result{}.".format(errors, "s" if errors > 1 else ""))
     return(tweets)
 
 
@@ -112,7 +123,7 @@ def grab_tweets(api, max_id=None):
             source_tweets += scrape_page(SRC_URL, WEB_CONTEXT, WEB_ATTRIBUTES)
-        if len(SOURCE_ACCOUNTS[0]) > 0:
+        if SOURCE_ACCOUNTS and len(SOURCE_ACCOUNTS[0]) > 0:
             twitter_tweets = []
             for handle in SOURCE_ACCOUNTS:
                 user = handle
diff --git a/markov.py b/markov.py
index 915d14a..6ff8530 100644
--- a/markov.py
+++ b/markov.py
@@ -69,7 +69,7 @@ def generate_sentence(self):
             sentence = ""
             for word in new_res:
                 sentence += word + " "
-            sentence += res[-2] + " " + res[-1]
+            sentence += res[-2] + ("" if res[-1] in ".!?;:" else " ") + res[-1]
         else:
             sentence = None
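
A tiny worked example of the sentence-join change to `markov.py` above (made-up tokens): when the chain ends in punctuation, the terminator should attach to the last word rather than float after a space.

```python
res = ["horses", "know", "things", "."]
new_res = res[0:-2]

sentence = ""
for word in new_res:
    sentence += word + " "
sentence += res[-2] + ("" if res[-1] in ".!?;:" else " ") + res[-1]

print(sentence)  # "horses know things."  (the previous join produced "horses know things .")
```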
-If you wish to use _only_ a textfile or a web resource, make sure that `SOURCE_ACCOUNTS` in your `local_settings.py` file is exactly `[""]`. - #### Static Text To use a local text file, set `STATIC_TEST = True` and specify the name of a text file containing comma-separated "tweets" as `TEST_SOURCE`.