From 70c5a5c910eca6de36cf314afa8128bdfe172c9b Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Sun, 5 Nov 2017 21:59:24 -0500
Subject: [PATCH 01/13] Add rudimentary web scraping

---
 ebooks.py                 | 20 ++++++++++++++++----
 local_settings_example.py |  3 +++
 requirements.txt          |  3 ++-
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/ebooks.py b/ebooks.py
index 6e99c4c..a859511 100644
--- a/ebooks.py
+++ b/ebooks.py
@@ -3,6 +3,8 @@
 import sys
 import twitter
 import markov
+from bs4 import BeautifulSoup
+from urllib.request import urlopen
 try:
     # Python 3
     from html.entities import name2codepoint as n2c
@@ -50,7 +52,15 @@ def filter_tweet(tweet):
     tweet.text = re.sub(r'\xe9', 'e', tweet.text) #take out accented e
     return tweet.text
-
+def scrape_page(src_url, span_name):
+    print(">>> Generating from {0}".format(src_url))
+    page = urlopen(src_url)
+    soup = BeautifulSoup(page, 'html.parser')
+    spans = soup.find_all('span', attrs={'class': span_name})
+    titles = []
+    for span in spans:
+        titles.append(str(span.string))
+    return(titles)
 
 def grab_tweets(api, max_id=None):
     source_tweets=[]
@@ -72,22 +82,24 @@ def grab_tweets(api, max_id=None):
         guess = 0
 
     if guess == 0:
+        api=connect()
         if STATIC_TEST==True:
             file = TEST_SOURCE
             print(">>> Generating from {0}".format(file))
             string_list = open(file).readlines()
             for item in string_list:
-                source_tweets = item.split(",")
+                source_tweets = item.split(",")
+        elif SCRAPE_URL==True:
+            source_tweets = scrape_page(SRC_URL, SPAN_NAME)
         else:
             source_tweets = []
             for handle in SOURCE_ACCOUNTS:
                 user=handle
-                api=connect()
                 handle_stats = api.GetUser(screen_name=user)
                 status_count = handle_stats.statuses_count
                 max_id=None
                 if status_count<3200:
-                    my_range = (status_count/200) + 1
+                    my_range = int((status_count/200) + 1)
                 else:
                     my_range = 17
                 for x in range(my_range)[1:]:
diff --git a/local_settings_example.py b/local_settings_example.py
index 81e5945..d851a35 100644
--- a/local_settings_example.py
+++ b/local_settings_example.py
@@ -15,4 +15,7 @@
 DEBUG = True #Set this to False to start Tweeting live
 STATIC_TEST = False #Set this to True if you want to test Markov generation from a static file instead of the API.
 TEST_SOURCE = ".txt" #The name of a text file of a string-ified list for testing. To avoid unnecessarily hitting Twitter API. You can use the included testcorpus.txt, if needed.
+SCRAPE_URL = False #Set this to true to scrape a webpage.
+SRC_URL = '' #The URL to scrape
+SPAN_NAME = '' #The class of the <span> that contains the words you are looking for, e.g. "title"
 TWEET_ACCOUNT = "" #The name of the account you're tweeting to.
diff --git a/requirements.txt b/requirements.txt
index 4658fe9..be748d8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
-python-twitter
\ No newline at end of file
+python-twitter
+beautifulsoup4
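
As a reading aid (not part of the patch itself), here is a self-contained sketch of the approach the new `scrape_page()` takes: parse a page with BeautifulSoup and collect the text of every `<span>` of a given class. The HTML below is made up, and the class name `title` stands in for whatever `SPAN_NAME` would hold.

```python
from bs4 import BeautifulSoup

# A stand-in for a fetched page; a real run would get this from urlopen(SRC_URL).
html = """
<html><body>
  <span class="title">First headline</span>
  <span class="title">Second headline</span>
  <span class="byline">Ignored: wrong class</span>
</body></html>
"""

soup = BeautifulSoup(html, 'html.parser')
spans = soup.find_all('span', attrs={'class': 'title'})
titles = [str(span.string) for span in spans]
print(titles)  # ['First headline', 'Second headline']
```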
From c669302006163f12ee148a9748325f86797dc6d9 Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Sun, 5 Nov 2017 22:02:55 -0500
Subject: [PATCH 02/13] Update ebooks.py

---
 ebooks.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ebooks.py b/ebooks.py
index a859511..16de503 100644
--- a/ebooks.py
+++ b/ebooks.py
@@ -4,13 +4,14 @@
 import twitter
 import markov
 from bs4 import BeautifulSoup
-from urllib.request import urlopen
 try:
     # Python 3
     from html.entities import name2codepoint as n2c
+    from urllib.request import urlopen
 except ImportError:
     # Python 2
     from htmlentitydefs import name2codepoint as n2c
+    from urllib2 import urlopen
     chr = unichr
 from local_settings import *

From 40ff672296a3e4eac532a7863c6ea6710bbf9518 Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Tue, 7 Nov 2017 10:25:33 -0500
Subject: [PATCH 03/13] Make web scraping more flexible

---
 ebooks.py                 | 21 +++++++++++----------
 local_settings_example.py |  5 +++--
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/ebooks.py b/ebooks.py
index a859511..df695b0 100644
--- a/ebooks.py
+++ b/ebooks.py
@@ -52,15 +52,16 @@ def filter_tweet(tweet):
     tweet.text = re.sub(r'\xe9', 'e', tweet.text) #take out accented e
     return tweet.text
 
-def scrape_page(src_url, span_name):
-    print(">>> Generating from {0}".format(src_url))
-    page = urlopen(src_url)
-    soup = BeautifulSoup(page, 'html.parser')
-    spans = soup.find_all('span', attrs={'class': span_name})
-    titles = []
-    for span in spans:
-        titles.append(str(span.string))
-    return(titles)
+def scrape_page(src_url, web_context, web_attributes):
+    tweets = []
+    for i in range(len(src_url)):
+        print(">>> Scraping {0}".format(src_url[i]))
+        page = urlopen(src_url[i])
+        soup = BeautifulSoup(page, 'html.parser')
+        hits = soup.find_all(web_context[i], attrs=web_attributes[i])
+        for hit in hits:
+            tweets.append(str(hit.text).strip())
+    return(tweets)
 
 def grab_tweets(api, max_id=None):
     source_tweets=[]
@@ -90,7 +91,7 @@ def grab_tweets(api, max_id=None):
         for item in string_list:
             source_tweets = item.split(",")
     elif SCRAPE_URL==True:
-        source_tweets = scrape_page(SRC_URL, SPAN_NAME)
+        source_tweets = scrape_page(SRC_URL, WEB_CONTEXT, WEB_ATTRIBUTES)
     else:
         source_tweets = []
         for handle in SOURCE_ACCOUNTS:
diff --git a/local_settings_example.py b/local_settings_example.py
index d851a35..26f41c9 100644
--- a/local_settings_example.py
+++ b/local_settings_example.py
@@ -16,6 +16,7 @@
 STATIC_TEST = False #Set this to True if you want to test Markov generation from a static file instead of the API.
 TEST_SOURCE = ".txt" #The name of a text file of a string-ified list for testing. To avoid unnecessarily hitting Twitter API. You can use the included testcorpus.txt, if needed.
 SCRAPE_URL = False #Set this to true to scrape a webpage.
-SRC_URL = '' #The URL to scrape
-SPAN_NAME = '' #The class of the <span> that contains the words you are looking for, e.g. "title"
+SRC_URL =['http://www.example.com/one', 'https://www.example.com/two'] #A comma-separated list of URLs to scrape
+WEB_CONTEXT = ['span', 'h2'] #A comma-separated list of the tag or object to search for in each page above.
+WEB_ATTRIBUTES = [{'class': 'example-text'}, {}] #A list of dictionaries containing the attributes for each page.
 TWEET_ACCOUNT = "" #The name of the account you're tweeting to.
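
To make the new settings concrete (an illustrative `local_settings.py` fragment, not part of the patch): the three lists are parallel, so entry i of `WEB_CONTEXT` and `WEB_ATTRIBUTES` is applied to entry i of `SRC_URL`. The URLs are placeholders.

```python
SCRAPE_URL = True
SRC_URL = ['http://www.example.com/one', 'https://www.example.com/two']
WEB_CONTEXT = ['span', 'h2']                      # <span> tags on page one, <h2> tags on page two
WEB_ATTRIBUTES = [{'class': 'example-text'}, {}]  # filter page one by class; no filter on page two

# scrape_page(SRC_URL, WEB_CONTEXT, WEB_ATTRIBUTES) then runs, per page:
#   soup.find_all('span', attrs={'class': 'example-text'})   # page one
#   soup.find_all('h2', attrs={})                            # page two
# and appends each hit's stripped text to the list of candidate tweets.
```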
From 35c8ee29768e957a0940e9fb02ffb8cc3723556c Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Tue, 7 Nov 2017 10:28:09 -0500
Subject: [PATCH 04/13] Update local_settings_example.py

---
 local_settings_example.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/local_settings_example.py b/local_settings_example.py
index 26f41c9..e5e83b9 100644
--- a/local_settings_example.py
+++ b/local_settings_example.py
@@ -16,7 +16,7 @@
 STATIC_TEST = False #Set this to True if you want to test Markov generation from a static file instead of the API.
 TEST_SOURCE = ".txt" #The name of a text file of a string-ified list for testing. To avoid unnecessarily hitting Twitter API. You can use the included testcorpus.txt, if needed.
 SCRAPE_URL = False #Set this to true to scrape a webpage.
-SRC_URL =['http://www.example.com/one', 'https://www.example.com/two'] #A comma-separated list of URLs to scrape
+SRC_URL = ['http://www.example.com/one', 'https://www.example.com/two'] #A comma-separated list of URLs to scrape
 WEB_CONTEXT = ['span', 'h2'] #A comma-separated list of the tag or object to search for in each page above.
 WEB_ATTRIBUTES = [{'class': 'example-text'}, {}] #A list of dictionaries containing the attributes for each page.
 TWEET_ACCOUNT = "" #The name of the account you're tweeting to.

From d32e3171bd9fd32a459dc465bf409444b8535cb3 Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Tue, 7 Nov 2017 11:47:18 -0500
Subject: [PATCH 05/13] Add minimal error handling.

---
 ebooks.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/ebooks.py b/ebooks.py
index ad40c2e..cbd8009 100644
--- a/ebooks.py
+++ b/ebooks.py
@@ -57,7 +57,13 @@ def scrape_page(src_url, web_context, web_attributes):
     tweets = []
     for i in range(len(src_url)):
         print(">>> Scraping {0}".format(src_url[i]))
-        page = urlopen(src_url[i])
+        try:
+            page = urlopen(src_url[i])
+        except Exception:
+            import traceback
+            print(">>> Error scraping {0}:".format(src_url[i]))
+            print(traceback.format_exc())
+            continue
         soup = BeautifulSoup(page, 'html.parser')
         hits = soup.find_all(web_context[i], attrs=web_attributes[i])
         for hit in hits:

From a9b0988844bcd9821632d66dbb03f26b2f2c2daa Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Tue, 7 Nov 2017 21:34:57 -0500
Subject: [PATCH 06/13] Allow all three sources at once

---
 ebooks.py | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/ebooks.py b/ebooks.py
index cbd8009..5dd841f 100644
--- a/ebooks.py
+++ b/ebooks.py
@@ -67,7 +67,9 @@ def scrape_page(src_url, web_context, web_attributes):
         soup = BeautifulSoup(page, 'html.parser')
         hits = soup.find_all(web_context[i], attrs=web_attributes[i])
         for hit in hits:
-            tweets.append(str(hit.text).strip())
+            tweet = str(hit.text).strip()
+            if len(tweet) >= 0:
+                tweets.append(tweet)
     return(tweets)
 
 def grab_tweets(api, max_id=None):
@@ -91,16 +93,17 @@ def grab_tweets(api, max_id=None):
 
     if guess == 0:
         api=connect()
+        source_tweets = []
         if STATIC_TEST==True:
             file = TEST_SOURCE
             print(">>> Generating from {0}".format(file))
             string_list = open(file).readlines()
             for item in string_list:
-                source_tweets = item.split(",")
+                source_tweets += item.split(",")
-        elif SCRAPE_URL==True:
-            source_tweets = scrape_page(SRC_URL, WEB_CONTEXT, WEB_ATTRIBUTES)
+        if SCRAPE_URL==True:
+            source_tweets += scrape_page(SRC_URL, WEB_CONTEXT, WEB_ATTRIBUTES)
-        else:
-            source_tweets = []
+        if len(SOURCE_ACCOUNTS[0]) > 0:
+            twitter_tweets = []
             for handle in SOURCE_ACCOUNTS:
                 user=handle
                 handle_stats = api.GetUser(screen_name=user)
@@ -111,12 +114,14 @@ def grab_tweets(api, max_id=None):
                 else:
                     my_range = 17
                 for x in range(my_range)[1:]:
-                    source_tweets_iter, max_id = grab_tweets(api,max_id)
-                    source_tweets += source_tweets_iter
-                print("{0} tweets found in {1}".format(len(source_tweets), handle))
-                if len(source_tweets) == 0:
+                    twitter_tweets_iter, max_id = grab_tweets(api,max_id)
+                    twitter_tweets += twitter_tweets_iter
+                print("{0} tweets found in {1}".format(len(twitter_tweets), handle))
+                if len(twitter_tweets) == 0:
                     print("Error fetching tweets from Twitter. Aborting.")
                     sys.exit()
+                else:
+                    source_tweets += twitter_tweets
     mine = markov.MarkovChainer(order)
     for tweet in source_tweets:
         if re.search('([\.\!\?\"\']$)', tweet):
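
One detail worth noting in the new hit filter (a self-contained illustration, not part of the patch): `len(tweet) >= 0` is always true, so empty strings still slip through; PATCH 07 later tightens the test to `if tweet:`.

```python
# What the loose test lets through versus the stricter truthiness test.
hits = ["A real headline", "", "   "]

kept_loose = [t.strip() for t in hits if len(t.strip()) >= 0]
kept_strict = [t.strip() for t in hits if t.strip()]

print(kept_loose)   # ['A real headline', '', ''] -- empty strings survive
print(kept_strict)  # ['A real headline']
```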
From 425732b7eccd89b280b5207cfccd4b919ee0a85e Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Tue, 7 Nov 2017 22:09:17 -0500
Subject: [PATCH 07/13] Implement code-cleanup (replaces #34)

---
 ebooks.py | 143 +++++++++++++++++++++++++++---------------------------
 markov.py |  24 ++++-----
 2 files changed, 84 insertions(+), 83 deletions(-)

diff --git a/ebooks.py b/ebooks.py
index 5dd841f..1d19baa 100644
--- a/ebooks.py
+++ b/ebooks.py
@@ -15,12 +15,13 @@
 chr = unichr
 from local_settings import *
 
+
 def connect():
-    api = twitter.Api(consumer_key=MY_CONSUMER_KEY,
-                      consumer_secret=MY_CONSUMER_SECRET,
-                      access_token_key=MY_ACCESS_TOKEN_KEY,
-                      access_token_secret=MY_ACCESS_TOKEN_SECRET)
-    return api
+    return twitter.Api(consumer_key=MY_CONSUMER_KEY,
+                       consumer_secret=MY_CONSUMER_SECRET,
+                       access_token_key=MY_ACCESS_TOKEN_KEY,
+                       access_token_secret=MY_ACCESS_TOKEN_SECRET)
+
 
 def entity(text):
     if text[:2] == "&#":
@@ -37,141 +38,139 @@ def entity(text):
         try:
             text = chr(numero)
         except KeyError:
-           pass
+            pass
     return text
 
+
 def filter_tweet(tweet):
-    tweet.text = re.sub(r'\b(RT|MT) .+','',tweet.text) #take out anything after RT or MT
-    tweet.text = re.sub(r'(\#|@|(h\/t)|(http))\S+','',tweet.text) #Take out URLs, hashtags, hts, etc.
-    tweet.text = re.sub(r'\n','', tweet.text) #take out new lines.
-    tweet.text = re.sub(r'\"|\(|\)', '', tweet.text) #take out quotes.
-    tweet.text = re.sub(r'\s+\(?(via|says)\s@\w+\)?', '', tweet.text) # remove attribution
-    htmlsents = re.findall(r'&\w+;', tweet.text)
-    if len(htmlsents) > 0 :
-        for item in htmlsents:
-            tweet.text = re.sub(item, entity(item), tweet.text)
-    tweet.text = re.sub(r'\xe9', 'e', tweet.text) #take out accented e
+    tweet.text = re.sub(r'\b(RT|MT) .+', '', tweet.text)  # take out anything after RT or MT
+    tweet.text = re.sub(r'(\#|@|(h\/t)|(http))\S+', '', tweet.text)  # Take out URLs, hashtags, hts, etc.
+    tweet.text = tweet.text.replace('\n', '')  # take out new lines.
+    tweet.text = re.sub(r'\"|\(|\)', '', tweet.text)  # take out quotes.
+    tweet.text = re.sub(r'\s+\(?(via|says)\s@\w+\)?', '', tweet.text)  # remove attribution
+    htmlsents = re.findall(r'&\w+;', tweet.text)
+    for item in htmlsents:
+        tweet.text = tweet.text.replace(item, entity(item))
+    tweet.text = tweet.text.replace('\xe9', 'e')  # take out accented e
     return tweet.text
-
+
+
 def scrape_page(src_url, web_context, web_attributes):
     tweets = []
     for i in range(len(src_url)):
         print(">>> Scraping {0}".format(src_url[i]))
-        try:
-            page = urlopen(src_url[i])
+        try:
+            page = urlopen(src_url[i])
         except Exception:
-            import traceback
-            print(">>> Error scraping {0}:".format(src_url[i]))
-            print(traceback.format_exc())
-            continue
+            import traceback
+            print(">>> Error scraping {0}:".format(src_url[i]))
+            print(traceback.format_exc())
+            continue
         soup = BeautifulSoup(page, 'html.parser')
         hits = soup.find_all(web_context[i], attrs=web_attributes[i])
         for hit in hits:
             tweet = str(hit.text).strip()
-            if len(tweet) >= 0:
+            if tweet:
                 tweets.append(tweet)
     return(tweets)
-
+
+
 def grab_tweets(api, max_id=None):
-    source_tweets=[]
+    source_tweets = []
     user_tweets = api.GetUserTimeline(screen_name=user, count=200, max_id=max_id, include_rts=True, trim_user=True, exclude_replies=True)
-    max_id = user_tweets[len(user_tweets)-1].id-1
+    max_id = user_tweets[-1].id - 1
     for tweet in user_tweets:
         tweet.text = filter_tweet(tweet)
         if re.search(SOURCE_EXCLUDE, tweet.text):
             continue
-        if len(tweet.text) != 0:
+        if tweet.text:
             source_tweets.append(tweet.text)
     return source_tweets, max_id
 
-if __name__=="__main__":
+
+if __name__ == "__main__":
     order = ORDER
-    if DEBUG==False:
-        guess = random.choice(range(ODDS))
-    else:
-        guess = 0
+    guess = 0
+    if ODDS and not DEBUG:
+        guess = random.randint(0, ODDS - 1)
 
-    if guess == 0:
-        api=connect()
+    if guess:
+        print(str(guess) + " No, sorry, not this time.")  # message if the random number fails.
+        sys.exit()
+    else:
+        api = connect()
         source_tweets = []
-        if STATIC_TEST==True:
+        if STATIC_TEST:
             file = TEST_SOURCE
             print(">>> Generating from {0}".format(file))
             string_list = open(file).readlines()
             for item in string_list:
                 source_tweets += item.split(",")
-        if SCRAPE_URL==True:
+        if SCRAPE_URL:
             source_tweets += scrape_page(SRC_URL, WEB_CONTEXT, WEB_ATTRIBUTES)
         if len(SOURCE_ACCOUNTS[0]) > 0:
-            twitter_tweets = []
+            twitter_tweets = []
             for handle in SOURCE_ACCOUNTS:
-                user=handle
+                user = handle
                 handle_stats = api.GetUser(screen_name=user)
                 status_count = handle_stats.statuses_count
-                max_id=None
-                if status_count<3200:
-                    my_range = int((status_count/200) + 1)
-                else:
-                    my_range = 17
-                for x in range(my_range)[1:]:
-                    twitter_tweets_iter, max_id = grab_tweets(api,max_id)
+                max_id = None
+                my_range = min(17, int((status_count/200) + 1))
+                for x in range(1, my_range):
+                    twitter_tweets_iter, max_id = grab_tweets(api, max_id)
                     twitter_tweets += twitter_tweets_iter
                 print("{0} tweets found in {1}".format(len(twitter_tweets), handle))
-                if len(twitter_tweets) == 0:
+                if not twitter_tweets:
                     print("Error fetching tweets from Twitter. Aborting.")
                     sys.exit()
                 else:
                     source_tweets += twitter_tweets
         mine = markov.MarkovChainer(order)
         for tweet in source_tweets:
-            if re.search('([\.\!\?\"\']$)', tweet):
-                pass
-            else:
-                tweet+="."
+            if not re.search('([\.\!\?\"\']$)', tweet):
+                tweet += "."
             mine.add_text(tweet)
-
-        for x in range(0,10):
+
+        for x in range(0, 10):
             ebook_tweet = mine.generate_sentence()
 
-            #randomly drop the last word, as Horse_ebooks appears to do.
-            if random.randint(0,4) == 0 and re.search(r'(in|to|from|for|with|by|our|of|your|around|under|beyond)\s\w+$', ebook_tweet) != None:
-                print("Losing last word randomly")
-                ebook_tweet = re.sub(r'\s\w+.$','',ebook_tweet)
-                print(ebook_tweet)
-
-            #if a tweet is very short, this will randomly add a second sentence to it.
-            if ebook_tweet != None and len(ebook_tweet) < 40:
-                rando = random.randint(0,10)
-                if rando == 0 or rando == 7:
+            # randomly drop the last word, as Horse_ebooks appears to do.
+            if random.randint(0, 4) == 0 and re.search(r'(in|to|from|for|with|by|our|of|your|around|under|beyond)\s\w+$', ebook_tweet) is not None:
+                print("Losing last word randomly")
+                ebook_tweet = re.sub(r'\s\w+.$', '', ebook_tweet)
+                print(ebook_tweet)
+
+            # if a tweet is very short, this will randomly add a second sentence to it.
+            if ebook_tweet is not None and len(ebook_tweet) < 40:
+                rando = random.randint(0, 10)
+                if rando == 0 or rando == 7:
                     print("Short tweet. Adding another sentence randomly")
                     newer_tweet = mine.generate_sentence()
-                    if newer_tweet != None:
+                    if newer_tweet is not None:
                         ebook_tweet += " " + mine.generate_sentence()
                     else:
                         ebook_tweet = ebook_tweet
                 elif rando == 1:
-                    #say something crazy/prophetic in all caps
+                    # say something crazy/prophetic in all caps
                     print("ALL THE THINGS")
                     ebook_tweet = ebook_tweet.upper()
 
-            #throw out tweets that match anything from the source account.
-            if ebook_tweet != None and len(ebook_tweet) < 110:
+            # throw out tweets that match anything from the source account.
+            if ebook_tweet is not None and len(ebook_tweet) < 110:
                 for tweet in source_tweets:
                     if ebook_tweet[:-1] not in tweet:
                         continue
-                    else:
+                    else:
                         print("TOO SIMILAR: " + ebook_tweet)
                         sys.exit()
-
-                if DEBUG == False:
+
+                if not DEBUG:
                     status = api.PostUpdate(ebook_tweet)
                     print(status.text.encode('utf-8'))
                 else:
                     print(ebook_tweet)
 
-            elif ebook_tweet == None:
+            elif not ebook_tweet:
                 print("Tweet is empty, sorry.")
             else:
                 print("TOO LONG: " + ebook_tweet)
-    else:
-        print(str(guess) + " No, sorry, not this time.") #message if the random number fails.
diff --git a/markov.py b/markov.py
index b9f78a0..9a71556 100644
--- a/markov.py
+++ b/markov.py
@@ -1,23 +1,24 @@
 import random
 import re
 
+
 class MarkovChainer(object):
     def __init__(self, order):
-        self.order=order
+        self.order = order
         self.beginnings = []
         self.freq = {}
 
-    #pass a string with a terminator to the function to add it to the markov lists.
+    # pass a string with a terminator to the function to add it to the markov lists.
     def add_sentence(self, string, terminator):
         data = "".join(string)
         words = data.split()
         buf = []
         if len(words) > self.order:
             words.append(terminator)
-            self.beginnings.append(words[0:self.order])
+            self.beginnings.append(words[0:self.order])
         else:
             pass
-
+
         for word in words:
             buf.append(word)
             if len(buf) == self.order + 1:
@@ -44,21 +45,21 @@ def add_text(self, text):
             else:
                 sentence = piece
 
-    #Generate the goofy sentences that become your tweet.
+    # Generate the goofy sentences that become your tweet.
     def generate_sentence(self):
         res = random.choice(self.beginnings)
         res = res[:]
-        if len(res)==self.order:
+        if len(res) == self.order:
             nw = True
-            while nw != None:
+            while nw is not None:
                 restup = (res[-2], res[-1])
                 try:
                     nw = self.next_word_for(restup)
-                    if nw != None:
+                    if nw is not None:
                         res.append(nw)
                     else:
                         continue
-                except:
+                except Exception:
                     nw = False
             new_res = res[0:-2]
             if new_res[0].istitle() or new_res[0].isupper():
@@ -79,8 +80,9 @@ def next_word_for(self, words):
             arr = self.freq[words]
             next_words = random.choice(arr)
             return next_words
-        except:
-            return None
+        except Exception:
+            return None
+
 
 if __name__ == "__main__":
     print("Try running ebooks.py first")
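
For readers who want to poke at the cleaned-up `markov.py` directly, a minimal usage sketch (assuming the default `ORDER = 2`; the corpus is made up and the output varies from run to run):

```python
import markov

chain = markov.MarkovChainer(2)
for sentence in ["The bot scrapes pages.",
                 "The bot posts strange tweets.",
                 "Pages become strange tweets."]:
    chain.add_text(sentence)

print(chain.generate_sentence())  # e.g. "The bot posts strange tweets."
```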
From f091a59ac13d9116173b9fcf4ebfa49c644a032c Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Wed, 8 Nov 2017 09:04:37 -0500
Subject: [PATCH 08/13] Linting for local_settings.py too

---
 local_settings_example.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/local_settings_example.py b/local_settings_example.py
index e5e83b9..2062009 100644
--- a/local_settings_example.py
+++ b/local_settings_example.py
@@ -2,21 +2,21 @@
 Local Settings for a heroku_ebooks account. #fill in the name of the account you're tweeting from here.
 '''
 
-#configuration
+# Configuration
 MY_CONSUMER_KEY = 'Your Twitter API Consumer Key'
 MY_CONSUMER_SECRET = 'Your Consumer Secret Key'
 MY_ACCESS_TOKEN_KEY = 'Your Twitter API Access Token Key'
 MY_ACCESS_TOKEN_SECRET = 'Your Access Token Secret'
 
-SOURCE_ACCOUNTS = [""] #A list of comma-separated, quote-enclosed Twitter handles of account that you'll generate tweets based on. It should look like ["account1", "account2"]. If you want just one account, no comma needed.
-ODDS = 8 #How often do you want this to run? 1/8 times?
-ORDER = 2 #how closely do you want this to hew to sensical? 2 is low and 4 is high.
-SOURCE_EXCLUDE = r'^$' #Source tweets that match this regexp will not be added to the Markov chain. You might want to filter out inappropriate words for example.
-DEBUG = True #Set this to False to start Tweeting live
-STATIC_TEST = False #Set this to True if you want to test Markov generation from a static file instead of the API.
-TEST_SOURCE = ".txt" #The name of a text file of a string-ified list for testing. To avoid unnecessarily hitting Twitter API. You can use the included testcorpus.txt, if needed.
-SCRAPE_URL = False #Set this to true to scrape a webpage.
-SRC_URL = ['http://www.example.com/one', 'https://www.example.com/two'] #A comma-separated list of URLs to scrape
-WEB_CONTEXT = ['span', 'h2'] #A comma-separated list of the tag or object to search for in each page above.
-WEB_ATTRIBUTES = [{'class': 'example-text'}, {}] #A list of dictionaries containing the attributes for each page.
-TWEET_ACCOUNT = "" #The name of the account you're tweeting to.
+SOURCE_ACCOUNTS = [""] # A list of comma-separated, quote-enclosed Twitter handles of accounts that you'll generate tweets based on. It should look like ["account1", "account2"]. If you want just one account, no comma needed.
+ODDS = 8 # How often do you want this to run? 1/8 times?
+ORDER = 2 # How closely do you want this to hew to sensical? 2 is low and 4 is high.
+SOURCE_EXCLUDE = r'^$' # Source tweets that match this regexp will not be added to the Markov chain. You might want to filter out inappropriate words for example.
+DEBUG = True # Set this to False to start Tweeting live
+STATIC_TEST = False #S et this to True if you want to test Markov generation from a static file instead of the API.
+TEST_SOURCE = ".txt" # The name of a text file of a string-ified list for testing. To avoid unnecessarily hitting Twitter API. You can use the included testcorpus.txt, if needed.
+SCRAPE_URL = False # Set this to true to scrape a webpage.
+SRC_URL = ['http://www.example.com/one', 'https://www.example.com/two'] # A comma-separated list of URLs to scrape
+WEB_CONTEXT = ['span', 'h2'] # A comma-separated list of the tag or object to search for in each page above.
+WEB_ATTRIBUTES = [{'class': 'example-text'}, {}] # A list of dictionaries containing the attributes for each page.
+TWEET_ACCOUNT = "" # The name of the account you're tweeting to.

From 48158a604233473e9df133d55aa80424f37831a6 Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Fri, 10 Nov 2017 19:34:57 -0500
Subject: [PATCH 09/13] Document scraping, make last couple of tweaks.

---
 README.md | 22 ++++++++++++++++++++--
 ebooks.py | 21 ++++++++++++---------
 markov.py |  2 +-
 3 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index ac741d7..23795c1 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ This is a basic Python port of [@harrisj's](https://twitter.com/harrisj) [iron_e
 
 ## Configuring
 
-There are several parameters that control the behavior of the bot. You can adjust them by setting them in your `local_settings.py` file. 
+There are several parameters that control the behavior of the bot. You can adjust them by setting them in your `local_settings.py` file.
 
 ```
 ODDS = 8
@@ -40,6 +40,24 @@
 ORDER = 2
 ```
 
 The ORDER variable represents the Markov index, which is a measure of associativity in the generated Markov chains. 2 is generally more incoherent and 3 or 4 is more lucid. I tend to stick with 2.
 
+### Additional sources
+
+This bot was originally designed to pull tweets from a Twitter account; however, it can also process comma-separated text in a text file, or scrape content from the web.
+
+If you wish to use _only_ a textfile or a web resource, make sure that `SOURCE_ACCOUNTS` in your `local_settings.py` file is exactly `[""]`.
+
+#### Static Text
+To use a local text file, set `STATIC_TEST = True` and specify the name of a text file containing comma-separated "tweets" as `TEST_SOURCE`.
+
+#### Web Content
+To scrape content from the web, set `SCRAPE_URL` to `True`. This bot makes use of the [`find_all()` method](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find-all) of Python's BeautifulSoup library. The implementation of this method requires the definition of three inputs in `local_settings.py`.
+
+1. A list of URLs to scrape as `SRC_URL`.
+2. A list, `WEB_CONTEXT`, of the [names](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#id11) of the elements to extract from the corresponding URL. This can be "div", "h1" for level-one headings, "a" for links, etc. If you wish to search for more than one name for a single page, repeat the URL in the `SRC_URL` list for as many names as you wish to extract.
+3. A list, `WEB_ATTRIBUTES`, of dictionaries containing [attributes](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#attrs) to filter by. For instance, to limit the search to divs of class "title", one would pass the dictionary: `{"class": "title"}`. Use an empty dictionary, `{}`, for any page and name for which you don't wish to specify attributes.
+
+__Note:__ Web scraping is experimental and may give you unexpected results. Make sure to test the bot in debugging mode before publishing.
+
 ## Debugging
 
 If you want to test the script or to debug the tweet generation, you can skip the random number generation and not publish the resulting tweets to Twitter.
 
 First, adjust the `DEBUG` variable in `local_settings.py`.
 
 ```
-DEBUG = True 
+DEBUG = True
 ```
 
 After that, commit the change and `git push heroku master`. Then run the command `heroku run worker` on the command line and watch what happens.
diff --git a/ebooks.py b/ebooks.py
index 1d19baa..6d4c235 100644
--- a/ebooks.py
+++ b/ebooks.py
@@ -57,16 +57,19 @@ def filter_tweet(tweet):
 
 def scrape_page(src_url, web_context, web_attributes):
     tweets = []
+    last_url = ""
     for i in range(len(src_url)):
-        print(">>> Scraping {0}".format(src_url[i]))
-        try:
-            page = urlopen(src_url[i])
-        except Exception:
-            import traceback
-            print(">>> Error scraping {0}:".format(src_url[i]))
-            print(traceback.format_exc())
-            continue
-        soup = BeautifulSoup(page, 'html.parser')
+        if src_url[i] != last_url:
+            last_url = src_url[i]
+            print(">>> Scraping {0}".format(src_url[i]))
+            try:
+                page = urlopen(src_url[i])
+            except Exception:
+                import traceback
+                print(">>> Error scraping {0}:".format(src_url[i]))
+                print(traceback.format_exc())
+                continue
+            soup = BeautifulSoup(page, 'html.parser')
         hits = soup.find_all(web_context[i], attrs=web_attributes[i])
         for hit in hits:
             tweet = str(hit.text).strip()
diff --git a/markov.py b/markov.py
index 9a71556..915d14a 100644
--- a/markov.py
+++ b/markov.py
@@ -69,7 +69,7 @@ def generate_sentence(self):
             sentence = ""
             for word in new_res:
                 sentence += word + " "
-            sentence += res[-2] + res[-1]
+            sentence += res[-2] + " " + res[-1]
         else:
             sentence = None
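
One way to read the scraping settings the new README section describes (placeholder values only): repeating a URL lets you pull two different element types from the same page, and the `last_url` check added above means that page is only downloaded once.

```python
SRC_URL = ['https://www.example.com/news', 'https://www.example.com/news']
WEB_CONTEXT = ['h1', 'div']
WEB_ATTRIBUTES = [{}, {'class': 'title'}]
# Entry 0 collects every <h1> on the page; entry 1 collects only <div class="title">.
# Because entry 1 repeats the URL, scrape_page() reuses the soup it already parsed
# instead of calling urlopen() a second time.
```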
From 58ac2006d498bf62d296b18800e04f9e1b4a0f0b Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Fri, 10 Nov 2017 19:40:44 -0500
Subject: [PATCH 10/13] Re-try download for edge case where download fails and
 next URL is the same.

---
 ebooks.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ebooks.py b/ebooks.py
index 6d4c235..a168e20 100644
--- a/ebooks.py
+++ b/ebooks.py
@@ -65,6 +65,7 @@ def scrape_page(src_url, web_context, web_attributes):
             try:
                 page = urlopen(src_url[i])
             except Exception:
+                last_url = "ERROR"
                 import traceback
                 print(">>> Error scraping {0}:".format(src_url[i]))
                 print(traceback.format_exc())

From a00b2796b55ae459fe3c70a654fc3429735ffdb8 Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Fri, 10 Nov 2017 20:03:39 -0500
Subject: [PATCH 11/13] One misplaced space!

---
 local_settings_example.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/local_settings_example.py b/local_settings_example.py
index 2062009..2b39cce 100644
--- a/local_settings_example.py
+++ b/local_settings_example.py
@@ -13,7 +13,7 @@
 ORDER = 2 # How closely do you want this to hew to sensical? 2 is low and 4 is high.
 SOURCE_EXCLUDE = r'^$' # Source tweets that match this regexp will not be added to the Markov chain. You might want to filter out inappropriate words for example.
 DEBUG = True # Set this to False to start Tweeting live
-STATIC_TEST = False #S et this to True if you want to test Markov generation from a static file instead of the API.
+STATIC_TEST = False # Set this to True if you want to test Markov generation from a static file instead of the API.
 TEST_SOURCE = ".txt" # The name of a text file of a string-ified list for testing. To avoid unnecessarily hitting Twitter API. You can use the included testcorpus.txt, if needed.
 SCRAPE_URL = False # Set this to true to scrape a webpage.
 SRC_URL = ['http://www.example.com/one', 'https://www.example.com/two'] # A comma-separated list of URLs to scrape

From c124288e281e77f0d40a3bb0056ded656ccf8791 Mon Sep 17 00:00:00 2001
From: Conor Anderson
Date: Sat, 11 Nov 2017 11:23:20 -0500
Subject: [PATCH 12/13] Responses to review.

---
 ebooks.py | 25 ++++++++++++++++++-------
 markov.py |  2 +-
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/ebooks.py b/ebooks.py
index a168e20..41b5255 100644
--- a/ebooks.py
+++ b/ebooks.py
@@ -45,13 +45,13 @@ def entity(text):
 def filter_tweet(tweet):
     tweet.text = re.sub(r'\b(RT|MT) .+', '', tweet.text)  # take out anything after RT or MT
     tweet.text = re.sub(r'(\#|@|(h\/t)|(http))\S+', '', tweet.text)  # Take out URLs, hashtags, hts, etc.
-    tweet.text = tweet.text.replace('\n', '')  # take out new lines.
+    tweet.text = re.sub('\s+', ' ', tweet.text)  # collapse consecutive whitespace to single spaces.
     tweet.text = re.sub(r'\"|\(|\)', '', tweet.text)  # take out quotes.
     tweet.text = re.sub(r'\s+\(?(via|says)\s@\w+\)?', '', tweet.text)  # remove attribution
     htmlsents = re.findall(r'&\w+;', tweet.text)
     for item in htmlsents:
         tweet.text = tweet.text.replace(item, entity(item))
-    tweet.text = tweet.text.replace('\xe9', 'e')  # take out accented e
+    tweet.text = re.sub(r'\xe9', 'e', tweet.text)  # take out accented e
     return tweet.text
 
 
@@ -72,10 +72,21 @@ def scrape_page(src_url, web_context, web_attributes):
                 continue
             soup = BeautifulSoup(page, 'html.parser')
         hits = soup.find_all(web_context[i], attrs=web_attributes[i])
-        for hit in hits:
-            tweet = str(hit.text).strip()
-            if tweet:
-                tweets.append(tweet)
+        if not hits:
+            print(">>> No results found!")
+            continue
+        else:
+            errors = 0
+            for hit in hits:
+                try:
+                    tweet = str(hit.text).strip()
+                except (UnicodeEncodeError, UnicodeDecodeError):
+                    errors += 1
+                    continue
+                if tweet:
+                    tweets.append(tweet)
+            if errors > 0:
+                print(">>> We had trouble reading {} result{}.".format(errors, "s" if errors > 1 else ""))
     return(tweets)
 
 
@@ -112,7 +123,7 @@ def grab_tweets(api, max_id=None):
             source_tweets += scrape_page(SRC_URL, WEB_CONTEXT, WEB_ATTRIBUTES)
-        if len(SOURCE_ACCOUNTS[0]) > 0:
+        if SOURCE_ACCOUNTS and len(SOURCE_ACCOUNTS[0]) > 0:
             twitter_tweets = []
             for handle in SOURCE_ACCOUNTS:
                 user = handle
diff --git a/markov.py b/markov.py
index 915d14a..6ff8530 100644
--- a/markov.py
+++ b/markov.py
@@ -69,7 +69,7 @@ def generate_sentence(self):
             sentence = ""
             for word in new_res:
                 sentence += word + " "
-            sentence += res[-2] + " " + res[-1]
+            sentence += res[-2] + ("" if res[-1] in ".!?;:" else " ") + res[-1]
         else:
             sentence = None
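
A tiny worked example of the sentence-join change to `markov.py` above (made-up tokens): when the chain ends in punctuation, the terminator should attach to the last word rather than float after a space.

```python
res = ["horses", "know", "things", "."]
new_res = res[0:-2]

sentence = ""
for word in new_res:
    sentence += word + " "
sentence += res[-2] + ("" if res[-1] in ".!?;:" else " ") + res[-1]

print(sentence)  # "horses know things."  (the previous join produced "horses know things .")
```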
-If you wish to use _only_ a textfile or a web resource, make sure that `SOURCE_ACCOUNTS` in your `local_settings.py` file is exactly `[""]`. - #### Static Text To use a local text file, set `STATIC_TEST = True` and specify the name of a text file containing comma-separated "tweets" as `TEST_SOURCE`.