added script, gitignore, readme files
commit 17985d0
Showing 3 changed files with 258 additions and 0 deletions.
.gitignore
@@ -0,0 +1,6 @@
.DS_Store
.vscode/
authentication.js
node_modules
notes/
**/*-todo.md
README.md
@@ -0,0 +1,22 @@
# Broken Links Webcrawler
Developed to identify broken links on the website of
the Gilder Lehrman Institute of American History.

## Implementation
- A simple Python script that can be launched from the terminal (see the example invocation below).

## Optional Command Line Arguments
- `--fname`: desired name for the output file
- `--number`: how many pages should be searched
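For example, assuming the crawler script is saved as `crawl.py` (its file name is not shown in this commit), a crawl capped at 500 pages that writes its results to `results.csv` could be launched as:

```
# hypothetical invocation; the script name 'crawl.py' is an assumption
python3 crawl.py --fname results --number 500
```

This would write the full crawl to `results.csv` and the broken links to `broken_results.csv`.
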
## Output: Two Files
- `fname`: all pages visited and all the links contained in those pages, as a CSV
- `broken_fname`: all broken links, i.e. origin page, destination page, anchor text
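Both files share the column layout defined by the script's FIELDS list, so the header row of each CSV should look like:

```
origin_url,origin_status_code,status_description,outbound_anchor_text,outbound_hyperlink,outbound_status_code
```

In the first file the `outbound_status_code` column is left empty; it is only filled in for the broken-links file.
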
## Requirements
- Python 3.6+
- Scrapy 2.5.0 (install command below)
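The pinned Scrapy release can be installed with pip:

```
pip install scrapy==2.5.0
```
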
## Future Steps
- find broken images
- find pages with code fragments showing as text
@@ -0,0 +1,230 @@
import argparse
import csv

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess


# Dictionary containing HTTP status codes
# with their corresponding meanings.
RESPONSE_CODES = {
    200: "OK",
    301: "Moved Permanently",
    302: "Found",
    400: "Bad Request",
    401: "Unauthorized",
    403: "Forbidden",
    404: "Not Found"
}
# Names of the CSV fields
FIELDS = [
    'origin_url',            # 0
    'origin_status_code',    # 1
    'status_description',    # 2
    'outbound_anchor_text',  # 3
    'outbound_hyperlink',    # 4
    'outbound_status_code',  # 5
]


class GLI_Spider(CrawlSpider):
    """
    Searches the Gilder Lehrman website
    for all hyperlinks and page statuses.
    """
    # Parameters for this Crawl Spider
    name = 'broken_links_from_homepage'
    allowed_domains = ['gilderlehrman.org']
    start_urls = [
        'https://www.gilderlehrman.org/',
        'https://www.gilderlehrman.org/news/'
    ]
    handle_httpstatus_list = [
        200,
        301,
        302,
        400,
        401,
        403,
        404
    ]
    # How and where the spider will crawl
    rules = [
        Rule(LinkExtractor(allow_domains='gilderlehrman.org'),
             callback='parse_info',
             follow=True)
    ]

    # Another link extractor -- not for crawling but
    # for finding all hyperlinks on a given page.
    le = LinkExtractor(
        allow_domains='gilderlehrman.org',
        unique=False
    )

    def parse_info(self, response):
        """
        Defines what the spider will extract
        from a page that it visits.
        """
        # Extract information from the response
        status = response.status
        # Fall back to a generic description for any status
        # code that is not listed in RESPONSE_CODES.
        desc = RESPONSE_CODES.get(status, "Unknown")
        internal_links = self.le.extract_links(response)

        # OUTPUT for the CSV
        # If the page is not working:
        if status >= 400:
            yield {
                # Collect only the url and status for this page
                FIELDS[0]: response.url,
                FIELDS[1]: status,
                FIELDS[2]: desc,
            }
        # If the page is working:
        else:
            for out_link in internal_links:
                # Also collect all outbound links on this page
                yield {
                    FIELDS[0]: response.url,
                    FIELDS[1]: status,
                    FIELDS[2]: desc,
                    FIELDS[3]: format_for_csv(out_link.text.strip()),
                    FIELDS[4]: remove_bookmarks(out_link.url)
                }

def remove_bookmarks(hyperlink):
    """
    Strips any text following a hashtag ('#') in a hyperlink,
    e.g. 'https://example.org/page#top' -> 'https://example.org/page'.
    """
    if '#' in hyperlink:
        index = hyperlink.find('#')
        hyperlink = hyperlink[0:index]
    return hyperlink


def format_for_csv(description):
    """
    Removes commas and newlines from text that should
    be a standalone cell in the output csv.
    """
    split_desc = str(description).split(sep=',')
    split_desc = " ".join(split_desc).splitlines()
    return " ".join(split_desc)


class CSV_URLs():
    """
    Processes the raw results of the webcrawl and outputs
    a new csv file of the broken links.

    One limitation of Scrapy is that it is not always
    able to report the origin page by which it arrived
    at a broken page. This class works around that by
    essentially reversing the direction of the directed
    graph, so that the destination pages now point back
    to their origins. The ultimate output of this process
    is a new csv file exclusively containing the broken links.
    """
    def __init__(self, fname):
        self.filename = fname
        self.broken_pages = dict()
        self.find_broken_pages()
        self.broken_links = list()
        self.find_broken_links()
        self.rewrite_csv()

    def find_broken_pages(self):
        """
        Scans the csv file for any pages whose status code
        marks them as broken (400, 401 or 404).
        """
        # Open the CSV file and prepare it for reading
        with open(self.filename, newline='') as link_list:
            link_reader = csv.DictReader(link_list, delimiter=',')

            # Find all broken pages in the csv
            for row in link_reader:
                status = int(row[FIELDS[1]])
                if status in (400, 401, 404):
                    self.broken_pages[row[FIELDS[0]]] = status

    def find_broken_links(self):
        """
        Scans the csv file for any outbound links that
        lead to a broken page.
        """
        with open(self.filename, newline='') as link_list:
            link_reader = csv.DictReader(link_list, delimiter=',')

            # Find all broken links in the csv
            for row in link_reader:
                if row[FIELDS[4]] in self.broken_pages:
                    self.broken_links.append({
                        FIELDS[0]: row[FIELDS[0]],
                        FIELDS[1]: row[FIELDS[1]],
                        FIELDS[2]: row[FIELDS[2]],
                        FIELDS[3]: row[FIELDS[3]],
                        FIELDS[4]: row[FIELDS[4]],
                        FIELDS[5]: self.broken_pages[row[FIELDS[4]]],
                    })

    def rewrite_csv(self):
        """
        Writes the broken links to a new csv file whose
        name is prefixed with 'broken_'.
        """
        with open("broken_" + self.filename, 'w', newline='') as broken_link_list:
            bll_writer = csv.DictWriter(broken_link_list, FIELDS)
            bll_writer.writeheader()

            for entry in self.broken_links:
                bll_writer.writerow({
                    FIELDS[0]: entry[FIELDS[0]],
                    FIELDS[1]: entry[FIELDS[1]],
                    FIELDS[2]: entry[FIELDS[2]],
                    FIELDS[3]: entry[FIELDS[3]],
                    FIELDS[4]: entry[FIELDS[4]],
                    FIELDS[5]: entry[FIELDS[5]],
                })


def main():
    # GET ARGUMENTS
    parser = argparse.ArgumentParser(description='options')
    # Parameters: filename and number of pages to search
    parser.add_argument('--fname', dest='fname', help='Name of output file')
    parser.add_argument('--number', dest='number', help='Number of pages to search')
    args = parser.parse_args()
    # Parse the arguments
    if args.fname is not None:
        links_fname = args.fname + ".csv"
    else:
        links_fname = 'gli_hyperlinks.csv'

    if args.number is not None and args.number.isdigit():
        num_searches = int(args.number)
    else:
        num_searches = 10000

    # Initialize settings for webcrawl
    process = CrawlerProcess(settings={
        'FEEDS': {
            links_fname: {
                'format': 'csv',
                'fields': FIELDS,
                'overwrite': True,
            },
        },
        'CLOSESPIDER_PAGECOUNT': num_searches,
        'LOG_LEVEL': 'CRITICAL',
        'DEPTH_PRIORITY': 1,
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
    })

    # Start the webcrawl
    process.crawl(GLI_Spider)
    process.start()

    # For all broken pages, find the source hyperlink
    adjust_csv = CSV_URLs(links_fname)


if __name__ == "__main__":
    main()
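
Because the post-processing lives in its own class, it can be re-run on an existing crawl CSV without launching a new crawl. A minimal sketch, assuming the script above is importable as a module named `crawl` (the actual file name is not shown in this commit):

```python
# Hypothetical re-run of only the broken-link post-processing step.
# 'crawl' is an assumed module name for the script shown above.
from crawl import CSV_URLs

# Reads gli_hyperlinks.csv (the default crawl output) and writes
# broken_gli_hyperlinks.csv next to it.
CSV_URLs('gli_hyperlinks.csv')
```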