Commit

added script, gitignore, readme files
gliserma committed Oct 31, 2021
0 parents commit 17985d0
Showing 3 changed files with 258 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@
.DS_Store
.vscode/
authentication.js
node_modules
notes/
**/*-todo.md
22 changes: 22 additions & 0 deletions README.md
@@ -0,0 +1,22 @@
# Broken Links Webcrawler
Developed to identify broken links on the website of
the Gilder Lehrman Institute of American History.

## Implementation
- A simple Python script that can be launched from the terminal.

## Optional Command Line Arguments
- --fname: desired base name for the output file (".csv" is appended; defaults to gli_hyperlinks)
- --number: maximum number of pages to crawl (defaults to 10000); see the example invocation below

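For example, a crawl capped at 500 pages that writes gli_links.csv (and broken_gli_links.csv) could be launched as follows (an illustrative invocation, assuming a Python 3 interpreter with Scrapy installed):

```
python broken_link_search.py --fname gli_links --number 500
```
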
## Output: Two Files
- fname: a csv listing every page visited and every link found on those pages
- broken_fname: only the broken links, i.e. the origin page, anchor text, destination page, and the destination's status code (sample header below)

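Both files share the header below (the FIELDS list in broken_link_search.py); the outbound_status_code column is only filled in for the broken_fname file:

```
origin_url,origin_status_code,status_description,outbound_anchor_text,outbound_hyperlink,outbound_status_code
```
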
## Requirements
- Python 3.6+
- Scrapy 2.5.0 (install example below)

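Scrapy can typically be installed from PyPI (assuming pip points at a Python 3.6+ environment):

```
pip install scrapy==2.5.0
```
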
## Future Steps
- find broken images
- find pages with code fragments showing as text
230 changes: 230 additions & 0 deletions broken_link_search.py
@@ -0,0 +1,230 @@
import argparse
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
import csv

# Dictionary containing http status codes
# with their corresponding meanings.
RESPONSE_CODES = {
    200: "OK",
    301: "Moved Permanently",
    302: "Found",
    400: "Bad Request",
    401: "Unauthorized",
    403: "Forbidden",
    404: "Not Found"
}
# Names of the CSV fields
FIELDS = [
    'origin_url',            # 0
    'origin_status_code',    # 1
    'status_description',    # 2
    'outbound_anchor_text',  # 3
    'outbound_hyperlink',    # 4
    'outbound_status_code',  # 5
]

class GLI_Spider(CrawlSpider):
    """
    Searches the Gilder Lehrman website
    for all hyperlinks and page statuses.
    """
    # Parameters for this Crawl Spider
    name = 'broken_links_from_homepage'
    allowed_domains = ['gilderlehrman.org']
    start_urls = [
        'https://www.gilderlehrman.org/',
        'https://www.gilderlehrman.org/news/'
    ]
    handle_httpstatus_list = [
        200,
        301,
        302,
        400,
        401,
        403,
        404
    ]
    # How and where the spider will crawl
    rules = [
        Rule(LinkExtractor(allow_domains='gilderlehrman.org'),
             callback='parse_info',
             follow=True)
    ]

    # Another link extractor -- not for crawling but
    # for finding all hyperlinks on a given page.
    le = LinkExtractor(
        allow_domains='gilderlehrman.org',
        unique=False
    )

    def parse_info(self, response):
        """
        Defines what the spider will extract
        from a page that it visits.
        """
        # Extract information from the response
        status = response.status
        desc = RESPONSE_CODES[status]
        internal_links = self.le.extract_links(response)

        # OUTPUT for the CSV
        # If the page is not working:
        if status >= 400:
            yield {
                # Collect only the url and status for this page
                FIELDS[0]: response.url,
                FIELDS[1]: status,
                FIELDS[2]: desc,
            }
        # If the page is working:
        else:
            for out_link in internal_links:
                # Also collect all outbound links on this page
                yield {
                    FIELDS[0]: response.url,
                    FIELDS[1]: status,
                    FIELDS[2]: desc,
                    FIELDS[3]: format_for_csv(out_link.text.strip()),
                    FIELDS[4]: remove_bookmarks(out_link.url)
                }

def remove_bookmarks(hyperlink):
    """
    Strips the fragment (anything after a '#') from a hyperlink.
    """
    if '#' in hyperlink:
        index = hyperlink.find('#')
        hyperlink = hyperlink[0:index]
    return hyperlink

def format_for_csv(description):
    """
    Removes commas and newlines from text that should
    be a standalone cell in the output csv.
    """
    split_desc = str(description).split(sep=',')
    split_desc = " ".join(split_desc).splitlines()
    return " ".join(split_desc)

class CSV_URLs():
    """
    Processes the raw results of the webcrawl and outputs
    a new csv file of the broken links.

    One limitation of Scrapy is that it is not always
    able to report the origin page by which it arrived
    at a broken page. This class resolves the issue by
    essentially reversing the direction of the directed
    graph, so that the destination pages now point back
    to their origins. The ultimate output of this process
    is a new csv file containing only the broken links.
    """
    def __init__(self, fname):
        self.filename = fname
        self.broken_pages = dict()
        self.find_broken_pages()
        self.broken_links = list()
        self.find_broken_links()
        self.rewrite_csv()

    def find_broken_pages(self):
        """
        Scans the csv file for any pages that returned a
        broken status code (400, 401, or 404).
        """
        # Open the CSV file and prepare it for reading
        with open(self.filename, newline='') as link_list:
            link_reader = csv.DictReader(link_list, delimiter=',')

            # Find all broken pages in the csv
            for row in link_reader:
                status = int(row[FIELDS[1]])
                if status in (400, 401, 404):
                    self.broken_pages[row[FIELDS[0]]] = status

    def find_broken_links(self):
        """
        Scans the csv file for any outbound links that
        lead to a broken page.
        """
        with open(self.filename, newline='') as link_list:
            link_reader = csv.DictReader(link_list, delimiter=',')

            # Find all broken links in the csv
            for row in link_reader:
                if row[FIELDS[4]] in self.broken_pages:
                    self.broken_links.append({
                        FIELDS[0]: row[FIELDS[0]],
                        FIELDS[1]: row[FIELDS[1]],
                        FIELDS[2]: row[FIELDS[2]],
                        FIELDS[3]: row[FIELDS[3]],
                        FIELDS[4]: row[FIELDS[4]],
                        FIELDS[5]: self.broken_pages[row[FIELDS[4]]],
                    })

    def rewrite_csv(self):
        """
        Writes only the broken links to a new csv file
        prefixed with 'broken_'.
        """
        with open("broken_" + self.filename, 'w', newline='') as broken_link_list:
            bll_writer = csv.DictWriter(broken_link_list, FIELDS)
            bll_writer.writeheader()

            # Each entry is already keyed by the FIELDS names
            for entry in self.broken_links:
                bll_writer.writerow(entry)


def main():
    # GET ARGUMENTS
    parser = argparse.ArgumentParser(description='options')
    # Parameters: filename and number of pages to search
    parser.add_argument('--fname', dest='fname', help='Name of output file')
    parser.add_argument('--number', dest='number', help='Number of pages to search')
    args = parser.parse_args()
    # Parse the arguments
    if args.fname is not None:
        links_fname = args.fname + ".csv"
    else:
        links_fname = 'gli_hyperlinks.csv'

    if args.number is not None and args.number.isdigit():
        num_searches = int(args.number)
    else:
        num_searches = 10000

    # Initialize settings for webcrawl
    process = CrawlerProcess(settings={
        'FEEDS': {
            links_fname: {
                'format': 'csv',
                'fields': FIELDS,
                'overwrite': True,
            },
        },
        'CLOSESPIDER_PAGECOUNT': num_searches,
        'LOG_LEVEL': 'CRITICAL',
        # Crawl breadth-first: DEPTH_PRIORITY of 1 with FIFO queues makes
        # Scrapy visit pages closer to the start_urls before deeper ones.
        'DEPTH_PRIORITY': 1,
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
    })

    # Start the webcrawl
    process.crawl(GLI_Spider)
    process.start()

    # For all broken pages, find the source hyperlink
    CSV_URLs(links_fname)


if __name__ == "__main__":
    main()
