# wiki-scraper.py

import requests  # to download pages
import bs4
import csv
import re

def findMovieName(infoSoup):
    '''
    The simplest of them all: the name sits in the infobox's first table row.
    '''
    return infoSoup.tr.string

def findDirector(infoSoup):
    '''
    Almost as simple as the movie name, but directors seem to be stored in
    two different ways: as a plain string or inside nested tags.
    '''
    director = ""
    all_tr = infoSoup.find_all('tr')  # all table rows, so we can iterate through them
    for tr in all_tr:
        if tr.th is not None:
            if tr.th.string == "Directed by":
                director = tr.td.string
                if director is None:  # the name is wrapped in nested tags, so flatten them
                    director = tr.td.get_text()
    return director
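
# For reference, a "Directed by" row in the infobox looks roughly like this
# (a simplified, hypothetical fragment; real rows may wrap the name in links):
#   <tr><th>Directed by</th><td>Todd Phillips</td></tr>
# which is why tr.td.string usually works, with get_text() as the fallback.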

def findReleaseDate(infoSoup):
    '''
    Dates in the HTML are stored a little differently from how we want them,
    so they are processed here with a regex.
    '''
    releaseDates = []
    all_tr = infoSoup.find_all('tr')  # all table rows, so we can iterate through them
    for tr in all_tr:
        if tr.th is not None:
            if tr.th.string == "Release date":
                li = tr.td.find_all('li')
                for tag in li:
                    dateBlob = tag.get_text()  # raw text from which date and location need to be extracted
                    # target format: (yyyy-mm-dd) (location)
                    date = re.search(r"(\(\d{4}-\d{2}-\d{2})\s*(\D+)?", dateBlob)
                    if date is not None:  # skip entries the regex cannot parse
                        releaseDates.append(date.group(0))
    return releaseDates
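
# A minimal sketch of what the regex extracts, assuming a typical infobox
# date blob (the sample string is hypothetical):
#   >>> re.search(r"(\(\d{4}-\d{2}-\d{2})\s*(\D+)?",
#   ...           "31 August 2019 (2019-08-31) (Venice)").group(0)
#   '(2019-08-31) (Venice)'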

def findStars(infoSoup):
    '''
    Stars are stored in 'li' tags, in 'a' tags, or in the 'td' tag itself -
    different layers of nesting.
    '''
    stars = []
    all_tr = infoSoup.find_all('tr')  # all table rows, so we can iterate through them
    for tr in all_tr:
        if tr.th is not None:
            if tr.th.string == "Starring":
                li = tr.td.find_all('li')
                for tag in li:
                    stars.append(tag.string)
                if stars == []:
                    a = tr.td.find_all('a')
                    for tag in a:
                        stars.append(tag.string)
                if stars == []:  # if we still haven't found any stars, they must be in the top-level 'td'
                    stars.append(tr.td.string)
    if not stars:
        return ["No stars."]
    return stars
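
# The three "Starring" layouts handled above, roughly (hypothetical,
# simplified fragments):
#   <td><ul><li>Joaquin Phoenix</li></ul></td>   # names in 'li' tags
#   <td><a>Joaquin Phoenix</a></td>              # names in bare 'a' tags
#   <td>Joaquin Phoenix</td>                     # a single name in the 'td' itself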

def findPlot(soup):
    """
    This was tricky: for some or other reason Wikipedia does not order
    paragraphs hierarchically under their headings (heading > paragraph) as
    you would expect, but keeps them as siblings, which makes it awkward to
    grab the paragraphs that come right after the 'Plot' heading.
    """
    plot = ""
    rightDiv = soup.find('div', class_="mw-parser-output")  # the div that holds the article body
    if rightDiv is not None:
        siblings = rightDiv.find_all(['h2', 'p'])  # all headings (h2) and paragraphs (p) under this div (remember, h2 and p are siblings)
        count = 0
        for tag in siblings:
            if tag(id="Plot"):  # the 'Plot' h2 contains a span with id="Plot"
                count += 1
            if 0 < count <= 3:
                if tag.get_text() == "Cast[edit]":  # this seems to be the next heading after Plot, and we don't want it
                    break
                elif tag.get_text() == "Plot[edit]":
                    plot += "Plot" + "\n"
                else:
                    plot += tag.get_text() + "\n"
                count += 1
            elif count > 3:
                break  # we only want the first three siblings (the heading plus two paragraphs)
    return plot
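
# The sibling layout findPlot relies on, sketched (a simplified, hypothetical
# fragment of the article body):
#   <div class="mw-parser-output">
#     <h2><span id="Plot">Plot</span></h2>    <- heading
#     <p>First plot paragraph...</p>          <- sibling of the heading, not a child
#     <p>Second plot paragraph...</p>
#     <h2><span id="Cast">Cast</span></h2>
#   </div>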

def scrapeWikiMovie(url):
    print(url)
    res = requests.get(url)  # retrieve the page
    res.raise_for_status()  # raise if the download failed
    wikiSoup = bs4.BeautifulSoup(res.text, "html.parser")  # res.text is all text from the page; html.parser structures it as HTML
    infoTable = wikiSoup.find(class_="infobox vevent")
    directedBy = findDirector(infoTable)
    releaseDates = findReleaseDate(infoTable)
    movieName = findMovieName(infoTable)
    starring = findStars(infoTable)  # some of these still don't come out right
    plot = findPlot(wikiSoup)
    # some sanity checks
    if movieName == "" or movieName is None:
        print("Something wrong with movieName " + url)
        print("\n")
    if releaseDates == []:
        print("Something wrong with releaseDates " + url)
        print("\n")
    if starring == []:
        print("Something wrong with starring " + url)
        print("\n")
    result = [movieName, starring, directedBy, releaseDates, plot]
    return result
#print("Movie: " + movieName + "\n" + "Starring: " + str(starring) + "\n" + "Directed by: " + directedBy + "\n" + "Release date: " + str(releaseDates) + "\n")
#print("\n")
#print(plot)
#scrapeWikiMovie("https://en.wikipedia.org/wiki/Chorus_(2015_film)")
#scrapeWikiMovie("https://en.wikipedia.org/wiki/Joker_(2019_film)")
#scrapeWikiMovie("https://en.wikipedia.org/wiki/Summertime_(2015_film)")
#scrapeWikiMovie("https://en.wikipedia.org/wiki/Pain_and_Glory")
#scrapeWikiMovie("https://en.wikipedia.org/wiki/Once_Upon_a_Time_in_Hollywood")
miniList = ["Joker_(2019_film)", "Summertime_(2015_film)", "Pain_and_Glory", "Once_Upon_a_Time_in_Hollywood"]  # small sample list, kept for quick tests
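
# Expected movienames.txt layout (an assumption inferred from the parsing
# below): a single line of semicolon-separated titles, e.g.
#   Joker (2019 film);Pain and Glory;Once Upon a Time in Hollywood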
with open('movienames.txt', 'r') as f:
    text = f.readline()
txtList = text.split(";")
movieList = []
for movie in txtList:
    movieList.append(movie.replace(" ", "_"))  # prepping for the URL
    if movie.replace(" ", "_") == "%0A":  # an encoded newline marks the end of the list
        break
movieList = movieList[:-1]  # drop the entry left over after the final separator
print(len(movieList))
movieList = [m for m in movieList if m != ""]  # removing items while iterating over a list skips entries, so filter instead
with open('movieData.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter='\t')
    for movie in movieList:
        lineTextList = scrapeWikiMovie("https://en.wikipedia.org/wiki/" + movie)
        writer.writerow(lineTextList)
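
# A minimal sketch for reading the rows back (assumes the same tab delimiter;
# the list-valued columns come back as their str() representations):
#   with open('movieData.csv', newline='') as file:
#       for row in csv.reader(file, delimiter='\t'):
#           movieName, starring, directedBy, releaseDates, plot = row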