-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_ncaa.py
261 lines (224 loc) · 8.87 KB
/
scrape_ncaa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
#! /usr/bin/env python
'''
Scrape the ESPN website to get all NCAA Men's Basketball rankings from week 1
to the current week. Produce diagnostic plots of rankings vs. time.
Usage:
python scrape_ncaa.py
'''
__author__ = "Jo Taylor"
__date__ = "03-01-2017"
__maintainer__ = "Jo Taylor"
__email__ = "[email protected]"
import matplotlib.pyplot as pl
pl.style.use("ggplot")
import numpy as np
from bs4 import BeautifulSoup
import requests
#-----------------------------------------------------------------------------#
#-----------------------------------------------------------------------------#
def _fetch_rankings_soup(url):
    '''
    Download a rankings page and parse it with Python's built-in HTML parser.

    Parameters:
    -----------
        url : str
            Address of the page to download.

    Returns:
    --------
        soup : bs4.BeautifulSoup
            Parsed HTML of the page.
    '''
    # Without a timeout a stalled connection would hang the scrape forever.
    response = requests.get(url, timeout=30)
    # Always name the parser so every page is parsed the same way.
    return BeautifulSoup(response.text, "html.parser")


def _parse_poll_table(table):
    '''
    Extract the poll name and the ranking columns from one HTML table.

    Parameters:
    -----------
        table : bs4.element.Tag
            A <table> element taken from a rankings page.

    Returns:
    --------
        poll : str or None
            "ap" or "usa"; None when the first row of the table does not
            name one of those two polls.
        columns : dictionary
            Lists keyed by "rank", "team" and "record", one entry per data
            row. Empty when the table holds no data rows.
    '''
    poll_names = {"AP Top 25": "ap",
                  "USA Today Coaches Poll": "usa"}
    keys = ("rank", "team", "record")

    rows = table.find_all("tr")
    if not rows:
        return None, {}

    # The first row contains the poll information.
    title_cells = rows[0].find_all("td")
    if not title_cells:
        return None, {}
    poll = poll_names.get(title_cells[0].get_text())
    if poll is None:
        # Unknown table: skip it instead of filing its rows under whichever
        # poll happened to be parsed previously.
        return None, {}

    columns = {}
    # The second row contains the column names, which we don't want.
    for row in rows[2:]:
        cells = row.find_all("td")
        if not cells:
            continue

        if "No rankings available" in cells[0].get_text():
            # Placeholder entry so the week still shows up for this poll.
            for key in keys:
                columns.setdefault(key, []).append(0)
            continue

        if len(cells) < len(keys):
            # Not a rank/team/record row; nothing usable to read.
            continue

        # Need to do split and strip on result to ensure you get the full
        # team name (e.g. Notre Dame) without any trailing "(...)" text.
        values = [cell.get_text().split("(")[0].strip()
                  for cell in cells[:len(keys)]]
        values[0] = int(values[0])
        for key, value in zip(keys, values):
            columns.setdefault(key, []).append(value)

    return poll, columns


def parse_espn():
    '''
    Get the HTML source code from ESPN and gather the rankings from the AP
    Top 25 and USA Today Coaches Poll, from the current week back to week 1.

    Parameters:
    -----------
        None

    Returns:
    --------
        espn_dict : dictionary
            Dictionary describing rankings as a function of poll, laid out as
            espn_dict[poll][week][key] with poll in ("ap", "usa"), week an
            integer and key in ("rank", "team", "record"). Each key maps to
            a list with one entry per table row. A week for which no table
            was found is left as an empty dictionary.
        season : str
            Year of season of interest (first word of the page header).

    Raises:
    -------
        ValueError
            If the page has no usable <h1> header, or the header does not
            contain a week number after the word "Week".
    '''
    # The current rankings should always be at this URL.
    url = "http://www.espn.com/mens-college-basketball/rankings"
    soup = _fetch_rankings_soup(url)

    # Initialize dictionary.
    espn_dict = {"ap": {},
                 "usa": {}}

    # Get the header of the webpage, which describes the current year (season)
    h1 = soup.find("h1")
    header = h1.get_text() if h1 is not None else ""
    words = header.split()
    if not words:
        raise ValueError("No <h1> header found at {0}".format(url))
    season = words[0]

    # Preseason = Week 1. The week is kept as an integer so that it can be
    # compared with the loop variable below.
    if "Preseason" in header:
        final_week = 1
    elif "Postseason" in header:
        final_week = 18
    else:
        final_week = int(words[words.index("Week") + 1])

    # Work backwards from the final week to week 1
    for week in range(final_week, 0, -1):
        espn_dict["ap"][week] = {}
        espn_dict["usa"][week] = {}

        if week != final_week:
            # For all weeks not the one we started at, change the URL
            # and get HTML text
            url = "http://www.espn.com/mens-college-basketball/rankings/_/year/{0}/week/{1}/".format(season, week)
            soup = _fetch_rankings_soup(url)

        # Get the HTML tables (there should be 2, one for each poll).
        for table in soup.find_all("table"):
            poll, columns = _parse_poll_table(table)
            if poll is None:
                continue
            for key, values in columns.items():
                espn_dict[poll][week].setdefault(key, []).extend(values)

    return espn_dict, season
#-----------------------------------------------------------------------------#
#-----------------------------------------------------------------------------#
def plot_rank_v_week(team_dict, season, save):
    '''
    Plot the rank vs. week for each team that is ranked at least once during
    the season. One figure is made per team found in the AP poll.

    Parameters:
    -----------
        team_dict : dictionary
            Dictionary describing teams as a function of rank. Read as
            team_dict[poll][team]["weeks"] and team_dict[poll][team]["ranks"]
            with poll being "ap" or "usa".
        season : str
            Year of season of interest.
        save : Bool
            Switch to save figure. When false the figure is shown and the
            function waits for the user to press enter.

    Returns:
    --------
        None
    '''
    for team in team_dict["ap"]:
        fig, ax = pl.subplots(figsize=(9, 6))

        ap_ranks = team_dict["ap"][team]["ranks"]
        ap_weeks = team_dict["ap"][team]["weeks"]
        ax.plot(ap_weeks, ap_ranks, "o-", color="royalblue", label="AP")

        # The AP and USA polls differ sometimes, so check if team is ranked in
        # both polls.
        try:
            usa_ranks = team_dict["usa"][team]["ranks"]
            usa_weeks = team_dict["usa"][team]["weeks"]
        except KeyError:
            # Not in the USA poll: only the AP curve is drawn.
            pass
        else:
            ax.plot(usa_weeks, usa_ranks, "o-", color="mediumturquoise", label="USA")

        # Reversed y limits put rank 1 at the top of the plot.
        ax.set_ylim(26, -1)
        ax.set_xlim(0, 19)
        ax.legend(loc="best")
        ax.set_xlabel("Week")
        ax.set_ylabel("Rank")
        ax.set_title("{0} {1} Rankings".format(season, team))

        if save:
            # str() because parse_espn stores the integer 0 as the team name
            # when a poll reports "No rankings available".
            figname = "{}_{}_rank_v_time.png".format(season, str(team).replace(" ", ""))
            fig.savefig(figname, bbox_inches="tight", dpi=200)
            print("Saved {0}".format(figname))
        else:
            fig.show()
            input("Press enter to continue")

        # Close rather than just clear the figure: a cleared figure stays
        # registered with pyplot, so one would pile up for every team.
        pl.close(fig)
#-----------------------------------------------------------------------------#
#-----------------------------------------------------------------------------#
def compile_team_info(espn_dict):
    '''
    Take the output from parse_espn, and create a new dictionary that lists,
    for each poll, the weeks in which every team appeared and the rank it held
    in each of those weeks.

    E.g. {"ap": {"Indiana": {"weeks": [3, 2, 1], "ranks": [1, 1, 1]},
                 "Purdue": {"weeks": [1], "ranks": [25]}, ...},
          "usa": ...}

    Only weeks in which a team is listed are recorded; nothing is filled in
    for the weeks in which it is absent. Weeks are stored in the order in
    which they are iterated in espn_dict.

    Parameters:
    -----------
        espn_dict : dictionary
            Dictionary describing rankings as a function of poll, read as
            espn_dict[poll][week]["team"] and espn_dict[poll][week]["rank"].

    Returns:
    --------
        team_dict : dictionary
            Dictionary describing teams as a function of rank:
            team_dict[poll][team] is a defaultdict(list) with the keys
            "weeks" and "ranks".
    '''
    from collections import defaultdict

    # Initialize dictionary.
    team_dict = {}

    for poll, weekly in espn_dict.items():
        # Create dictionary for each poll (ap vs usa).
        per_team = {}
        team_dict[poll] = per_team

        # Loop over each week in espn_dict and get all teams that were ranked.
        for week, columns in weekly.items():
            # A week with no data for this poll is an empty dictionary;
            # treat it as "nobody ranked" instead of raising KeyError.
            teams = columns.get("team", [])
            ranks = columns.get("rank", [])

            for team, rank in zip(teams, ranks):
                # First time this team is seen in this poll.
                if team not in per_team:
                    per_team[team] = defaultdict(list)
                per_team[team]["weeks"].append(int(week))
                per_team[team]["ranks"].append(rank)

    return team_dict
#-----------------------------------------------------------------------------#
#-----------------------------------------------------------------------------#
def print_indiana():
    '''
    Print the words of "Indiana, Our Indiana" followed by a text-art emblem.

    Parameters:
    -----------
        None

    Returns:
    --------
        None
    '''
    lyrics = (
        "Indiana, Our Indiana",
        "Indiana, we're all for you",
        "We will fight for",
        "the Cream & Crimson",
        "For the glory of old IU",
        "Never daunted, we cannot falter",
        "In a battle, we're tried and true",
        "Indiana, Our Indiana",
        "Indiana, we're all for you!",
    )

    # Rows of the emblem. The backslash is written as an explicit "\\":
    # a bare backslash followed by a space is an invalid escape sequence.
    art_rows = (
        "______",
        "|_ _|",
        "______ | | ______",
        "|_ _| | | |_ _|",
        "| | | | | |",
        "| | | | | |",
        "| | | | | |",
        "| | | | | |",
        "| | | | | |",
        "| |___| |___| |",
        "\\ /",
        "-----| |-----",
        "_| |_",
        "|______|",
    )
    # The emblem is framed by one leading and one trailing newline.
    template = "\n" + "\n".join(art_rows) + "\n"

    for line in lyrics:
        print(line)
    print(template)
#-----------------------------------------------------------------------------#
#-----------------------------------------------------------------------------#
if __name__ == "__main__":
    # Separator line that frames the console output of a run.
    LINEOUT = "#-----------------------------------------------------------------------------#"
    banner = "\n".join([LINEOUT, " HOO HOO HOO HOOSIERS!", LINEOUT])
    print(banner)

    # Scrape the rankings, regroup them by team, then save one figure per team.
    espn_dict, season = parse_espn()
    team_dict = compile_team_info(espn_dict)
    plot_rank_v_week(team_dict, season, True)

    print(LINEOUT)
    #print_indiana()