-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmsv.py
123 lines (107 loc) · 2.72 KB
/
msv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import bs4
from common import *
from dateutil import parser
import re
def get_track(meta):
    """Map a URL metadata token to a track name by its prefix.

    Prefixes are tried in order and the first one that *meta* starts with
    wins, so 'bhgp' must stay ahead of the shorter 'bh'.

    Returns the track name, or None when no prefix matches.
    """
    prefix_to_track = (
        ('bhgp', 'Brands Hatch Grand Prix'),
        ('gp', 'Brands Hatch Grand Prix'),
        ('bh', 'Brands Hatch Indy'),
        ('op', 'Oulton Park'),
        ('cp', 'Cadwell Park'),
        # maybe bad assumptions about the track used...
        ('sn', 'Snetterton 300'),
        ('dp', 'Donington Park Grand Prix'),
        ('ba', 'Bedford Autodrome SW'),
    )
    for prefix, track_name in prefix_to_track:
        if meta.startswith(prefix):
            return track_name
    return None
def get_kind(meta):
    """Map a URL metadata token to a Kind member by exact match.

    Returns the matching Kind, or None when the token has no kind. The
    tokens 'bsb' and 'am' are known but intentionally map to nothing, so
    they return None just like an unrecognised token.
    """
    if meta in ('n', 'nov'):
        return Kind.NOVICE
    if meta in ('e', 'eve'):
        return Kind.EVENING
    if meta == 'opl':
        return Kind.OPL
    if meta == 'rbo':
        return Kind.RBO
    if meta in ('q', '3gp', '4grp'):
        return Kind.NORMAL
    # 'bsb', 'am' and anything unknown: no kind.
    return None
def is_int(string):
    """Return True when int() accepts *string*, False otherwise.

    Values that int() rejects with TypeError (such as None) are reported
    as False rather than letting the exception escape.
    """
    try:
        int(string)
    except (TypeError, ValueError):
        return False
    return True
def parse(elem):
    """Turn one calendar table row into an event dict, or None to skip it.

    Everything is read from the row's first link. Its href must contain
    ``<yyyy>/<month>/<day><metadata>/``, where the metadata is a
    '-'-separated list of tokens naming the track (e.g. 'sn') and,
    optionally, the kind of day (e.g. 'n').

    NOTE(review): earlier notes described hrefs of the form
    ``/bike/calendar/2017/apr/15-sn-n.aspx``. The pattern below requires a
    '/' after the metadata and does not match that form -- confirm against
    the live site.

    Args:
        elem: a row element exposing ``.a`` (its first link, or a falsy
            value when it has none); presumably a bs4 Tag for a <tr>.

    Returns:
        A dict with keys 'company', 'date', 'track', 'kind', 'desc' and
        'url'; or None when the row has no link or href, the href does not
        match, the date is invalid, a metadata token is unsupported, or no
        track token was found.
    """
    link = elem.a
    if not link:
        return None

    # A link without an href is skipped instead of raising KeyError, which
    # would abort the whole scrape.
    url = link.get('href')
    if url is None:
        return None

    match = re.search(r'(\d\d\d\d/.*/\d\d?)(.*)\/', url)
    if not match:
        print('FAILED TO MATCH  ' + url)
        return None

    date_text, meta_text = match.groups()
    try:
        date = parser.parse(date_text)
    except ValueError:
        print('Skipping invalid date {}; url = {}'.format(date_text, url))
        return None

    # The description is the first child of the table cell preceding the
    # link's parent.
    desc = link.parent.find_previous('td').contents[0]

    event = {
        'company': Company.MSV,
        'date': date,
        'track': None,
        'kind': Kind.NORMAL,
        'desc': desc,
        'url': 'http://www.msvtrackdays.com' + url,
    }

    for token in meta_text.strip('-').split('-'):
        if is_int(token):
            print('Ignoring number in metadata: {!r}; url {}'.format(token, url))
            continue

        name = token.strip()

        # Kinds match exactly while tracks match by prefix, so kinds are
        # tried first: otherwise the kind token 'opl' would be taken for
        # the 'op' track prefix and overwrite the real track.
        kind = get_kind(name)
        if kind:
            event['kind'] = kind
            continue

        track = get_track(name)
        if track:
            event['track'] = track
            continue

        print('Metadata unsupported: {!r}; url {}'.format(token, url))
        return None

    if not event['track']:
        print('No track for url ' + url)
        return None
    return event
def scrape():
    """Fetch the MSV bike calendar and return the events parsed from it.

    Every <tr> of the page is handed to parse(); rows for which parse()
    yields a falsy result are dropped.

    NOTE(review): 'msv.html' is presumably a local cache name used by
    fetch_soup -- confirm in the common module.
    """
    page = fetch_soup('msv.html',
                      'http://www.msvtrackdays.com/bike/calendar.aspx')
    parsed_rows = [parse(row) for row in page.find_all('tr')]
    return [event for event in parsed_rows if event]
if __name__ == '__main__':
    # Script entry point: print each scraped event on its own line.
    for event in scrape():
        print(event)