#!/usr/bin/env python3
'''
Run the Google Maps popularity scraper.
'''
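
# Usage:
#   python scrape_gm.py [path/to/urls.csv]
# If no CSV path is given, it is read from config.URL_PATH_INPUT
# (or from config.URL_PATH_INPUT_TEST when config.DEBUG is set).
#
# A run writes three kinds of files:
#   logs/<run_time>.log           copy of the input URL list
#   data/<name>.<run_time>.csv    one delimited file per place
#   html/<name>.<run_time>.html   raw page source (when config.SAVE_HTML is set)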
import os
import sys
import urllib.parse
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# load local parameters from config.py
import config

# gmaps starts its weeks on Sunday
days = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
# unique runtime stamp for this job, shared by all of its output files
run_time = datetime.now().strftime('%Y%m%d_%H%M%S')
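
# For reference, a minimal sketch of the config.py this script expects.
# The attribute names are the ones read below; the values are illustrative only.
#
#   DEBUG = False
#   URL_PATH_INPUT = 'urls.csv'             # CSV whose first column holds gmaps URLs
#   URL_PATH_INPUT_TEST = 'urls_test.csv'   # used when DEBUG is True
#   DELIM = '\t'                            # output field delimiter
#   HEADER_COLUMNS = ['name', 'url', 'run_time', 'day', 'hour', 'popularity', 'popularity_now']
#   SLEEP_SEC = 30                          # max seconds to wait for the page to render
#   SAVE_HTML = True                        # also keep the raw page source under html/
#   CHROME_BINARY_LOCATION = '/usr/bin/google-chrome'
#   CHROMEDRIVER_BINARY_LOCATION = '/usr/local/bin/chromedriver'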
def main():
    # read the list of URLs from a URL or a path to a local CSV
    if not config.DEBUG:
        if len(sys.argv) > 1:
            # read the path from the command line
            urls = pd.read_csv(sys.argv[1])
        else:
            # get the path from config.py
            urls = pd.read_csv(config.URL_PATH_INPUT)
    else:
        # debugging case
        print('RUNNING TEST URLS...')
        urls = pd.read_csv(config.URL_PATH_INPUT_TEST)
    # make sure the output folders exist
    for folder in ('logs', 'data', 'html'):
        os.makedirs(folder, exist_ok=True)
    # log the URL list to remember the state of this run
    urls.to_csv('logs' + os.sep + run_time + '.log', index=False)
    # the first column of the CSV holds the Google Maps URLs; other columns are ignored
    url_list = urls.iloc[:, 0].tolist()
    for url in url_list:
        try:
            data = run_scraper(url)
        except Exception as e:
            print('ERROR:', url, run_time, e)
            # go to the next url
            continue
        if len(data) > 0:
            # valid data to be written
            file_name = make_file_name(url)
            with open('data' + os.sep + file_name + '.' + run_time + '.csv', 'w') as f:
                # write the header
                f.write(config.DELIM.join(config.HEADER_COLUMNS) + '\n')
                # write the data, with an empty string for missing values
                # (not `x or ''`, which would also blank out legitimate zeros)
                for row in data:
                    fields = (file_name, url, run_time) + tuple('' if x is None else str(x) for x in row)
                    f.write(config.DELIM.join(fields) + '\n')
            print('DONE:', url, run_time)
        else:
            print('WARNING: no data', url, run_time)
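
# With config.DELIM = '\t' (illustrative, as are the values), one data row in
# the output CSV would look like:
#   Brandenburger_Tor  https://www.google.com/maps/place/...  20200101_120000  Monday  14  51  30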
def run_scraper(u):
    # get the html source (headless Chrome via Selenium)
    html = get_html(u, 'html' + os.sep + make_file_name(u) + '.' + run_time + '.html')
    # parse the html (uses beautifulsoup4)
    data = parse_html(html)
    return data
def make_file_name(u):
    # generate a file name from the gmaps URL
    try:
        # long place URLs: the place name is the 6th path segment
        file_name = u.split('/')[5].split(',')[0]
    except IndexError:
        # the URL may be a short one without a place segment;
        # fall back to the last path segment
        file_name = u.split('/')[-1]
    return urllib.parse.unquote(file_name).replace('+', '_').replace('?', '_')
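
# Example (illustrative URL):
#   make_file_name('https://www.google.com/maps/place/Brandenburger+Tor/@52.51,13.37,17z')
# returns 'Brandenburger_Tor'.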
def get_html(u, file_name):
    # if the html source already exists as a local file, reuse it instead of scraping;
    # disabled by the `False` guard, flip it to re-parse previously saved pages
    if False and os.path.isfile(file_name):
        with open(file_name, 'r') as f:
            return f.read()
    # requires chromedriver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    # https://stackoverflow.com/a/55152213/2327328
    # German is used because times are shown in 24h format, which is simpler to parse
    options.add_argument('--lang=de-DE')
    options.binary_location = config.CHROME_BINARY_LOCATION
    # Selenium 4 style; with Selenium 3, pass the driver path as the first positional argument
    d = webdriver.Chrome(service=Service(config.CHROMEDRIVER_BINARY_LOCATION), options=options)
    # get the page
    d.get(u)
    # wait for the page to render, up to config.SLEEP_SEC seconds
    # based on https://stackoverflow.com/questions/26566799/wait-until-page-is-loaded-with-selenium-webdriver-for-python
    try:
        WebDriverWait(d, config.SLEEP_SEC).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'section-popular-times-bar')))
    except TimeoutException:
        print('ERROR: Timeout! (This could be due to missing "popular times" data, or not waiting long enough.)', u)
    html = d.page_source
    # optionally save the html to a local file
    if config.SAVE_HTML:
        with open(file_name, 'w') as f:
            f.write(html)
    d.quit()
    return html
def parse_html(html):
    soup = BeautifulSoup(html, features='html.parser')
    pops = soup.find_all('div', {'class': 'section-popular-times-bar'})
    hour = 0
    dow = 0
    data = []
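    # Each bar's aria-label encodes the busyness as text. Reconstructed from the
    # index arithmetic below (the exact wording is Google's and may change), a
    # German label looks roughly like:
    #   normal hour:  'Um 14 Uhr zu 51 % ausgelastet.'
    #                 -> token 1 is the hour, token 4 the usual busyness
    #   current hour: 'Derzeit zu 30 % ausgelastet, normal sind 51 %.'
    #                 -> contains 'normal'; token 2 is the live busyness,
    #                    the second-to-last token the usual busyness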
    for pop in pops:
        # note that the data is stored Sunday-first, regardless of the locale
        t = pop['aria-label']
        hour_prev = hour
        freq_now = None
        try:
            if 'normal' not in t:
                hour = int(t.split()[1])
                freq = int(t.split()[4])  # gmaps uses integer percentages
            else:
                # the current hour has special text;
                # its hour is the previous value + 1
                hour = hour + 1
                freq = int(t.split()[-2])
                # gmaps also gives the live popularity,
                # but only for the current hour
                try:
                    freq_now = int(t.split()[2])
                except (ValueError, IndexError):
                    freq_now = None
            if hour < hour_prev:
                # the hour decreasing means we rolled over into the next day
                dow += 1
            data.append([days[dow % 7], hour, freq, freq_now])
        except (ValueError, IndexError):
            # if a day is missing, its line(s) won't be parsable;
            # this can happen when the place is closed that day.
            # skip the line, hope it's only one day per line,
            # and increment the day counter
            dow += 1
    return data
if __name__ == '__main__':
main()