-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathgithub-searcher.py
451 lines (373 loc) · 16.2 KB
/
github-searcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
#-------------------------------------------------------------------------------
# MIT License
#
# Copyright (c) 2019 Michael Schröder, Jürgen Cito
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#-------------------------------------------------------------------------------
# This script implements a tool that exhaustively samples GitHub Code Search
# results. It is written in a semi-literal style: it should be possible to read
# through the source in a linear fashion, more or less. Enjoy.
import os, sys, argparse, shutil, time, signal
import base64, sqlite3, csv
import requests
# Before we get to the fun stuff, we need to parse and validate arguments, check
# environment variables, set up the help text and so on.
# fix for argparse: ensure terminal width is determined correctly
os.environ['COLUMNS'] = str(shutil.get_terminal_size().columns)
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description='''Exhaustively sample the GitHub Code Search API.''')
parser.add_argument('query', metavar='QUERY', help='search query')
parser.add_argument('--database', metavar='FILE', default='results.db',
help='search results database file (default: results.db)')
parser.add_argument('--statistics', metavar='FILE', default='sampling.csv',
help='sampling statistics file (default: sampling.csv)')
parser.add_argument('--min-size', metavar='BYTES', type=int, default=1,
help='minimum code file size (default: 1)')
# Only files smaller than 384 KB are searchable via the GitHub API.
MAX_FILE_SIZE = 393216
parser.add_argument('--max-size', metavar='BYTES', type=int,
default=MAX_FILE_SIZE,
help=f'maximum code file size (default: {MAX_FILE_SIZE})')
parser.add_argument('--stratum-size', metavar='BYTES', type=int, default=1,
help='''length of file size ranges into which population is partitioned
(default: 1)''')
parser.add_argument('--no-throttle', dest='throttle', action='store_false',
help='disable request throttling')
parser.add_argument('--github-token', metavar='TOKEN',
default=os.environ.get('GITHUB_TOKEN'),
help='''personal access token for GitHub
(by default, the environment variable GITHUB_TOKEN is used)''')
args = parser.parse_args()
if args.min_size < 1:
sys.exit('min-size must be positive')
if args.min_size >= args.max_size:
sys.exit('min-size must be less than or equal to max-size')
if args.max_size < 1:
sys.exit('max-size must be positive')
if args.max_size > MAX_FILE_SIZE:
sys.exit(f'max-size must be less than or equal to {MAX_FILE_SIZE}')
if args.stratum_size < 1:
sys.exit('stratum-size must be positive')
if not args.github_token:
sys.exit('missing environment variable GITHUB_TOKEN')
#-------------------------------------------------------------------------------
# The GitHub Code Search API is limited to 1000 results per query. To get around
# this limitation, we can take advantage of the ability to restrict searches to
# files of a certain size. By repeatedly searching with the same query but
# different size ranges, we can reach a pretty good sample of the overall
# population. This is a technique known as *stratified sampling*. The strata in
# our case are non-overlapping file size ranges.
# Let's start with some global definitions. We need to keep track of the first
# and last size in the current stratum...
strat_first = args.min_size
strat_last = min(args.min_size + args.stratum_size - 1, args.max_size)
# ...as well as the current stratum's population and the amount of files sampled
# so far (in the current stratum). A value of -1 indicates "unknown".
pop = -1
sam = -1
# We also have an estimate of the overall population (although it's gonna be
# very unreliable), and we keep track of the total (cumulative) sample size so
# far.
est_pop = -1
total_sam = -1
#-------------------------------------------------------------------------------
# During the search we want to display a table of all the strata sampled so far,
# plus the stratum currently being sampled, some summary information, and a
# status message. These last three items will be continuously updated to signal
# the progress that's being made.
# First, let's just print the table header.
print(' ┌────────────┬────────────┐')
print(' │ population │ sample │')
print(' ├────────────┼────────────┤')
# Now we define some functions to print information about the current stratum.
# By default, this will simply add a new line to the output. However, to be able
# to show live progress, there is also an option to overwrite the current line.
def print_stratum(overwrite=False):
if overwrite:
sys.stdout.write('\033[F\r\033[J')
if strat_first == strat_last:
size = '%d' % strat_first
else:
size = '%d .. %d' % (strat_first, strat_last)
pop_str = str(pop) if pop > -1 else ''
sam_str = str(sam) if sam > -1 else ''
per = '%6.2f%%' % (sam/pop*100) if pop > 0 else ''
print('%16s │ %10s │ %10s │ %6s' % (size, pop_str, sam_str, per))
# Another function will print the footer of the table, including summary
# statistics and the status message. Here we provide a separate function to
# clear the footer again.
status_msg = ''
def print_footer():
if args.min_size == args.max_size:
size = '%d' % args.min_size
else:
size = '%d .. %d' % (args.min_size, args.max_size)
pop_str = str(est_pop) if est_pop > -1 else ''
sam_str = str(total_sam) if total_sam > -1 else ''
per = '%6.2f%%' % (total_sam/est_pop*100) if est_pop > 0 else ''
print(' ├────────────┼────────────┤')
print(' │ population │ sample │')
print(' └────────────┴────────────┘')
print('%16s %10s %10s %6s' % (size, pop_str, sam_str, per))
print(' (estimated)') if est_pop > -1 else print()
print()
print(status_msg)
def clear_footer():
sys.stdout.write(f'\033[7F\r\033[J')
# For convenience, we also have function for just updating the status message.
# It returns the old message so it can be restored later if desired.
def update_status(msg):
global status_msg
old_msg = status_msg
status_msg = msg
sys.stdout.write('\033[F\r\033[J')
print(status_msg)
return old_msg
#-------------------------------------------------------------------------------
# To access the the GitHub API, we define a little helper function that makes an
# authorized GET request and throttles the number of requests per second so as
# not to run afoul of GitHub's rate limiting. Should a rate limiting error occur
# nonetheless, the function waits the appropriate amount of time before
# automatically retrying the request.
def get(url, params={}):
if args.throttle:
time.sleep(0.72) # throttle requests to ~5000 per hour
res = requests.get(url, params, headers=
{'Authorization': f'token {args.github_token}'})
if res.status_code == 403:
return handle_rate_limit_error(res)
else:
res.raise_for_status()
return res
def handle_rate_limit_error(res):
t = res.headers.get('X-RateLimit-Reset')
if t is not None:
t = max(0, int(int(t) - time.time()))
else:
t = int(res.headers.get('Retry-After', 60))
err_msg = f'Exceeded rate limit. Retrying after {t} seconds...'
old_msg = update_status(err_msg)
time.sleep(t)
update_status(old_msg)
return get(res.url)
# We also define a convenient function to do the code search for a specific
# stratum. Note that we sort the search results by how recently a file has been
# indexed by GitHub.
def search(a,b,order='asc'):
return get('https://api.github.com/search/code',
params={'q': f'{args.query} size:{a}..{b}',
'sort': 'indexed', 'order': order, 'per_page': 100})
# To download all files returned by a code search (up to the limit of 1000
# imposed by GitHub), we need to deal with pagination. On each page, we download
# all files and add them and their metadata to our results database (which will
# be set up in the next section), provided they're not already in the database
# (which can happen when continuing a previous search). Also, if any of the
# files can not be downloaded, for whatever reason, they are simply skipped over
# and count as not sampled.
def download_all_files(res):
global pop
download_files_from_page(res)
while 'next' in res.links:
update_status('Getting next page of search results...')
res = get(res.links['next']['url'])
pop2 = res.json()['total_count']
pop = max(pop,pop2)
download_files_from_page(res)
if sam >= pop:
break
update_status('')
def download_files_from_page(res):
global sam, total_sam
update_status('Downloading files...')
for item in res.json()['items']:
if not known_file(item):
repo = item['repository']
insert_repo(repo)
try:
url = item['url'].replace('#', '%23')
file = get(url).json()
except:
continue
if file['type'] == 'file':
insert_file(file, repo['id'])
sam += 1
total_sam += 1
clear_footer()
print_stratum(overwrite=True)
print_footer()
if sam >= pop:
return
#-------------------------------------------------------------------------------
# This is a good place to open the connection to the results database, or create
# one if it doesn't exist yet. The database schema follows the GitHub API
# response schema. Our 'insert_repo' and 'insert_file' functions directly take a
# JSON response dictionary.
db = sqlite3.connect(args.database)
db.executescript('''
CREATE TABLE IF NOT EXISTS repo
( repo_id INTEGER PRIMARY KEY
, name TEXT NOT NULL
, full_name TEXT NOT NULL
, description TEXT
, url TEXT NOT NULL
, fork INTEGER NOT NULL
, owner_id INTEGER NOT NULL
, owner_login TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS file
( file_id INTEGER PRIMARY KEY
, name TEXT NOT NULL
, path TEXT NOT NULL
, size INTEGER NOT NULL
, sha TEXT NOT NULL
, content TEXT NOT NULL
, repo_id INTEGER NOT NULL
, FOREIGN KEY (repo_id) REFERENCES repo(repo_id)
, UNIQUE(path,repo_id)
);
''')
def insert_repo(repo):
db.execute('''
INSERT OR IGNORE INTO repo
( repo_id, name, full_name, description, url, fork
, owner_id, owner_login
)
VALUES (?,?,?,?,?,?,?,?)
''',
( repo['id']
, repo['name']
, repo['full_name']
, repo['description']
, repo['url']
, int(repo['fork'])
, repo['owner']['id']
, repo['owner']['login']
))
db.commit()
def insert_file(file,repo_id):
db.execute('''
INSERT OR IGNORE INTO file
(name, path, size, sha, content, repo_id)
VALUES (?,?,?,?,?,?)
''',
( file['name']
, file['path']
, file['size']
, file['sha']
, base64.b64decode(file['content']).decode('UTF-8')
, repo_id
))
db.commit()
def known_file(item):
cur = db.execute("select count(*) from file where path = ? and repo_id = ?",
(item['path'], item['repository']['id']))
return cur.fetchone()[0] == 1
#-------------------------------------------------------------------------------
# Now we can finally get into it!
# First, let's get an estimate of the total population. Note that this is a
# very, very unstable number that can not be relied upon!
status_msg = 'Getting an estimate of the overall population...'
print_footer()
res = search(args.min_size, args.max_size)
est_pop = int(res.json()['total_count'])
total_sam = 0
# Before starting the iterative search process, let's see if we have a sampling
# statistics file that we could use to continue a previous search. If so, let's
# get our data structures and UI up-to-date; otherwise, create a new statistics
# file.
if os.path.isfile(args.statistics):
update_status('Continuing previous search...')
with open(args.statistics, 'r') as f:
fr = csv.reader(f)
next(fr) # skip header
for row in fr:
strat_first = int(row[0])
strat_last = int(row[1])
pop = int(row[2])
sam = int(row[3])
total_sam += sam
clear_footer()
print_stratum()
print_footer()
if pop > -1:
strat_first += args.stratum_size
strat_last = min(strat_last + args.stratum_size, args.max_size)
pop = -1
sam = -1
else:
with open(args.statistics, 'w') as f:
f.write('stratum_first,stratum_last,population,sample\n')
statsfile = open(args.statistics, 'a', newline='')
stats = csv.writer(statsfile)
#-------------------------------------------------------------------------------
# Let's also quickly define a signal handler to cleanly deal with Ctrl-C. If the
# user quits the program and cancels the search, we want to allow him to later
# continue more-or-less where he left of. So we need to properly close the
# database and statistic file.
def signal_handler(sig,frame):
db.commit()
db.close()
statsfile.flush()
statsfile.close()
sys.exit(0)
signal.signal(signal.SIGINT, signal_handler)
#-------------------------------------------------------------------------------
clear_footer()
print_stratum()
print_footer()
# Iterating through all the strata, we want to sample as much as we can.
while strat_first <= args.max_size:
update_status('Searching...')
res = search(strat_first, strat_last)
pop = int(res.json()['total_count'])
sam = 0
clear_footer()
print_stratum(overwrite=True)
print_footer()
download_all_files(res)
# To stretch the 1000-results-per-query limit, we can simply repeat the
# search with the sort order reversed, thus sampling the stratum population
# from both ends, so to speak. This gives us a maximum sample size of 2000
# per stratum.
if pop > 1000:
update_status('Repeating search with reverse sort order...')
res = search(strat_first, strat_last, order='desc')
# Due to the instability of search results, we might get a different
# population count on the second query. We will take the maximum of the
# two population counts for this stratum as a conservative estimate.
pop2 = int(res.json()['total_count'])
pop = max(pop,pop2)
clear_footer()
print_stratum(overwrite=True)
print_footer()
download_all_files(res)
# After we've sampled as much as we could of the current strata, commit it
# to the table and move on to the next one.
stats.writerow([strat_first,strat_last,pop,sam])
statsfile.flush()
strat_first += args.stratum_size
strat_last = min(strat_last + args.stratum_size, args.max_size)
pop = -1
sam = -1
clear_footer()
print_stratum()
print_footer()
update_status('Done.')