-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprediction.py
executable file
·144 lines (114 loc) · 4.59 KB
/
prediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python
"""
Print out the name of the best candidate based on historical data of
previous candidates.
The past candidates have a name, a list of tags and an interview count.
The new candidates have the same structure, but no interview count.
The interview count will be used to generate a score for each combination
of tags, which is then used to rank the new candidates, and the candidate with
the highest rank is output.
Running this script with `python prediction.py` will output the best choice.
Running this script with `python prediction.py -v` will output all existing
and new candidates ranked by score, and the best choice.
Since the candidates only have a name and tags, my first thought was to get an
average score for each individual tag based on the interview count, but several
of the past candidates were out of order when ranked by score.
Calculating an average score for each combination of tags based on the interview
count orders the past candidates very well, only two consecutive past
candidates are transposed.
"""
import json
import sys
from collections import defaultdict
from itertools import chain, combinations
def main(verbose):
past_candidates = get_candidates_from_files([
'past_candidates1.json',
'past_candidates2.json',
'past_candidates3.json'])
scores = calculate_tag_combination_scores(past_candidates)
if verbose:
update_candidate_scores_and_sort(past_candidates, scores)
print_candidates(past_candidates)
# the README said the file would be named future_candidates.json,
# but it appears to be named new_candidates.json
new_candidates = get_candidates_from_files([
'new_candidates.json'])
update_candidate_scores_and_sort(new_candidates, scores)
if verbose:
print_candidates(new_candidates)
the_winner = new_candidates[0].name
print 'The candidate with the most interviews should be... ' + \
the_winner + '\n'
class Candidate(object):
'''
Simple class to pull out some fields for easier access.
'''
def __init__(self, jsonobj):
self.name = '{} {}'.format(jsonobj['User']['first_name'],
jsonobj['User']['last_name'])
self.tags = jsonobj['Tag']
self.interview_count = int(jsonobj['User'].get('interview_count', 0))
self.score = 0
def __repr__(self):
return '{:20} {:10} {:5.2f} {}'.format(
self.name, self.interview_count, self.score,
', '.join(self.tags))
def get_candidates_from_files(filenames):
'''
Load json files and return a single list of candidates.
'''
candidates = []
for filename in filenames:
with open(filename) as fp:
data = json.loads(fp.read())
candidates += [Candidate(c) for c in data]
return candidates
def print_candidates(candidates):
print '{:20} {:10} {:5} {}'.format(
'Name', 'Interviews', 'Score', 'Tags')
print '-' * 79
for candidate in candidates:
print candidate
print ''
def all_combinations(options):
'''
Return an iterable of all (ordered) combinations of the options.
For example
all_combinations([1, 2, 3]) returns:
((1,), (2,), (3,),
(1, 2), (1, 3), (2, 3),
(1, 2, 3))
'''
return chain(
*[combinations(options, x) for x in range(1, len(options)+1)]
)
def calculate_tag_combination_scores(candidates):
'''
Calculates a score for each combination of a candidate's tags based on the
average interview_count for the combination.
Returns a dict of {tag combination: score}
'''
tag_combo_interview_counts = defaultdict(list)
for candidate in candidates:
for tag in all_combinations(candidate.tags):
tag_combo_interview_counts[tag].append(candidate.interview_count)
scores = {tag_combo: sum(counts) * 1.0 / len(counts)
for tag_combo, counts in tag_combo_interview_counts.iteritems()}
return scores
def update_candidate_scores_and_sort(candidates, scores):
'''
Updates each candidate's score attribute for each combination of
the candidate's tags.
Then sort the list of candidates in place,
ordered by score and then interview count (largest first).
'''
for candidate in candidates:
for tag in all_combinations(candidate.tags):
candidate.score += scores.get(tag, 0)
candidates.sort(
key=lambda candidate: (candidate.score, candidate.interview_count),
reverse=True)
if __name__ == '__main__':
verbose_flag = sys.argv[-1] == '-v'
main(verbose_flag)