"""Originality Checking."""
# System
import logging
import functools
from threading import Lock

# Third Party
from scipy.stats import ks_2samp
from scipy.stats import pearsonr
import numpy as np
import pandas as pd
from bson.objectid import ObjectId

lock = Lock()


@functools.lru_cache(maxsize=512)
def get_submission(db_manager, filemanager, submission_id):
    """Gets the submission file from S3.

    Parameters:
    -----------
    db_manager : DatabaseManager
        MongoDB data access object that has read and write functions to NoSQL DB
    filemanager : FileManager
        S3 Bucket data access object for querying competition datasets
    submission_id : string
        The ID of the submission

    Returns:
    --------
    submission : ndarray
        Array of the submission probabilities sorted by ID
    """
    if not submission_id:
        return None
    s3_filename = db_manager.get_filename(submission_id)
    try:
        local_files = filemanager.download([s3_filename])
        if len(local_files) != 1:
            logging.getLogger().info("Error looking for submission {}, found files {}".format(submission_id, local_files))
            return None
        local_file = local_files[0]
    except Exception as e:
        logging.getLogger().info("Could not get submission {}: {}".format(submission_id, e))
        return None
    df = pd.read_csv(local_file)
    df.sort_values("id", inplace=True)
    df = df["probability"]
    return df.to_numpy()


def original(submission1, submission2, threshold=0.05):
    """Determines if two submissions are original with respect to each other.

    Parameters:
    ----------
    submission1, submission2 : 1-D ndarrays
        Submission arrays that will be used in the Kolmogorov-Smirnov statistic
    threshold : float, optional, default: 0.05
        Threshold that the originality_score must exceed for the pair to be considered "original"

    Returns:
    --------
    original : bool
        Boolean value that indicates if a submission is original
    """
    score = originality_score(submission1, submission2)
    return score > threshold


# this function is taken from scipy (ks_2samp) and modified and so falls
# under their BSD license
def originality_score(data1, data2):
    """
    Computes the Kolmogorov-Smirnov statistic on 2 samples.

    This is a two-sided test for the null hypothesis that 2 independent samples
    are drawn from the same continuous distribution.

    Warning: data1 is assumed sorted in ascending order.

    Parameters
    ----------
    data1, data2 : ndarray
        Two arrays of sample observations assumed to be drawn from a
        continuous distribution. Arrays must be of the same size. data1 is
        assumed sorted in ascending order.

    Returns
    -------
    statistic : float
        KS statistic
    """
    # data1 is assumed sorted in ascending order
    data2 = np.sort(data2)
    n1 = data1.shape[0]
    n2 = data2.shape[0]
    if n1 != n2:
        raise ValueError("`data1` and `data2` must have the same length")
    # the following commented out line is slower than the two after it
    # cdf1 = np.searchsorted(data1, data_all, side='right') / (1.0*n1)
    cdf1 = np.searchsorted(data1, data2, side='right')
    cdf1 = np.concatenate((np.arange(n1) + 1, cdf1)) / (1.0*n1)
    # the following commented out line is slower than the two after it
    # cdf2 = np.searchsorted(data2, data_all, side='right') / (1.0*n2)
    cdf2 = np.searchsorted(data2, data1, side='right')
    cdf2 = np.concatenate((cdf2, np.arange(n1) + 1)) / (1.0*n2)
    d = np.max(np.absolute(cdf1 - cdf2))
    return d
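
# Worked example (illustrative, not from the original source): both inputs are
# compared through their sorted order, so identical predictions submitted in a
# different row order still score 0.0. For data1 = [0.1, 0.2, 0.3] (pre-sorted)
# and data2 = [0.3, 0.1, 0.2], the empirical CDFs coincide at every point,
# originality_score returns 0.0, and such a pair is treated as an exact duplicate.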


def is_almost_unique(submission_data, submission, db_manager, filemanager, is_exact_dupe_thresh, is_similar_thresh, max_similar_models):
    """Determines whether a submission is an exact duplicate of, or too similar to, other submissions in the competition round.

    Parameters:
    ----------
    submission_data : dictionary
        Submission metadata containing the submission_id and the user associated with the submission
    submission : ndarray
        Submission data that contains the probabilities for the competition data
    db_manager : DatabaseManager
        MongoDB data access object that has read and write functions to NoSQL DB
    filemanager : FileManager
        S3 Bucket data access object for querying competition datasets
    is_exact_dupe_thresh : float
        Threshold below which a submission is considered an exact duplicate of another submission
    is_similar_thresh : float
        Threshold below which a submission is considered similar enough to count against its originality
    max_similar_models : int
        The maximum number of models that a submission is allowed to be similar to

    Returns:
    --------
    bool
        Whether the submission data is considered to be original or not
    """
    num_similar_models = 0
    is_original = True
    similar_models = []
    is_not_a_constant = np.std(submission) > 0
    date_created = db_manager.get_date_created(submission_data['submission_id'])
    sorted_submission = np.sort(submission)
    for user_sub in db_manager.get_everyone_elses_recent_submssions(submission_data['competition_id'], submission_data['user'], date_created):
        with lock:
            other_submission = get_submission(db_manager, filemanager, user_sub["submission_id"])
        if other_submission is None:
            continue
        score = originality_score(sorted_submission, other_submission)
        if is_not_a_constant and np.std(other_submission) > 0:
            correlation = pearsonr(submission, other_submission)[0]
            if np.abs(correlation) > 0.95:
                logging.getLogger().info("Found a highly correlated submission {} with score {}".format(user_sub["submission_id"], correlation))
                is_original = False
                break
        if score < is_exact_dupe_thresh:
            logging.getLogger().info("Found a duplicate submission {} with score {}".format(user_sub["submission_id"], score))
            is_original = False
            break
        if score <= is_similar_thresh:
            num_similar_models += 1
            similar_models.append(user_sub["submission_id"])
            if num_similar_models >= max_similar_models:
                logging.getLogger().info("Found too many similar models. Similar models were {}".format(similar_models))
                is_original = False
                break
    return is_original


def submission_originality(submission_data, db_manager, filemanager):
    """Pulls submission data from MongoDB, determines whether the submission is original, and updates its originality score.

    This checks a few things:
    1. If the current submission is similar to the previous submission, we give it the same originality score.
    2. Otherwise, we check that it is sufficiently unique. To check this we see if it is A. almost identical to
       any other submission or B. very similar to a handful of other models.

    Parameters:
    -----------
    submission_data : dictionary
        Metadata about the submission pulled from the queue
    db_manager : DatabaseManager
        MongoDB data access object that has read and write functions to NoSQL DB
    filemanager : FileManager
        S3 Bucket data access object for querying competition datasets
    """
    s = db_manager.db.submissions.find_one({'_id': ObjectId(submission_data['submission_id'])})
    submission_data['user'] = s['username']
    submission_data['competition_id'] = s['competition_id']
    logging.getLogger().info("Scoring {} {}".format(submission_data['user'], submission_data['submission_id']))
    with lock:
        submission = get_submission(db_manager, filemanager, submission_data['submission_id'])
    if submission is None:
        logging.getLogger().info("Couldn't find {} {}".format(submission_data['user'], submission_data['submission_id']))
        return
    is_exact_dupe_thresh = 0.005
    is_similar_thresh = 0.03
    max_similar_models = 1
    is_original = is_almost_unique(submission_data, submission, db_manager, filemanager, is_exact_dupe_thresh, is_similar_thresh, max_similar_models)
    db_manager.write_originality(submission_data['submission_id'], submission_data['competition_id'], is_original)
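

# A minimal, hedged usage sketch (not part of the original module): it exercises
# the two pure functions above on synthetic prediction vectors, with no MongoDB
# or S3 access. The 0.03 threshold mirrors is_similar_thresh as hard-coded in
# submission_originality; the variable names below are illustrative only.
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    base = rng.uniform(0.3, 0.7, size=1000)                          # a "submission" of probabilities
    near_copy = np.clip(base + rng.normal(0.0, 0.001, 1000), 0, 1)   # base plus tiny noise
    unrelated = rng.uniform(0.0, 1.0, size=1000)                     # drawn from a different distribution

    # originality_score expects its first argument pre-sorted, as in is_almost_unique.
    sorted_base = np.sort(base)
    print("KS distance to near copy:", originality_score(sorted_base, near_copy))
    print("KS distance to unrelated:", originality_score(sorted_base, unrelated))
    print("original vs. near copy? ", original(sorted_base, near_copy, threshold=0.03))
    print("original vs. unrelated? ", original(sorted_base, unrelated, threshold=0.03))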