-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreparation.py
40 lines (30 loc) · 1.24 KB
/
preparation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def minimal_entries_by_user(df, min):
    """Keep only the rows of users that have at least ``min`` listens.

    A user's listen count is the number of non-null ``ts_listen`` values
    among that user's rows.

    Args:
        df: DataFrame with ``user_id`` and ``ts_listen`` columns.
        min: Minimum per-user count required for a user's rows to be kept.
            The name shadows the builtin; it is kept so that callers
            passing it by keyword continue to work.

    Returns:
        A new DataFrame with the same columns, containing the surviving
        rows in their original order. The input frame is not modified.
    """
    # transform() broadcasts each user's count back onto that user's own
    # rows. Joining an aggregated frame with ``on='user_id'`` would instead
    # look user ids up in the aggregate's positional index, which is only
    # correct when the ids happen to be exactly 0..n-1.
    listens_per_user = df.groupby('user_id')['ts_listen'].transform('count')
    # Use a positional mask so that a non-unique index cannot cause
    # label-alignment surprises.
    enough = (listens_per_user >= min).to_numpy()
    return df[enough]
def remove_first_value_for_all(df):
    """Drop, for every user, the row with the smallest ``ts_listen``.

    Rows are ordered by ``user_id`` then ``ts_listen`` and the first row
    of each user in that order is removed.

    Args:
        df: DataFrame with ``user_id`` and ``ts_listen`` columns.

    Returns:
        A new DataFrame with the same columns, sorted by index, without
        each user's first row. The input frame is not modified.
    """
    ordered = df.sort_values(by=['user_id', 'ts_listen'], axis=0)
    users = ordered['user_id']
    # The first occurrence of a user id in the sorted frame is that user's
    # earliest listen. Rows with a missing user id belong to no user and
    # are never treated as a "first" row.
    is_first_listen = (~users.duplicated(keep='first') & users.notna()).to_numpy()
    # Filter by position rather than by index label: this leaves a column
    # that happens to be called 'index' untouched and stays correct when
    # index labels are not unique.
    return ordered[~is_first_listen].sort_index()
def remove_percent_entries_for_users_who_have_to_much(df, max, p):
    """Trim the earliest listens of users that have more than ``max`` listens.

    A user is "heavy" when the number of non-null ``ts_listen`` values
    among their rows is strictly greater than ``max``. For each heavy user
    the first ``floor(rows * p)`` rows, ordered by ``ts_listen``, are
    removed, where ``rows`` is the total number of rows of that user.
    Other users are left untouched.

    Args:
        df: DataFrame with ``user_id`` and ``ts_listen`` columns.
        max: Per-user listen count above which a user is trimmed. The name
            shadows the builtin; it is kept so that callers passing it by
            keyword continue to work.
        p: Share of a heavy user's rows to remove, e.g. ``0.25`` removes
            the earliest quarter.

    Returns:
        A new DataFrame with the same columns, sorted by index. The input
        frame is not modified.
    """
    ordered = df.sort_values(by=['user_id', 'ts_listen'], axis=0)
    per_user = ordered.groupby('user_id')

    # Per-row views of per-user statistics. transform() keeps them aligned
    # with the rows, whereas joining an aggregate with ``on='user_id'``
    # would look ids up in the aggregate's positional index.
    listens = per_user['ts_listen'].transform('count')
    rows = per_user['ts_listen'].transform('size')
    # 0-based position of each row within its user, in ts_listen order.
    rank = per_user.cumcount()

    # Floor the quota so a fractional ``p`` yields a whole number of rows;
    # a float slice bound would raise TypeError.
    quota = (rows * p) // 1

    # Positional mask: robust to non-unique index labels and to the case
    # where no user exceeds ``max``.
    drop = ((listens > max) & (rank < quota)).to_numpy()
    return ordered[~drop].sort_index()