-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessing.py
57 lines (46 loc) · 2.5 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
def create_split(df, seed):
train_df, test_df = train_test_split(df, train_size=0.8, test_size=0.2, random_state=seed, stratify=df['Problem_ID'])
return train_df.reset_index(drop=True), test_df.reset_index(drop=True)
if __name__ == '__main__':
df_raw = pd.read_excel("https://github.com/DavidBreuer/ifsc-analysis/raw/main/ifsc_Boulder.xlsx")
df = (
df_raw
# Get rid of columns we don't need
.drop(columns=['Unique', 'Discipline', 'Number', 'Group'])
.drop(columns=df_raw.filter(regex='Route|Run').columns)
# Capitalize climber names consistently
.assign(Name=lambda x: x['Name'].str.title())
# "Unpivot" so it's one row per climber-problem
# I'm treating tops and zones as separate problems even though there's obviously a correlation
.melt(id_vars=['Year', 'Competition', 'Gender', 'Level', 'Name', 'Country'],
value_vars=['Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Zone1', 'Zone2', 'Zone3', 'Zone4', 'Zone5'],
var_name='Problem', value_name='Attempts')
.dropna(subset=['Attempts'])
.replace([-np.inf, np.inf], np.nan)
)
# Keep completed boulders
grouped = df.groupby(['Competition', 'Level', 'Problem', 'Gender'])
df = grouped.filter(lambda x: x['Attempts'].notnull().any())
# Max attempts passed through
df = df.assign(Max_attempts=lambda x: grouped['Attempts'].transform('max'))
# Survival model features
df = df.assign(
Status=lambda x: x['Attempts'].notna(),
Time=lambda x: np.where(x['Attempts'].isna(), x['Max_attempts'], x['Attempts'])
)
# Failure set to max attempts
df['Attempts'] = np.where(df['Attempts'].isna(), df['Max_attempts'], df['Attempts'])
df[['Attempts', 'Max_attempts', 'Status', 'Time']] = df[['Attempts', 'Max_attempts', 'Status', 'Time']].astype(int)
# Make Problem Category
df['Problem_category'] = np.where(df.Problem.str.startswith('Top').values, 'Top', 'Zone')
# Handle
dfm = df[df.Gender=="M"].copy()
dfw = df[df.Gender=="W"].copy()
STRATIFICATION = ['Year', 'Competition', 'Level', 'Problem']
dfm['Problem_ID'] = dfm[STRATIFICATION].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
dfw['Problem_ID'] = dfw[STRATIFICATION].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
dfm.to_csv('./data/men_data.csv', index=False)
dfw.to_csv('./data/women_data.csv', index=False)