forked from YummyAmy/Decision-Modeling
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHome.py
392 lines (347 loc) · 17.7 KB
/
Home.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
import streamlit as st
from PIL import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
# Function to load data from GitHub
@st.cache_data
def load_data_from_github():
    """Download the Formula 1 CSV files from GitHub and return them as DataFrames.

    Streamlit re-executes the script on every user interaction; the
    ``st.cache_data`` decorator keeps the downloaded tables between reruns so
    the CSV files are not fetched again each time.

    Returns:
        dict: maps each dataset name (for example ``'results'``) to the
        DataFrame read from ``<base_url><name>.csv``.
    """
    base_url = "https://raw.githubusercontent.com/YummyAmy/Decision-Modeling/main/archive%20(3)/"
    # Every dataset lives at "<base_url><name>.csv", so the names alone are
    # enough to build the URLs.
    dataset_names = (
        'circuits',
        'constructor_results',
        'constructor_standings',
        'constructors',
        'driver_standings',
        'races',
        'qualifying',
        'pit_stops',
        'lap_times',
        'drivers',
        'status',
        'sprint_results',
        'seasons',
        'results',
    )
    return {name: pd.read_csv(f"{base_url}{name}.csv") for name in dataset_names}
# Static content for the Home tab and the sidebar
HOME_INTRO = """
#### Formula 1 Driver Performance Analysis
Welcome to the Decision Modeling in Formula 1 app!
This platform provides an in-depth analysis of Formula 1 drivers' performance metrics, including win rates, race participation, and performance trends over time.
Leveraging regression models and decision tree classifiers, the app offers insights into factors affecting driver success, helping you understand and predict key performance outcomes in F1 racing.
Explore various visualizations and interactive features to gain a comprehensive view of F1 decision modeling.
"""

HOME_SUMMARY = """
#### Summary:
- Exploratory Analysis of Formula 1 datasets
- Correlation Matrices of datasets
- Evaluating driver performance metrics
- Visualizations of driver statistics and performance trends
- Regression models to predict win rates
- Export functionality for top driver data
"""

F1_IMAGE_PATH = "Images for F1/Ferrari_Formula_1_lineup_at_the_Nürburgring.jpg"

SIDEBAR_LINKS = (
    "[Linktree](https://linktr.ee/ameusifoh)",
    "[LinkedIn](https://www.linkedin.com/in/ameti-obong-u-395a25111/)",
    "[Clicked](https://www.clicked.com/browse-experiences)",
)

# Page configuration
page_settings = {
    "page_title": "Home",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**page_settings)

# The two tabs shown at the top of the page
tab1, tab2 = st.tabs(["Home", "Decision Modeling with Formula 1 Datasets"])

# Tab 1: landing page with the introduction, a banner image and a summary
with tab1:
    st.markdown("# Decision and Predictive Modeling with Formula 1 data")
    st.write(HOME_INTRO)

    try:
        # Resize the banner to 800x400 before handing it to Streamlit
        banner = Image.open(F1_IMAGE_PATH).resize((800, 400))
        st.image(banner, caption='F1 Drivers', use_column_width="always")
    except FileNotFoundError:
        st.error("F1 image not found. Please check the file path.")

    st.markdown(HOME_SUMMARY)

# Sidebar: contact links
st.sidebar.title("Connect")
for sidebar_link in SIDEBAR_LINKS:
    st.sidebar.markdown(sidebar_link)
# st.sidebar.markdown('<a href="mailto:[email protected]">E-mail</a>', unsafe_allow_html=True)
# st.sidebar.markdown("[Tableau Profile](https://public.tableau.com/app/profile/amyu)")
# st.markdown(hide_decoration_bar_style, unsafe_allow_html=True)
# Tab 2: F1 Analysis
def _show_current_figure():
    """Render the active matplotlib figure in Streamlit, then close it.

    Figures created through ``pyplot`` stay registered until they are closed,
    so closing each one after rendering keeps them from accumulating every
    time Streamlit reruns the script.
    """
    fig = plt.gcf()
    st.pyplot(fig)
    plt.close(fig)


with tab2:
    st.header("F1 Analysis")

    # Load data
    dataframes = load_data_from_github()
    drivers = dataframes['drivers']
    results = dataframes['results']
    constructors = dataframes['constructors']

    # Inspect data
    # st.subheader("Data Inspection")
    # def inspect_data(dataframes):
    #     inspection_results = {}
    #     for name, df in dataframes.items():
    #         inspection_results[name] = {
    #             'shape': df.shape,
    #             'missing_values': df.isnull().sum(),
    #         }
    #     return inspection_results
    # inspection_results = inspect_data(dataframes)
    # st.write("Data Inspection Results:")
    # st.write(inspection_results)

    # Driver nationality distribution
    st.subheader("Exploration of datasets")
    st.subheader("Driver Nationality Distribution")
    nationality_distribution = drivers['nationality'].value_counts()
    plt.figure(figsize=(10, 4))
    sns.barplot(x=nationality_distribution.values, y=nationality_distribution.index)
    plt.title('Nationality Distribution of Drivers')
    plt.xlabel('Number of drivers')
    plt.ylabel('Nationality')
    _show_current_figure()

    # Aggregate total points for each driver and keep the ten highest totals
    st.subheader("Top 10 Drivers by Total Points")
    driver_points = results.groupby('driverId')['points'].sum().reset_index()
    top_10_drivers = driver_points.sort_values(by='points', ascending=False).head(10)
    top_10_drivers_details = top_10_drivers.merge(drivers, on='driverId')
    driver_names = top_10_drivers_details['forename'] + ' ' + top_10_drivers_details['surname']
    plt.figure(figsize=(5, 3))
    plt.barh(driver_names, top_10_drivers_details['points'], color='skyblue')
    plt.xlabel('Total Points')
    plt.ylabel('Drivers')
    plt.title('Top 10 Drivers by Total Points')
    plt.gca().invert_yaxis()  # highest total at the top of the chart
    _show_current_figure()

    # Driver performance by constructor
    st.subheader("Driver Performance by Constructor")
    driver_constructor_points = results.groupby(['driverId', 'constructorId'])['points'].sum().reset_index()
    driver_constructor_points = driver_constructor_points.merge(
        drivers[['driverId', 'forename', 'surname']], on='driverId'
    )
    driver_constructor_points = driver_constructor_points.merge(
        constructors[['constructorId', 'name']], on='constructorId'
    )
    top_10_drivers = (
        driver_constructor_points.groupby('driverId')['points']
        .sum()
        .reset_index()
        .sort_values(by='points', ascending=False)
        .head(10)
    )
    top_10_driver_constructor_points = driver_constructor_points[
        driver_constructor_points['driverId'].isin(top_10_drivers['driverId'])
    ]
    plt.figure(figsize=(8, 6))
    sns.barplot(x='surname', y='points', hue='name', data=top_10_driver_constructor_points)
    plt.title('Points Scored by Top 10 Drivers for Each Constructor')
    plt.xlabel('Driver')
    plt.ylabel('Total Points')
    plt.legend(title='Constructor', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    plt.tight_layout()
    _show_current_figure()

    # Points scored by top 10 drivers over seasons
    st.subheader("Points Scored by Top 10 Drivers Over Seasons")
    top_10_driver_ids = top_10_drivers['driverId'].tolist()
    top_10_results = results[results['driverId'].isin(top_10_driver_ids)]
    top_10_results = top_10_results.merge(dataframes['races'][['raceId', 'year']], on='raceId')
    driver_season_points = top_10_results.groupby(['driverId', 'year'])['points'].sum().reset_index()
    driver_season_points = driver_season_points.merge(
        drivers[['driverId', 'forename', 'surname']], on='driverId'
    )
    # One column per driver surname, one row per season
    driver_season_pivot = driver_season_points.pivot(index='year', columns='surname', values='points')
    plt.figure(figsize=(8, 5))
    driver_season_pivot.plot(kind='line', marker='o', ax=plt.gca())
    plt.title('Points Scored by Top 10 Drivers Over Seasons')
    plt.xlabel('Season')
    plt.ylabel('Total Points')
    plt.legend(title='Driver')
    plt.grid(True)
    _show_current_figure()

    # Win rates of top 5 drivers
    st.subheader("Win Rates of Top 5 Drivers")
    # Number of result rows per driver
    driver_races = results.groupby('driverId')['raceId'].count().reset_index()
    driver_races.rename(columns={'raceId': 'total_races'}, inplace=True)
    # Number of result rows with positionOrder == 1 per driver
    driver_wins = results[results['positionOrder'] == 1].groupby('driverId')['raceId'].count().reset_index()
    driver_wins.rename(columns={'raceId': 'total_wins'}, inplace=True)
    driver_performance = driver_races.merge(driver_wins, on='driverId', how='left')
    # Drivers without a win have NaN after the left merge. Assign the filled
    # column back: an in-place fillna on a column selection is a chained
    # update that does not modify the frame under pandas Copy-on-Write.
    driver_performance['total_wins'] = driver_performance['total_wins'].fillna(0)
    driver_performance['win_rate'] = driver_performance['total_wins'] / driver_performance['total_races']
    top_5_drivers = driver_performance.sort_values(by='total_wins', ascending=False).head(5)
    top_5_drivers = top_5_drivers.merge(drivers[['driverId', 'forename', 'surname']], on='driverId')
    top_5_drivers = top_5_drivers.sort_values(by='win_rate', ascending=True)
    plt.figure(figsize=(4, 3))
    sns.barplot(x='surname', y='win_rate', data=top_5_drivers, palette='viridis')
    plt.title('Win Rates of Top 5 Drivers')
    plt.xlabel('Driver')
    plt.ylabel('Win Rate')
    plt.ylim(0, 1)
    plt.tight_layout()
    _show_current_figure()

    # Correlation matrices
    st.subheader("Correlation Matrices")
    results_numeric = results.select_dtypes(include=['float64', 'int64'])
    corr = results_numeric.corr()
    plt.figure(figsize=(7, 5))
    ax = sns.heatmap(corr, annot=False, cmap='coolwarm', linewidths=0.5, cbar_kws={"shrink": .8})
    # Write each coefficient at the centre of its cell
    for i in range(corr.shape[0]):
        for j in range(corr.shape[1]):
            ax.text(j + 0.5, i + 0.5, f'{corr.iloc[i, j]:.2f}', ha='center', va='center', color='black')
    plt.title('Correlation Matrix for Results')
    _show_current_figure()

    # Pairplots
    st.subheader("Pairplots for distribution of Results")
    # pairplot creates its own figure, which becomes the current one
    sns.pairplot(results)
    plt.suptitle("Pairplot for Results", y=1.02)
    _show_current_figure()

    # Relationship between pitstops and wins
    st.subheader("Relationship Between Pitstops and Wins")
    pit_stops = dataframes['pit_stops']
    avg_pit_stop_duration = (
        pit_stops.groupby(['raceId', 'driverId'])['milliseconds']
        .mean()
        .reset_index(name='avg_pit_stop_duration')
    )
    performance = results.merge(avg_pit_stop_duration, on=['raceId', 'driverId'])
    correlation = performance['avg_pit_stop_duration'].corr(performance['positionOrder'])
    st.write(f"Correlation between Average Pit Stop Duration and Race Position Order: {correlation}")
    plt.figure(figsize=(6, 4))
    plt.scatter(performance['avg_pit_stop_duration'], performance['positionOrder'], alpha=0.6)
    plt.title('Correlation Between Average Pit Stop Duration and Race Position Order')
    plt.xlabel('Average Pit Stop Duration (milliseconds)')
    plt.ylabel('Race Position Order')
    # Degree-1 least-squares fit drawn as a dashed red trend line
    z = np.polyfit(performance['avg_pit_stop_duration'], performance['positionOrder'], 1)
    p = np.poly1d(z)
    plt.plot(performance['avg_pit_stop_duration'], p(performance['avg_pit_stop_duration']), "r--")
    plt.grid(True)
    _show_current_figure()

    # Linear regression
    st.subheader("Linear Regression")
    # Reuse the per-driver totals and win rate computed for the win-rate chart
    # above, and attach the driver details needed for the age feature.
    driver_performance = driver_performance.merge(
        drivers[['driverId', 'forename', 'surname', 'dob']], on='driverId'
    )
    driver_performance['dob'] = pd.to_datetime(driver_performance['dob'])
    # Age is measured against the fixed reference year 2024
    driver_performance['age'] = 2024 - driver_performance['dob'].dt.year
    # NOTE(review): win_rate is total_wins / total_races and both are used as
    # features - confirm this is the intended model setup.
    features = ['total_races', 'total_wins', 'age']
    target = 'win_rate'
    X = driver_performance[features]
    y = driver_performance[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    st.write(f"Mean Squared Error: {mse}")
    st.write(f"R-squared: {r2}")
    coefficients = pd.DataFrame(model.coef_, index=features, columns=['Coefficient'])
    st.write("Model Coefficients:")
    st.write(coefficients)
    plt.figure(figsize=(6, 4))
    sns.scatterplot(x=y_test, y=y_pred)
    plt.title('Predicted vs. Actual Win Rates')
    plt.xlabel('Actual Win Rate')
    plt.ylabel('Predicted Win Rate')
    # Dashed diagonal: where predicted equals actual
    plt.plot([0, 1], [0, 1], color='red', linestyle='--')
    plt.tight_layout()
    _show_current_figure()

    # Decision Tree Model
    st.subheader("Decision Tree Model")
    data = results.merge(constructors, on='constructorId')
    # Binary label: 1 when positionOrder is 1, otherwise 0. It is set on the
    # merged copy so the loaded results table is left unmodified.
    data['win'] = (data['positionOrder'] == 1).astype(int)
    features = ['grid', 'laps', 'milliseconds', 'fastestLapSpeed']
    target = 'win'
    # Non-numeric entries become NaN and are then replaced by the column mean
    for feature in features:
        data[feature] = pd.to_numeric(data[feature], errors='coerce')
    data[features] = data[features].fillna(data[features].mean())
    X = data[features]
    y = data[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    max_depth = 3
    model = DecisionTreeClassifier(random_state=42, max_depth=max_depth)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    st.write(f'Accuracy: {accuracy:.2f}')
    st.write(f'Precision: {precision:.2f}')
    st.write(f'Recall: {recall:.2f}')
    st.write('Confusion Matrix:')
    st.write(conf_matrix)
    plt.figure(figsize=(14, 5))
    plot_tree(model, feature_names=features, class_names=['Not Win', 'Win'], filled=True, rounded=True, fontsize=10)
    _show_current_figure()
# # Call the function
# if __name__ == '__main__':
# st.set_page_config(page_title="Home", layout="wide")
# def week2():
# st.markdown("# Week 2")
# st.write("Use the dropdown below to filter data by neighborhood")
# neighborhoods = np.sort(airbnb_df.neighbourhood_group.unique())
# selected_neighborhood = st.selectbox("Neighborhood", neighborhoods)
# filtered_df = airbnb_df[airbnb_df.neighbourhood_group == selected_neighborhood]
# st.write("Filtered Data:")
# st.write(filtered_df)
# map_data = filtered_df[['latitude', 'longitude']]
# map_data = map_data.dropna(how='any')
# st.map(map_data)
# st.sidebar.markdown("## Download CSV file")
# st.sidebar.download_button(
# label="Download Data",
# data=filtered_df.to_csv(index=False),
# file_name="airbnb_filtered.csv",
# mime="text/csv",
# )
# import streamlit as st
# from PIL import Image
# #set page configuration
# st.set_page_config(
# page_title = "Home",
# page_icon = ":house:",
# layout = "wide",
# initial_sidebar_state = "expanded"
# )
# #set theme
# hide_decoration_bar_style = '''
# <style>
# header {visibility: hidden;}
# </style>
# '''
# #define function for each page
# def home():
# st.markdown("# Exploratory Data Analysis on AirBnB Data")
# st.write("""
# A 2-week project from [Corise](https://corise.com/course/intro-to-numpy-and-pandas) using Numpy and Pandas for exploratory data analysis.""")
# st.write("""
# As a Data Scientist for the Amsterdam area at Airbnb, the objective is to help visitors to make an informed choice of which airbnb to stay by analyzing the data close to a location they would like to visit. This is an analytical platform with resources like price range and type of rooms for users to choose from. It includes two interactive pages and map visualizations that allows users to explore different aspects of the airbnb listings such as price, location, and availability""")
# img = Image.open("Images/Luxury-Airbnb-Apartment-Amsterdam.jpg")
# new_image = img.resize((900,600))
# st.image(new_image, caption = 'Luxury Airbnb apartment', use_column_width="always")
# st.markdown(
# """
# ### Summary of projects:
# ## Week 1
# - Exploratory data analysis using Numpy
# - Data cleaning and analysis
# - Exploring Amsterdam AirBnB dataset using metres from the chosen location and price range as filters. (Use sidebar sliders to filter data)
# - Include a download option for the filtered dataframe
# - Web app development and deployment on streamlit
# ## Week 2
# - Exploratory Data Analysis Using Pandas
# - Interactive visuals for exploring different filters for AirBnB listings such as Neighborhood, bedrooms, beds etc.
# - Map visualization
# - Added a multiselect option to filter dataframe
# - Include a theme and multi-page configuration
# - Include a download option for the filtered CSV dataframe
# - Web app development and deployment
# """
# )
# st.sidebar.title("Connect")
# st.sidebar.markdown("[Linktree](https://linktr.ee/ameusifoh)")
# st.sidebar.markdown("[LinkedIn](https://www.linkedin.com/in/ameti-obong-u-395a25111/)")
# #st.sidebar.markdown('<a href="mailto:[email protected]">E-mail</a>', unsafe_allow_html=True)
# st.sidebar.markdown("[Tableau Profile](https://public.tableau.com/app/profile/amyu)")
# st.sidebar.markdown("[CoRise Course](https://uplimit.com/course/intro-to-numpy-and-pandas)")
# st.markdown(hide_decoration_bar_style, unsafe_allow_html=True)
# # call the function
# if __name__ == '__main__':
# home()