Commit 0fd087b: update trees

MaximilianPi committed Jun 18, 2024
1 parent 1a865c5 commit 0fd087b
Showing 10 changed files with 1,237 additions and 1,306 deletions.
4 changes: 2 additions & 2 deletions A4-MLpipeline.qmd
@@ -467,7 +467,7 @@ model = dnn(survived~.,
lambda = 0.001, # change
alpha = 0.1, # change
lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 10, factor = 0.9),
-data = data_obs, epochs = 40L, verbose = TRUE, plot= TRUE)
+data = data_obs, epochs = 40L, verbose = FALSE, plot= TRUE)
# Predictions:
@@ -555,7 +555,7 @@ write.csv(data.frame(y = predictions[,1]), file = "Max_titanic_dnn.csv")

<!-- hyper_lambda = runif(20,0, 0.2) -->

<!-- tuning_results = -->

<!-- sapply(1:length(hyper_lambda), function(k) { -->
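
For context, the commented-out template above sketches a random search over the dnn regularization strength `lambda`. A hedged completion of that sketch (not part of the commit; the CV loop and the AUC evaluation are assumptions modeled on the tree templates below):

```{r, eval=FALSE}
# Illustrative only: random search over lambda for cito's dnn(),
# mirroring the commented-out template above. The CV split and the
# AUC evaluation are assumptions, not code from this commit.
library(cito)
hyper_lambda = runif(20, 0, 0.2)
cv = 3
tuning_results =
  sapply(1:length(hyper_lambda), function(k) {
    auc_inner = NULL
    for(j in 1:cv) {
      inner_split = as.integer(cut(1:nrow(data_obs), breaks = cv))
      train_inner = data_obs[inner_split != j, ]
      test_inner = data_obs[inner_split == j, ]
      model = dnn(survived~., lambda = hyper_lambda[k], alpha = 0.1,
                  data = train_inner, epochs = 40L, verbose = FALSE, plot = FALSE)
      predictions = predict(model, newdata = test_inner)[,1]
      auc_inner[j] = Metrics::auc(test_inner$survived, predictions)
    }
    return(mean(auc_inner))
  })
hyper_lambda[which.max(tuning_results)]  # best candidate
```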

214 changes: 91 additions & 123 deletions B1-Trees.qmd
@@ -412,6 +412,8 @@ data_sub =
 pclass = as.integer(pclass - 1L))
data_new = data_sub[is.na(data_sub$survived),] # for which we want to make predictions at the end
data_obs = data_sub[!is.na(data_sub$survived),] # data with known response
+data_sub$survived = as.factor(data_sub$survived)
+data_obs$survived = as.factor(data_obs$survived)
```
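
The two `as.factor()` conversions are new in this commit. A plausible reason (not stated in the diff): `ranger`, which replaces `randomForest` below, fits a probability forest only for a factor response. A minimal sketch with made-up toy data:

```{r, eval=FALSE}
# Toy illustration (assumption, not from the commit): with probability = TRUE,
# ranger expects a factor response and returns one probability column per class.
library(ranger)
toy = data.frame(y = as.factor(rbinom(100, 1, 0.5)), x = rnorm(100))
fit = ranger(y ~ x, data = toy, probability = TRUE)
head(predict(fit, toy)$predictions)
```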

**Hints:**
@@ -421,44 +423,37 @@

**Bonus:**

-- tune also mtry
+- tune min node size (and mtry; see the sketch after the solution below)
- use more features

::: {.callout-tip collapse="true" appearance="minimal"}
## Code template

```{r, eval=FALSE}
-library(randomForest)
+library(ranger)
set.seed(42)
data_obs = data_sub[!is.na(data_sub$survived),]
cv = 3
-outer_split = as.integer(cut(1:nrow(data_obs), breaks = cv))
# sample minnodesize values (must be integers)
hyper_minnodesize = ...
-results = data.frame(
-  set = rep(NA, cv),
-  minnodesize = rep(NA, cv),
-  AUC = rep(NA, cv)
-)
-for(i in 1:cv) {
-  train_outer = data_obs[outer_split != i, ]
-  test_outer = data_obs[outer_split == i, ]
-  tuning_results =
-    sapply(1:length(hyper_minnodesize), function(k) {
-      model = randomForest(as.factor(survived)~., data = train_outer, nodesize = ... )
-      return(Metrics::auc(test_outer$survived, predict(model, newdata = test_outer, type = "prob")[,2]))
-    })
-  best_minnodesize = hyper_minnodesize[which.max(tuning_results)]
-  results[i, 1] = i
-  results[i, 2] = best_minnodesize
-  results[i, 3] = max(tuning_results)
-}
+tuning_results =
+  sapply(1:length(hyper_minnodesize), function(k) {
+    auc_inner = NULL
+    for(j in 1:cv) {
+      inner_split = as.integer(cut(1:nrow(data_obs), breaks = cv))
+      train_inner = data_obs[inner_split != j, ]
+      test_inner = data_obs[inner_split == j, ]
+      model = ranger(survived~., data = train_inner, min.node.size = hyper_minnodesize[k], probability = TRUE)
+      predictions = predict(model, test_inner)$predictions[,2]
+      auc_inner[j] = Metrics::auc(test_inner$survived, predictions)
+    }
+    return(mean(auc_inner))
+  })
+results = data.frame(minnodesize = hyper_minnodesize, AUC = tuning_results)
print(results)
```
@@ -469,53 +464,43 @@ print(results)

`r hide("Click here to see the solution")`


```{r}
-library(randomForest)
+library(ranger)
set.seed(42)
data_obs = data_sub[!is.na(data_sub$survived),]
cv = 3
-outer_split = as.integer(cut(1:nrow(data_obs), breaks = cv))
# sample minnodesize values (must be integers)
hyper_minnodesize = sample(300, 20)
-results = data.frame(
-  set = rep(NA, cv),
-  minnodesize = rep(NA, cv),
-  AUC = rep(NA, cv)
-)
-for(i in 1:cv) {
-  train_outer = data_obs[outer_split != i, ]
-  test_outer = data_obs[outer_split == i, ]
-  tuning_results =
-    sapply(1:length(hyper_minnodesize), function(k) {
-      model = randomForest(as.factor(survived)~., data = train_outer, nodesize = hyper_minnodesize[k] )
-      return(Metrics::auc(test_outer$survived, predict(model, newdata = test_outer, type = "prob")[,2]))
-    })
-  best_minnodesize = hyper_minnodesize[which.max(tuning_results)]
-  results[i, 1] = i
-  results[i, 2] = best_minnodesize
-  results[i, 3] = max(tuning_results)
-}
+tuning_results =
+  sapply(1:length(hyper_minnodesize), function(k) {
+    auc_inner = NULL
+    for(j in 1:cv) {
+      inner_split = as.integer(cut(1:nrow(data_obs), breaks = cv))
+      train_inner = data_obs[inner_split != j, ]
+      test_inner = data_obs[inner_split == j, ]
+      model = ranger(survived~., data = train_inner, min.node.size = hyper_minnodesize[k], probability = TRUE)
+      predictions = predict(model, test_inner)$predictions[,2]
+      auc_inner[j] = Metrics::auc(test_inner$survived, predictions)
+    }
+    return(mean(auc_inner))
+  })
+results = data.frame(minnodesize = hyper_minnodesize, AUC = tuning_results)
print(results)
```
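
Not part of the commit: a quick way to eyeball how the cross-validated AUC responds to `min.node.size` before settling on the best candidate:

```{r, eval=FALSE}
# Assumes the `results` data frame produced by the solution above.
plot(results$minnodesize, results$AUC,
     xlab = "min.node.size", ylab = "CV AUC")
```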



Make predictions:

```{r, results='hide', warning=FALSE, message=FALSE}
-prediction_ensemble =
-  sapply(1:nrow(results), function(i) {
-    model = randomForest(as.factor(survived)~., data = data_obs, nodesize = results$minnodesize[i] )
-    return(predict(model, data_obs, type = "prob")[,2])
-  })
-# Single predictions from the ensemble model:
-write.csv(data.frame(y = apply(prediction_ensemble, 1, mean)), file = "Max_titanic_ensemble.csv")
+model = ranger(survived~., data = data_obs, min.node.size = results[which.max(results$AUC),1], probability = TRUE)
+write.csv(data.frame(y = predict(model, data_new)$predictions[,1]), file = "Max_titanic_rf.csv")
```
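
For the bonus task (tuning `mtry` as well), a minimal sketch that extends the solution's random search to a joint grid. `hyper_mtry` and `results_mtry` are illustrative names, not code from the commit; `data_obs` and `cv` are reused from above:

```{r, eval=FALSE}
# Sketch only: joint random search over min.node.size and mtry.
set.seed(42)
hyper_minnodesize = sample(300, 20)
hyper_mtry = sample(ncol(data_obs) - 1L, 20, replace = TRUE)  # mtry <= number of predictors
tuning_results =
  sapply(1:length(hyper_minnodesize), function(k) {
    auc_inner = NULL
    for(j in 1:cv) {
      inner_split = as.integer(cut(1:nrow(data_obs), breaks = cv))
      train_inner = data_obs[inner_split != j, ]
      test_inner = data_obs[inner_split == j, ]
      model = ranger(survived~., data = train_inner,
                     min.node.size = hyper_minnodesize[k], mtry = hyper_mtry[k],
                     probability = TRUE)
      predictions = predict(model, test_inner)$predictions[,2]
      auc_inner[j] = Metrics::auc(test_inner$survived, predictions)
    }
    return(mean(auc_inner))
  })
results_mtry = data.frame(minnodesize = hyper_minnodesize, mtry = hyper_mtry, AUC = tuning_results)
results_mtry[which.max(results_mtry$AUC), ]  # best joint candidate
```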

`r unhide()`
@@ -574,33 +559,25 @@ outer_split = as.integer(cut(1:nrow(data_obs), breaks = cv))
hyper_depth = ...
hyper_eta = ...
-results = data.frame(
-  set = rep(NA, cv),
-  depth = rep(NA, cv),
-  eta = rep(NA, cv),
-  AUC = rep(NA, cv)
-)
-for(i in 1:cv) {
-  train_outer = data_obs[outer_split != i, ]
-  test_outer = data_obs[outer_split == i, ]
-  tuning_results =
-    sapply(1:length(hyper_depth), function(k) {
-      # Cast data to xgboost data types
-      data_xg = xgb.DMatrix(data = as.matrix(train_outer[,-1]), label = train_outer$survived)
-      model = xgboost(data_xg, nrounds = 16L, eta = hyper_eta[k], max_depth = hyper_depth[k])
-      predictions = predict(model, newdata = as.matrix(test_outer)[,-1])
-      return(Metrics::auc(test_outer$survived, predictions))
-    })
-  results[i, 1] = i
-  results[i, 2] = hyper_depth[which.max(tuning_results)]
-  results[i, 3] = hyper_eta[which.max(tuning_results)]
-  results[i, 4] = max(tuning_results)
-}
+tuning_results =
+  sapply(1:length(hyper_depth), function(k) {
+    auc_inner = NULL
+    for(j in 1:cv) {
+      inner_split = as.integer(cut(1:nrow(data_obs), breaks = cv))
+      train_inner = data_obs[inner_split != j, ]
+      test_inner = data_obs[inner_split == j, ]
+      data_xg = xgb.DMatrix(data = as.matrix(train_inner[,-1]), label = train_inner$survived)
+      model = xgboost(data_xg, nrounds = 16L, eta = hyper_eta[k], max_depth = hyper_depth[k])
+      predictions = predict(model, newdata = as.matrix(test_inner)[,-1])
+      auc_inner[j] = Metrics::auc(test_inner$survived, predictions)
+    }
+    return(mean(auc_inner))
+  })
+results = data.frame(depth = hyper_depth, eta = hyper_eta, AUC = tuning_results)
print(results)
```
@@ -620,51 +597,42 @@
hyper_depth = sample(200, 20)
hyper_eta = runif(20, 0, 1)
-results = data.frame(
-  set = rep(NA, cv),
-  depth = rep(NA, cv),
-  eta = rep(NA, cv),
-  AUC = rep(NA, cv)
-)
-for(i in 1:cv) {
-  train_outer = data_obs[outer_split != i, ]
-  test_outer = data_obs[outer_split == i, ]
-  tuning_results =
-    sapply(1:length(hyper_depth), function(k) {
-      # Cast data to xgboost data types
-      data_xg = xgb.DMatrix(data = as.matrix(train_outer[,-1]), label = train_outer$survived)
-      model = xgboost(data_xg, nrounds = 16L, eta = hyper_eta[k], max_depth = hyper_depth[k])
-      predictions = predict(model, newdata = as.matrix(test_outer)[,-1])
-      return(Metrics::auc(test_outer$survived, predictions))
-    })
-  results[i, 1] = i
-  results[i, 2] = hyper_depth[which.max(tuning_results)]
-  results[i, 3] = hyper_eta[which.max(tuning_results)]
-  results[i, 4] = max(tuning_results)
-}
+tuning_results =
+  sapply(1:length(hyper_depth), function(k) {
+    auc_inner = NULL
+    for(j in 1:cv) {
+      inner_split = as.integer(cut(1:nrow(data_obs), breaks = cv))
+      train_inner = data_obs[inner_split != j, ]
+      test_inner = data_obs[inner_split == j, ]
+      data_xg = xgb.DMatrix(data = as.matrix(train_inner[,-1]), label = train_inner$survived)
+      model = xgboost(data_xg, nrounds = 16L, eta = hyper_eta[k], max_depth = hyper_depth[k])
+      predictions = predict(model, newdata = as.matrix(test_inner)[,-1])
+      auc_inner[j] = Metrics::auc(test_inner$survived, predictions)
+    }
+    return(mean(auc_inner))
+  })
+results = data.frame(depth = hyper_depth, eta = hyper_eta, AUC = tuning_results)
print(results)
```

Make predictions:

```{r, results='hide', warning=FALSE, message=FALSE}
-prediction_ensemble =
-  sapply(1:nrow(results), function(i) {
-    data_xg = xgb.DMatrix(data = as.matrix(data_obs[,-1]), label = data_obs$survived)
-    model = xgboost(data_xg, nrounds = 16L, eta = results$eta[i], max_depth = results$depth[i])
-    predictions = predict(model, newdata = as.matrix(data_new)[,-1])
-    return(predictions)
-  })
-# Single predictions from the ensemble model:
-write.csv(data.frame(y = apply(prediction_ensemble, 1, mean)), file = "Max_titanic_ensemble.csv")
+data_xg = xgb.DMatrix(data = as.matrix(data_obs[,-1]), label = data_obs$survived)
+model = xgboost(data_xg, nrounds = 16L, eta = results[which.max(results$AUC), 2], max_depth = results[which.max(results$AUC), 1])
+predictions = predict(model, newdata = as.matrix(data_new)[,-1])
+write.csv(data.frame(y = predictions), file = "Max_titanic_xgboost.csv")
```
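
One more hedged sketch, not in the commit: `nrounds` is fixed at 16 throughout, but xgboost ships its own cross-validation helper, `xgb.cv`, which can pick the number of boosting rounds for the tuned `eta` and `max_depth` via early stopping:

```{r, eval=FALSE}
# Sketch only: cross-validate nrounds for the best (eta, max_depth) pair.
# Assumes `results` and `data_obs` from above; survived is converted back
# to numeric 0/1 in case it was turned into a factor earlier.
best = results[which.max(results$AUC), ]
label = as.numeric(as.character(data_obs$survived))
data_xg = xgb.DMatrix(data = as.matrix(data_obs[,-1]), label = label)
cv_run = xgb.cv(params = list(objective = "binary:logistic", eval_metric = "auc",
                              eta = best$eta, max_depth = best$depth),
                data = data_xg, nrounds = 100L, nfold = 3L,
                early_stopping_rounds = 10L, verbose = FALSE)
cv_run$best_iteration  # candidate replacement for the fixed nrounds = 16L
```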

`r unhide()`
4 changes: 2 additions & 2 deletions _freeze/A4-MLpipeline/execute-results/html.json

Large diffs are not rendered by default.

Binary file modified _freeze/A4-MLpipeline/figure-html/unnamed-chunk-10-1.png
4 changes: 2 additions & 2 deletions _freeze/B1-Trees/execute-results/html.json

Large diffs are not rendered by default.
