-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrepeated_CV_scratch.R
53 lines (40 loc) · 1.54 KB
/
repeated_CV_scratch.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# repeated k-fold CV scratch
# clinic_gam
## partition data
# Every row independently draws a fold label from 1..k (sampling with
# replacement), so the folds are not forced to have equal sizes. The seed
# makes the assignment reproducible.
k <- 5
set.seed(12345)
partition.df <- clinic.complete %>%
  mutate(
    my.folds = sample(
      seq_len(k),
      size = nrow(clinic.complete),
      replace = TRUE
    )
  )
## train and evaluate model on each fold
#' Train and evaluate the GAM on one cross-validation fold
#'
#' Rows of `data` whose `my.folds` differs from `this.fold` form the training
#' set; the remaining rows form the validation set. Both sets are standardised
#' with the training-set column means and standard deviations, the GAM is
#' fitted on the scaled training rows, and the predictions are mapped back to
#' the original outcome scale before the error metrics are computed against
#' the held-out fold's own (unscaled) outcome.
#'
#' Relies on objects that are not defined in this script: `gam.form`,
#' `test_scale()`, `rmse()`, `R2()` and `MAE()`.
#' NOTE(review): `test_scale()` is assumed to apply (x - mean) / sd column by
#' column -- confirm against its definition.
#'
#' @param this.fold Fold label to hold out for validation.
#' @param data Data frame containing a `my.folds` column. The whole frame is
#'   passed to `scale()`, so all columns are expected to be numeric.
#' @param outcome.col Position (or name) of the outcome column in `data`.
#'   Defaults to 20, the index that was previously hard-coded.
#' @return Named numeric vector with elements `RMSE`, `R2` and `MAE`
#'   computed on the held-out fold.
cv.func.gam <- function(this.fold, data, outcome.col = 20) {
  train <- filter(data, my.folds != this.fold)
  validate <- filter(data, my.folds == this.fold)

  # Keep the held-out outcome on its original scale: it is the reference the
  # back-transformed predictions must be scored against.
  observed <- validate[[outcome.col]]

  # scale: the statistics come from the training rows only
  train.sd <- vapply(train, sd, numeric(1))
  train.mean <- vapply(train, mean, numeric(1))
  train <- scale(train) %>%
    as.data.frame()
  validate <- test_scale(validate, train.mean, train.sd)

  # fit model
  model <- gam(gam.form, data = train, method = "REML")

  # predict with the model fitted on this fold, then undo the outcome scaling
  raw_pred <- predict(model, validate)
  preds <- raw_pred * train.sd[[outcome.col]] + train.mean[[outcome.col]]

  # Return every metric so the caller can store them per fold.
  c(
    RMSE = rmse(observed, preds),
    R2 = R2(observed, preds),
    MAE = MAE(observed, preds)
  )
}
## repeating the k-fold CV
## rbind the results for each iteration
# One-row placeholder frame: a logical NA for every model covariate.
coefs <- local({
  covariates <- c(
    "fertility", "elevation", "poverty", "night_lights", "Population",
    "child_population", "population_density", "radio", "electricity",
    "television", "mobile_phone", "mothers_age"
  )
  as.data.frame(setNames(rep(list(NA), length(covariates)), covariates))
})
# Zero-filled store with 1000 rows, one column per performance metric.
mod_performance <- data.frame(
  RMSE = numeric(1000),
  R2 = numeric(1000),
  MAE = numeric(1000)
)
# for a single partition
# need to write the repeated part
# for loop might be better
# Score every fold of partition.df with cv.func.gam (both defined earlier in
# this script), then average fold-wise. rbind() puts one fold per row, so
# colMeans() works whether the fold function returns a single error value or
# several named metrics.
fold.results <- lapply(seq_len(k), cv.func.gam, data = partition.df)
cv.error <- colMeans(do.call(rbind, fold.results))