From 67346dc8bf29edb24d0ffbec7e12e7f5d20fdfe4 Mon Sep 17 00:00:00 2001 From: hansvancalster Date: Tue, 24 Sep 2024 15:37:56 +0200 Subject: [PATCH] further test validation scripts --- source/scripts/nca_functions.R | 153 +++++++++++++-- source/scripts/test_validation_functions.R | 216 ++++++++++++++++++++- 2 files changed, 355 insertions(+), 14 deletions(-) diff --git a/source/scripts/nca_functions.R b/source/scripts/nca_functions.R index 6a94bbc..30b6b89 100644 --- a/source/scripts/nca_functions.R +++ b/source/scripts/nca_functions.R @@ -54,7 +54,7 @@ Cleanchangeareadata <- function(file, tbltrans, type){ #mapdata <- nara13$valid #refdata <- as.factor(points_id$lu13oord) #both arrays need to be factor variables with the same levels. -CalculateAccuracy <- function(mapdata, refdata){ +calculate_accuracy <- function(mapdata, refdata){ assert_that(length(mapdata) == length(refdata), msg = "Length of the arrays is not equal") assert_that(nlevels(mapdata) == nlevels(refdata) & @@ -67,11 +67,7 @@ CalculateAccuracy <- function(mapdata, refdata){ return(conf) } - -#maparea is the surface area of each change class -#ma is the confusion matrix for the change classes -Validationuncertainty <- function(ma, maparea, pixelsize) { - dyn <- rownames(ma) +confusion_matrix <- function(maparea, ma) { aoi <- sum(maparea) # calculate the area proportions for each map class propmaparea <- maparea / aoi @@ -79,6 +75,132 @@ Validationuncertainty <- function(ma, maparea, pixelsize) { ni <- rowSums(ma) # number of reference points per map class propma <- as.matrix(ma / ni * propmaparea) propma[is.nan(propma)] <- 0 # for classes with ni. = 0 + return(propma) +} + + + +calc_oa <- function(maparea, ma, propma = NULL) { + if (is.null(propma)) { + propma <- confusion_matrix(maparea = maparea, ma = ma) + } + # overall accuracy (Eq. 1 in Olofsson et al. 2014) + oa <- sum(diag(propma)) + + # variance of overall accuracy (Eq. 5 in Olofsson et al. 2014) + ni <- rowSums(ma) # number of reference points per map class + aoi <- sum(maparea) # calculate the area proportions for each map class + propmaparea <- maparea / aoi + ua <- diag(propma) / rowSums(propma) + v_oa <- sum(propmaparea^2 * ua * (1 - ua) / (ni - 1), na.rm = TRUE) + me_oa <- 1.96 * sqrt(v_oa) + oa_low = oa - me_oa + oa_high = oa + me_oa + return( + data.frame(oa_est = oa, oa_var = v_oa, oa_low = oa_low, oa_high = oa_high) + ) +} + +calc_ua_pa <- function(maparea, ma, propma = NULL) { + if (is.null(propma)) { + propma <- confusion_matrix(maparea = maparea, ma = ma) + } + dyn <- rownames(ma) + aoi <- sum(maparea) # calculate the area proportions for each map class + propmaparea <- maparea / aoi + ni <- rowSums(ma) # number of reference points per map class + + # estimate the accuracies + # user's accuracy (Eq. 2 in Olofsson et al. 2014) + # producer's accuracy (Eq. 3 in Olofsson et al. 2014) + pa <- diag(propma) / colSums(propma) + ua <- diag(propma) / rowSums(propma) + + # estimate confidence intervals for the accuracies + # variance of user's accuracy (Eq. 6 in Olofsson et al. 2014) + # variance of producer's accuracy (Eq. 7 in Olofsson et al. 2014) + v_ua <- ua * (1 - ua) / (ni - 1) + n_j <- vector(mode = "numeric", length = length(dyn)) + aftersumsign <- vector(mode = "numeric", length = length(dyn)) + for (cj in seq_len(length(dyn))) { + n_j[cj] <- sum(maparea / ni * ma[, cj], na.rm = TRUE) + aftersumsign[cj] <- sum(maparea[-cj]^2 * ma[-cj, cj] / ni[-cj] * + (1 - ma[-cj, cj] / ni[-cj]) / + (ni[-cj] - 1), na.rm = TRUE) + } + v_pa <- 1 / n_j^2 * (maparea^2 * (1 - pa)^2 * ua * (1 - ua) / (ni - 1) + + pa^2 * aftersumsign) + v_pa[is.nan(v_pa)] <- 0 + + ua_me <- 1.96 * sqrt(v_ua) + pa_me <- 1.96 * sqrt(v_pa) + ua_low <- ua - ua_me + ua_high <- ua + ua_me + pa_low <- pa - pa_me + pa_high <- pa + pa_me + + + return(tibble::tibble( + class = dyn, + ua_est = ua, + ua_var = v_ua, + ua_low = ua_low, + ua_high = ua_high, + pa_est = pa, + pa_var = v_pa, + pa_low = pa_low, + pa_high = pa_high)) +} + +calc_areas <- function(maparea, ma, pixelsize = 10, propma = NULL) { + if (is.null(propma)) { + propma <- confusion_matrix(maparea = maparea, ma = ma) + } + # Estimate area + # proportional area estimation + # proportion of area (Eq. 8 and 9 in Olofsson et al. 2014) + propareaest <- colSums(propma) + + # standard errors of the area estimation (Eq. 10 in Olofsson et al. 2014) + dyn <- rownames(ma) + ni <- rowSums(ma) + aoi <- sum(maparea) # calculate the area proportions for each map class + propmaparea <- maparea / aoi + v_propareaest <- vector(mode = "numeric", length = length(dyn)) + for (cj in seq_len(length(dyn))) { + v_propareaest[cj] <- sum((propmaparea * propma[, cj] - propma[, cj]^2) / + (ni + 0.001 - 1)) + # + 0.001 voor klassen met maar 1 punt + } + v_propareaest[is.na(v_propareaest)] <- 0 + me_propareaest <- 1.96 * sqrt(v_propareaest) + + out <- tibble::tibble( + class = dyn, + n_points = ni, + prop_est = propareaest, + prop_var = v_propareaest) |> + mutate( + prop_low = prop_est - me_propareaest, + prop_high = prop_est + me_propareaest, + prop_map_unadjusted = propmaparea, + area_pixelcount_ha = maparea * pixelsize, # in ha + area_est_ha = prop_est * aoi * pixelsize, # in ha + area_low_ha = area_est_ha - me_propareaest * aoi * pixelsize, # in ha + area_high_ha = area_est_ha + me_propareaest * aoi * pixelsize # in ha + ) + return(out) +} + + +#maparea is the surface area of each change class +#ma is the n_{ij} confusion matrix for the change classes +validation_uncertainty <- function(ma, maparea, pixelsize) { + dyn <- rownames(ma) + aoi <- sum(maparea) # calculate the area proportions for each map class + propmaparea <- maparea / aoi + ni <- rowSums(ma) # number of reference points per map class + propma <- confusion_matrix(maparea = maparea, ma = ma) pa <- diag(propma) / colSums(propma) # estimate the accuracies @@ -146,7 +268,7 @@ Validationuncertainty <- function(ma, maparea, pixelsize) { } -PlotValidationData <- function(ov){ +plot_validation_data <- function(ov){ plot_val <- ov %>% dplyr::select(class, area_ha, adj_area, ci_adj_area, ua, pa) %>% mutate(conf.low = adj_area - ci_adj_area, conf.high = adj_area + @@ -196,9 +318,14 @@ PlotValidationData <- function(ov){ return(bar) } -validationData <- function(){ - punten <- read_vc("validatiepunten", root = sprintf("%s/data/",here())) # Gevalideerde punten - combine <- read_vc("combine", root = sprintf("%s/data/",here())) # Combine van de validatieklassenkaart van 2013 +validation_data <- function(data_root){ + punten <- read_vc( + "validatiepunten", + root = file.path(data_root, "data")) # Gevalideerde punten + combine <- read_vc( + "combine", + root = file.path(data_root, "data")) + # Combine van de validatieklassenkaart van 2013 # en 2016 -> geeft de oppervlakte van de landgebruiksveranderingen en # van de stabiele klassen # Omzettingstabel landgebruiken @@ -251,12 +378,14 @@ validationData <- function(){ # 1. Puntenset ----- points <- punten %>% - na_if("") %>% + mutate( + across(where(is.factor), \(x) as.character(x) |> na_if("")) + ) %>% # Alle omzetten in NA gather(key = klasse, value = oordeel, X2013:change.9) %>% # Van breed formaat naar lang formaat separate(col = klasse, into = c("klasse", "eval"), sep = "\\.") %>% - replace_na(list(eval = 0)) %>% + replace_na(list(eval = "0")) %>% # Voeg evaluator toe drop_na() %>% # Alle NA laten vallen diff --git a/source/scripts/test_validation_functions.R b/source/scripts/test_validation_functions.R index 03ef807..6e8e077 100644 --- a/source/scripts/test_validation_functions.R +++ b/source/scripts/test_validation_functions.R @@ -1,4 +1,216 @@ +# code rmd NCA_validatingextent adapted, test only for nara map +git_root <- rprojroot::find_root(rprojroot::is_git_root) +source(file.path(git_root, "source/scripts/nca_functions.R")) +flea_data <- gsub( + pattern = "flea-extent", replacement = "flea-data", x = git_root) +library(tidyverse) +library(terra) +library(git2rdata) +library(plotly) +library(sf) +library(INBOtheme) +conflicted::conflicts_prefer(dplyr::filter) + +# load validation.Rdata +# this file was produced by +# validation_data(data_root = flea_data) +# but it is currently not working, probably due to different package versions +# anyway, not really needed for this script +load( + file.path( + flea_data, + "data/validation.Rdata" + ) +) + +# preprocessing steps +##################### +ls <- list( + nara2016 = terra::rast( + file.path(flea_data, "data/2016/LG2016_finaal_ori.tif")), + nara2013 = terra::rast( + file.path(flea_data, "data/2013/LG2013_finaal_ori.tif")) +) + +data <- ls %>% map(function(x) + cleanmapdata(data = x, + points_id = points_id, + tbltrans = tbltrans, + type = + ifelse(str_detect(names(x),"ori"), "nara", + ifelse(str_detect(names(x),"bos"), "bosw", + ifelse(str_detect(names(x),"BAK"), "bak", + ifelse(str_detect(names(x), "gras"), + "gras", "all")))), + year = as.numeric(gsub("[^0-9.-]", "", names(x))) + ) +) + +ls <- list( + nara = file.path(flea_data, "data/2013_2016/LG2013_LG2016_finaal_ori.csv")) +area <- ls %>% map(function(x) + Cleanchangeareadata(file = x, + tbltrans = tbltrans, + type = + ifelse(str_detect(x,"ori"), "nara", + ifelse(str_detect(x,"bos"), "bosw", + ifelse(str_detect(x,"BAK"), "bak", + ifelse(str_detect(x,"gras"), + "gras", "all")))) + ) +) +rm(ls) + +ttl <- list(nara = "the original land use map") + +this_map <- "nara" + +title <- ttl[[this_map]] +mapdata1 <- data[[str_c(this_map, "2013")]]$valid_eng +refdata1 <- as.factor(points_id$lu13oord_eng) +mapdata2 <- data[[str_c(this_map, "2016")]]$valid_eng +refdata2 <- as.factor(points_id$lu16oord_eng) +maparea <- area[[this_map]] + +# calculations +############## + +# map of 2013 +res1 <- calculate_accuracy(mapdata1, refdata1) +# OA +round(100*unname(res1$overall['Accuracy']), digits = 0) +# UA PA +# UA = precision (this does NOT equal specificity) = positive predictive value +# PA = recall = sensitivity = true positive fraction +t(res1$byClass) |> round(digits = 2) + +# map of 2016 +res2 <- calculate_accuracy(mapdata2, refdata2) +# OA +round(100*unname(res2$overall['Accuracy']), digits = 2) +# UA PA +# UA = precision (this does NOT equal specificity) = positive predictive value +# PA = recall = sensitivity = true positive fraction +t(res2$byClass) |> round(digits = 2) + +data.frame(users = c(res1$byClass[,5], res2$byClass[,5]), + producers = c(res1$byClass[,6], res2$byClass[,6]), + lu = rep(str_remove(rownames(res1$byClass), "Class: "), 2), + year = c(rep(2013, nrow(res1$byClass)), + rep(2016, nrow(res2$byClass)))) %>% + mutate(lu = as.factor(lu), + year = as.factor(year)) %>% + ggplot() + + geom_point(aes(x = users, y = producers, color = lu, shape = year)) + + scale_color_discrete(name = "land use") + + xlab("user's accuracy (precision)") + + ylab("producer's accuracy (recall)") + + theme_bw() + + +observed_changes <- data.frame( + map = as.factor(str_c(mapdata1, mapdata2, sep = "-")), + ref = as.factor(str_c(refdata1, refdata2, sep = "-")), + mapchange = as.factor(ifelse(mapdata1 == mapdata2, + "No change", "Change")), + refchange = as.factor(ifelse(refdata1 == refdata2, + "No change", "Change"))) %>% + mutate(map = factor(map, levels = sort(unique(c(levels(map), levels(ref))))), + map = factor(map, levels = levels(map))) +observed_changes %>% + pivot_longer(cols = 1:2, names_to = "type", values_to = "change") %>% + group_by(change) %>% + summarize(map = sum(type == "map"), + ref = sum(type == "ref")) %>% + ungroup() + +reschange1 <- calculate_accuracy(observed_changes$map, observed_changes$ref) +reschange2 <- calculate_accuracy( + observed_changes$mapchange, observed_changes$refchange) + +A <- reschange1$byClass[,c(1,2,5,6)] +rownames(A) <- str_remove(rownames(A), "Class: ") +A |> round(digits = 2) |> + knitr::kable( + caption = str_c( + "Accuracy of land use changes for ", + title, + " (only some accuracy measures are shown but others can be requested)."), + row.names = TRUE, + booktabs = TRUE) + +reschange2$table |> round(digits = 2) |> + knitr::kable( + caption = str_c( + "Confusion matrix for land use changes, + only comparing change / no change for ", + title, ". "), + row.names = TRUE, + booktabs = TRUE) + +b <- reschange2$byClass[c(1,2,5,6)] +b |> round(digits = 2) + +ov <- validation_uncertainty( + ma = as.data.frame.matrix(reschange1$table), + maparea = maparea$area, + pixelsize = 0.01)#each cell is 100 square meters = 0.01ha + +ov + +plot_validation_data(ov) + + +############################################################################## +############################################################################## + +cm <- confusion_matrix( + maparea = maparea$area, + ma = as.data.frame.matrix(reschange1$table) +) +dim(cm) +cm |> round(digits = 4) + +oa_df <- calc_oa( + maparea = maparea$area, + ma = as.data.frame.matrix(reschange1$table) +) +oa_df + +ua_pa_df <- calc_ua_pa( + maparea = maparea$area, + ma = as.data.frame.matrix(reschange1$table) +) +ua_pa_df + +ua_pa_df %>% + ggplot(aes(x = pa_est, y = ua_est)) + + geom_abline(alpha = 0.5) + + geom_point() + + ggrepel::geom_text_repel(aes(label = class), size = 2) + + geom_errorbar(aes(ymin = ua_low, ymax = ua_high), alpha = 0.3) + + geom_errorbarh(aes(xmin = pa_low, xmax = pa_high), alpha = 0.3) + + coord_equal(xlim = c(0, 1), ylim = c(0, 1)) + +areas_df <- calc_areas( + maparea = maparea$area, + ma = as.data.frame.matrix(reschange1$table) +) +areas_df + +areas_df %>% + mutate( + class = reorder(class, area_est_ha)) %>% + ggplot() + + geom_pointrange( + aes( + x = class, + y = area_est_ha, + ymin = area_low_ha, + ymax = area_high_ha, + colour = area_low_ha < 0 + )) + + scale_y_log10() + + coord_flip() -git_root <- rprojroot::find_root(rprojroot::is_git_root) -source(file.path(git_root, "NCA_validatingextend/src/NCA_functions.R"))