From 67346dc8bf29edb24d0ffbec7e12e7f5d20fdfe4 Mon Sep 17 00:00:00 2001
From: hansvancalster <hans.vancalster@inbo.be>
Date: Tue, 24 Sep 2024 15:37:56 +0200
Subject: [PATCH] further test validation scripts

---
 source/scripts/nca_functions.R             | 153 +++++++++++++--
 source/scripts/test_validation_functions.R | 216 ++++++++++++++++++++-
 2 files changed, 355 insertions(+), 14 deletions(-)

diff --git a/source/scripts/nca_functions.R b/source/scripts/nca_functions.R
index 6a94bbc..30b6b89 100644
--- a/source/scripts/nca_functions.R
+++ b/source/scripts/nca_functions.R
@@ -54,7 +54,7 @@ Cleanchangeareadata <- function(file, tbltrans, type){
 #mapdata <- nara13$valid
 #refdata <- as.factor(points_id$lu13oord)
 #both arrays need to be factor variables with the same levels.
-CalculateAccuracy <- function(mapdata, refdata){
+calculate_accuracy <- function(mapdata, refdata){
   assert_that(length(mapdata) == length(refdata),
               msg = "Length of the arrays is not equal")
   assert_that(nlevels(mapdata) == nlevels(refdata) &
@@ -67,11 +67,7 @@ CalculateAccuracy <- function(mapdata, refdata){
   return(conf)
 }
 
-
-#maparea is the surface area of each change class
-#ma is the confusion matrix for the change classes
-Validationuncertainty <- function(ma, maparea, pixelsize) {
-  dyn <- rownames(ma)
+confusion_matrix <- function(maparea, ma) {
   aoi <- sum(maparea) # calculate the area proportions for each map class
   propmaparea <- maparea / aoi
 
@@ -79,6 +75,132 @@ Validationuncertainty <- function(ma, maparea, pixelsize) {
   ni <- rowSums(ma) # number of reference points per map class
   propma <- as.matrix(ma / ni * propmaparea)
   propma[is.nan(propma)] <- 0 # for classes with ni. = 0
+  return(propma)
+}
+
+
+
+calc_oa <- function(maparea, ma, propma = NULL) {
+  if (is.null(propma)) {
+    propma <- confusion_matrix(maparea = maparea, ma = ma)
+  }
+  # overall accuracy (Eq. 1 in Olofsson et al. 2014)
+  oa <- sum(diag(propma))
+
+  # variance of overall accuracy (Eq. 5 in Olofsson et al. 2014)
+  ni <- rowSums(ma) # number of reference points per map class
+  aoi <- sum(maparea) # calculate the area proportions for each map class
+  propmaparea <- maparea / aoi
+  ua <- diag(propma) / rowSums(propma)
+  v_oa <- sum(propmaparea^2 * ua * (1 - ua) / (ni - 1), na.rm = TRUE)
+  me_oa <- 1.96 * sqrt(v_oa)
+  oa_low = oa - me_oa
+  oa_high = oa + me_oa
+  return(
+    data.frame(oa_est = oa, oa_var = v_oa, oa_low = oa_low, oa_high = oa_high)
+  )
+}
+
+calc_ua_pa <- function(maparea, ma, propma = NULL) {
+  if (is.null(propma)) {
+    propma <- confusion_matrix(maparea = maparea, ma = ma)
+  }
+  dyn <- rownames(ma)
+  aoi <- sum(maparea) # calculate the area proportions for each map class
+  propmaparea <- maparea / aoi
+  ni <- rowSums(ma) # number of reference points per map class
+
+  # estimate the accuracies
+  # user's accuracy (Eq. 2 in Olofsson et al. 2014)
+  # producer's accuracy (Eq. 3 in Olofsson et al. 2014)
+  pa <- diag(propma) / colSums(propma)
+  ua <- diag(propma) / rowSums(propma)
+
+  # estimate confidence intervals for the accuracies
+  # variance of user's accuracy (Eq. 6 in Olofsson et al. 2014)
+  # variance of producer's accuracy (Eq. 7 in Olofsson et al. 2014)
+  v_ua <- ua * (1 - ua) / (ni - 1)
+  n_j <- vector(mode = "numeric", length = length(dyn))
+  aftersumsign <- vector(mode = "numeric", length = length(dyn))
+  for (cj in seq_len(length(dyn))) {
+    n_j[cj] <- sum(maparea / ni * ma[, cj], na.rm = TRUE)
+    aftersumsign[cj] <- sum(maparea[-cj]^2 * ma[-cj, cj] / ni[-cj] *
+                              (1 - ma[-cj, cj] / ni[-cj]) /
+                              (ni[-cj] - 1), na.rm = TRUE)
+  }
+  v_pa <- 1 / n_j^2 * (maparea^2 * (1 - pa)^2 * ua * (1 - ua) / (ni - 1) +
+                         pa^2 * aftersumsign)
+  v_pa[is.nan(v_pa)] <- 0
+
+  ua_me <- 1.96 * sqrt(v_ua)
+  pa_me <- 1.96 * sqrt(v_pa)
+  ua_low <- ua - ua_me
+  ua_high <- ua + ua_me
+  pa_low <- pa - pa_me
+  pa_high <- pa + pa_me
+
+
+  return(tibble::tibble(
+    class = dyn,
+    ua_est = ua,
+    ua_var = v_ua,
+    ua_low = ua_low,
+    ua_high = ua_high,
+    pa_est = pa,
+    pa_var = v_pa,
+    pa_low = pa_low,
+    pa_high = pa_high))
+}
+
+calc_areas <- function(maparea, ma, pixelsize = 10, propma = NULL) {
+  if (is.null(propma)) {
+    propma <- confusion_matrix(maparea = maparea, ma = ma)
+  }
+  # Estimate area
+  # proportional area estimation
+  # proportion of area (Eq. 8 and 9 in Olofsson et al. 2014)
+  propareaest <- colSums(propma)
+
+  # standard errors of the area estimation (Eq. 10 in Olofsson et al. 2014)
+  dyn <- rownames(ma)
+  ni <- rowSums(ma)
+  aoi <- sum(maparea) # calculate the area proportions for each map class
+  propmaparea <- maparea / aoi
+  v_propareaest <- vector(mode = "numeric", length = length(dyn))
+  for (cj in seq_len(length(dyn))) {
+    v_propareaest[cj] <- sum((propmaparea * propma[, cj] - propma[, cj]^2) /
+                               (ni + 0.001 - 1))
+    # + 0.001 voor klassen met maar 1 punt
+  }
+  v_propareaest[is.na(v_propareaest)] <- 0
+  me_propareaest <- 1.96 * sqrt(v_propareaest)
+
+  out <- tibble::tibble(
+    class = dyn,
+    n_points = ni,
+    prop_est = propareaest,
+    prop_var = v_propareaest) |>
+    mutate(
+      prop_low = prop_est - me_propareaest,
+      prop_high = prop_est + me_propareaest,
+      prop_map_unadjusted = propmaparea,
+      area_pixelcount_ha = maparea * pixelsize, # in ha
+      area_est_ha = prop_est * aoi * pixelsize, # in ha
+      area_low_ha = area_est_ha - me_propareaest * aoi * pixelsize, # in ha
+      area_high_ha = area_est_ha + me_propareaest * aoi * pixelsize # in ha
+  )
+  return(out)
+}
+
+
+#maparea is the surface area of each change class
+#ma is the n_{ij} confusion matrix for the change classes
+validation_uncertainty <- function(ma, maparea, pixelsize) {
+  dyn <- rownames(ma)
+  aoi <- sum(maparea) # calculate the area proportions for each map class
+  propmaparea <- maparea / aoi
+  ni <- rowSums(ma) # number of reference points per map class
+  propma <- confusion_matrix(maparea = maparea, ma = ma)
 
    pa <- diag(propma) / colSums(propma)
  # estimate the accuracies
@@ -146,7 +268,7 @@ Validationuncertainty <- function(ma, maparea, pixelsize) {
 }
 
 
-PlotValidationData <- function(ov){
+plot_validation_data <- function(ov){
   plot_val <- ov %>%
     dplyr::select(class, area_ha, adj_area, ci_adj_area, ua, pa) %>%
     mutate(conf.low = adj_area - ci_adj_area, conf.high = adj_area +
@@ -196,9 +318,14 @@ PlotValidationData <- function(ov){
   return(bar)
 }
 
-validationData <- function(){
-  punten <- read_vc("validatiepunten", root = sprintf("%s/data/",here())) # Gevalideerde punten
-  combine <- read_vc("combine", root = sprintf("%s/data/",here())) # Combine van de validatieklassenkaart van 2013
+validation_data <- function(data_root){
+  punten <- read_vc(
+    "validatiepunten",
+    root = file.path(data_root, "data")) # Gevalideerde punten
+  combine <- read_vc(
+    "combine",
+    root = file.path(data_root, "data"))
+  # Combine van de validatieklassenkaart van 2013
   # en 2016 -> geeft de oppervlakte van de landgebruiksveranderingen en
   # van de stabiele klassen
   # Omzettingstabel landgebruiken
@@ -251,12 +378,14 @@ validationData <- function(){
 
   # 1. Puntenset -----
   points <- punten %>%
-    na_if("<Null>") %>%
+    mutate(
+      across(where(is.factor), \(x) as.character(x) |> na_if("<Null>"))
+    ) %>%
     # Alle <Null> omzetten in NA
     gather(key = klasse, value = oordeel, X2013:change.9) %>%
     # Van breed formaat naar lang formaat
     separate(col = klasse, into = c("klasse", "eval"), sep = "\\.") %>%
-    replace_na(list(eval = 0)) %>%
+    replace_na(list(eval = "0")) %>%
     # Voeg evaluator toe
     drop_na() %>%
     # Alle NA laten vallen
diff --git a/source/scripts/test_validation_functions.R b/source/scripts/test_validation_functions.R
index 03ef807..6e8e077 100644
--- a/source/scripts/test_validation_functions.R
+++ b/source/scripts/test_validation_functions.R
@@ -1,4 +1,216 @@
+# code rmd NCA_validatingextent adapted, test only for nara map
+git_root <- rprojroot::find_root(rprojroot::is_git_root)
+source(file.path(git_root, "source/scripts/nca_functions.R"))
+flea_data <- gsub(
+  pattern = "flea-extent", replacement = "flea-data", x = git_root)
+library(tidyverse)
+library(terra)
+library(git2rdata)
+library(plotly)
+library(sf)
+library(INBOtheme)
 
+conflicted::conflicts_prefer(dplyr::filter)
+
+# load validation.Rdata
+# this file was produced by
+# validation_data(data_root = flea_data)
+# but it is currently not working, probably due to different package versions
+# anyway, not really needed for this script
+load(
+  file.path(
+    flea_data,
+    "data/validation.Rdata"
+  )
+)
+
+# preprocessing steps
+#####################
+ls <- list(
+  nara2016 = terra::rast(
+    file.path(flea_data, "data/2016/LG2016_finaal_ori.tif")),
+  nara2013 = terra::rast(
+    file.path(flea_data, "data/2013/LG2013_finaal_ori.tif"))
+)
+
+data <- ls %>% map(function(x)
+  cleanmapdata(data = x,
+               points_id = points_id,
+               tbltrans = tbltrans,
+               type =
+                 ifelse(str_detect(names(x),"ori"), "nara",
+                        ifelse(str_detect(names(x),"bos"), "bosw",
+                               ifelse(str_detect(names(x),"BAK"), "bak",
+                                      ifelse(str_detect(names(x), "gras"),
+                                             "gras", "all")))),
+               year = as.numeric(gsub("[^0-9.-]", "", names(x)))
+  )
+)
+
+ls <- list(
+  nara = file.path(flea_data, "data/2013_2016/LG2013_LG2016_finaal_ori.csv"))
+area <- ls %>% map(function(x)
+  Cleanchangeareadata(file = x,
+                      tbltrans = tbltrans,
+                      type =
+                        ifelse(str_detect(x,"ori"), "nara",
+                               ifelse(str_detect(x,"bos"), "bosw",
+                                      ifelse(str_detect(x,"BAK"), "bak",
+                                             ifelse(str_detect(x,"gras"),
+                                                    "gras", "all"))))
+  )
+)
+rm(ls)
+
+ttl <- list(nara = "the original land use map")
+
+this_map <- "nara"
+
+title <- ttl[[this_map]]
+mapdata1 <- data[[str_c(this_map, "2013")]]$valid_eng
+refdata1 <- as.factor(points_id$lu13oord_eng)
+mapdata2 <- data[[str_c(this_map, "2016")]]$valid_eng
+refdata2 <- as.factor(points_id$lu16oord_eng)
+maparea <- area[[this_map]]
+
+# calculations
+##############
+
+# map of 2013
+res1 <- calculate_accuracy(mapdata1, refdata1)
+# OA
+round(100*unname(res1$overall['Accuracy']), digits = 0)
+# UA PA
+# UA = precision (this does NOT equal specificity) = positive predictive value
+# PA = recall = sensitivity = true positive fraction
+t(res1$byClass) |> round(digits = 2)
+
+# map of 2016
+res2 <- calculate_accuracy(mapdata2, refdata2)
+# OA
+round(100*unname(res2$overall['Accuracy']), digits = 2)
+# UA PA
+# UA = precision (this does NOT equal specificity) = positive predictive value
+# PA = recall = sensitivity = true positive fraction
+t(res2$byClass) |> round(digits = 2)
+
+data.frame(users = c(res1$byClass[,5], res2$byClass[,5]),
+           producers = c(res1$byClass[,6], res2$byClass[,6]),
+           lu = rep(str_remove(rownames(res1$byClass), "Class: "), 2),
+           year = c(rep(2013, nrow(res1$byClass)),
+                    rep(2016, nrow(res2$byClass)))) %>%
+  mutate(lu = as.factor(lu),
+         year = as.factor(year)) %>%
+  ggplot() +
+  geom_point(aes(x = users, y = producers, color = lu, shape = year)) +
+  scale_color_discrete(name = "land use") +
+  xlab("user's accuracy (precision)") +
+  ylab("producer's accuracy (recall)") +
+  theme_bw()
+
+
+observed_changes <- data.frame(
+  map = as.factor(str_c(mapdata1, mapdata2, sep = "-")),
+  ref = as.factor(str_c(refdata1, refdata2, sep = "-")),
+  mapchange = as.factor(ifelse(mapdata1 == mapdata2,
+                               "No change", "Change")),
+  refchange = as.factor(ifelse(refdata1 == refdata2,
+                               "No change", "Change"))) %>%
+  mutate(map = factor(map, levels = sort(unique(c(levels(map), levels(ref))))),
+         map = factor(map, levels = levels(map)))
+observed_changes %>%
+  pivot_longer(cols = 1:2, names_to = "type", values_to = "change") %>%
+  group_by(change) %>%
+  summarize(map = sum(type == "map"),
+            ref = sum(type == "ref")) %>%
+  ungroup()
+
+reschange1 <- calculate_accuracy(observed_changes$map, observed_changes$ref)
+reschange2 <- calculate_accuracy(
+  observed_changes$mapchange, observed_changes$refchange)
+
+A <- reschange1$byClass[,c(1,2,5,6)]
+rownames(A) <- str_remove(rownames(A), "Class: ")
+A |> round(digits = 2) |>
+  knitr::kable(
+    caption = str_c(
+      "Accuracy of land use changes  for ",
+      title,
+      " (only some accuracy measures are shown but others can be requested)."),
+    row.names = TRUE,
+    booktabs = TRUE)
+
+reschange2$table |> round(digits = 2) |>
+  knitr::kable(
+    caption = str_c(
+      "Confusion matrix for land use changes,
+      only comparing change / no change for ",
+      title, ". "),
+    row.names = TRUE,
+    booktabs = TRUE)
+
+b <- reschange2$byClass[c(1,2,5,6)]
+b |> round(digits = 2)
+
+ov <- validation_uncertainty(
+  ma = as.data.frame.matrix(reschange1$table),
+  maparea = maparea$area,
+  pixelsize = 0.01)#each cell is 100 square meters = 0.01ha
+
+ov
+
+plot_validation_data(ov)
+
+
+##############################################################################
+##############################################################################
+
+cm <- confusion_matrix(
+  maparea = maparea$area,
+  ma = as.data.frame.matrix(reschange1$table)
+)
+dim(cm)
+cm |> round(digits = 4)
+
+oa_df <- calc_oa(
+  maparea = maparea$area,
+  ma = as.data.frame.matrix(reschange1$table)
+)
+oa_df
+
+ua_pa_df <- calc_ua_pa(
+  maparea = maparea$area,
+  ma = as.data.frame.matrix(reschange1$table)
+)
+ua_pa_df
+
+ua_pa_df %>%
+  ggplot(aes(x = pa_est, y = ua_est)) +
+  geom_abline(alpha = 0.5) +
+  geom_point() +
+  ggrepel::geom_text_repel(aes(label = class), size = 2) +
+  geom_errorbar(aes(ymin = ua_low, ymax = ua_high), alpha = 0.3) +
+  geom_errorbarh(aes(xmin = pa_low, xmax = pa_high), alpha = 0.3) +
+  coord_equal(xlim = c(0, 1), ylim = c(0, 1))
+
+areas_df <- calc_areas(
+  maparea = maparea$area,
+  ma = as.data.frame.matrix(reschange1$table)
+)
+areas_df
+
+areas_df %>%
+  mutate(
+    class = reorder(class, area_est_ha)) %>%
+  ggplot() +
+  geom_pointrange(
+    aes(
+      x = class,
+      y = area_est_ha,
+      ymin = area_low_ha,
+      ymax = area_high_ha,
+      colour  = area_low_ha < 0
+    )) +
+  scale_y_log10() +
+  coord_flip()
 
-git_root <- rprojroot::find_root(rprojroot::is_git_root)
-source(file.path(git_root, "NCA_validatingextend/src/NCA_functions.R"))