Merge pull request #83 from florianhartig/0.2.0.1

0.2.0.1
florianhartig · Aug 20, 2018 · 831c66b · 831c66b
2 parents 7127674 + 02c90cb
commit 831c66b
Show file tree

Hide file tree

Showing 12 changed files with 288 additions and 21 deletions.
diff --git a/Code/DHARMaPackageSupport/glmmTMB/glmmTMB.Rmd b/Code/DHARMaPackageSupport/glmmTMB/glmmTMB.Rmd
@@ -212,11 +212,12 @@ plot(res, asFactor = T)
 Example from the help
 
 ```{r}
-m <- glmmTMB(count~spp + mined + (1|site), zi=~spp + mined, family=nbinom2, Salamanders)
+m <- glmmTMB(count~spp + mined + (1|site), zi=~spp + mined, 
+             family=nbinom2, Salamanders)
 summary(m)
 
 res = simulateResiduals(m)
-plot(res, rank = T)
+plot(res)
 
 ### creating new data based on the fitted models. Residuals should now be perfect
 

diff --git a/Code/DHARMaPerformance/.gitignore b/Code/DHARMaPerformance/.gitignore
@@ -0,0 +1 @@
+*.png
diff --git a/Code/DHARMaPerformance/Power/TestPower.Rmd b/Code/DHARMaPerformance/Power/TestPower.Rmd
@@ -57,10 +57,11 @@ legend("bottomright", tests, col = 1:4, lty = 1:4)
 ```
 
 
-## Benchmarking DHARMa dispersiontests against pearson dispersiontest for glmms
 
+## Benchmarking DHARMa dispersiontests against AER dispersiontest for Poisson GLM
 
 ```{r}
+
 overdisp_fun <- function(model) {
   ## number of variance parameters in 
   ##   an n-by-n variance-covariance matrix
@@ -73,11 +74,43 @@ overdisp_fun <- function(model) {
   Pearson.chisq <- sum(rp^2)
   prat <- Pearson.chisq/rdf
   pval <- pchisq(Pearson.chisq, df=rdf, lower.tail=FALSE)
-  c(chisq=Pearson.chisq,ratio=prat,rdf=rdf,p=pval)
+  list(chisq=Pearson.chisq,ratio=prat,rdf=rdf,p=pval)
+}
+
+doCalculations <- function(control = 0){
+  testData = createData(sampleSize = 200, family = poisson(), overdispersion = control, randomEffectVariance = 1)
+  fittedModel <- glmer(observedResponse ~ Environment1 + (1|group), data = testData, family = poisson())
+ 
+  out = list()
+  
+  res <- simulateResiduals(fittedModel = fittedModel, n = 250)
+  out$uniformTest = testUniformity(res)$p.value  
+  out$Dispersion = testDispersion(res, plot = F)$p.value  
+  out$DispersionAER = overdisp_fun(fittedModel)$pval 
+  
+  res <- simulateResiduals(fittedModel = fittedModel, n = 250, refit = T)  
+  out$DispersionRefitted = testDispersion(res, plot = F)$p.value  
+  return(unlist(out))
 }
+
+# testing a single return
+doCalculations(control = 0.3)
+
+dispValues = seq(0,1.2, len = 5)
+
+# running benchmark
+out = runBenchmarks(doCalculations, controlValues = dispValues , nRep = 10,  parallel = F)
+
+tests = c("uniformity", "DHARMa disp basic" , "GLMER dispersiontest", "DHARMa disp refit")
+
+
+matplot(dispValues, t(out$summaries[,1,]), type = "l")
+legend("bottomright", tests, col = 1:4, lty = 1:4)
+
 ```
 
 
+
 # Autocorrelation tests
 
 

diff --git a/DHARMa/R/simulateResiduals.R b/DHARMa/R/simulateResiduals.R
@@ -181,8 +181,18 @@ simulateResiduals <- function(fittedModel, n = 250, refit = F, integerResponse =
   ########### Wrapup ############
 
   out$scaledResidualsNormal = qnorm(out$scaledResiduals + 0.00 )
-
 
+  # outliers = list()
+  # outliers$fracLowerThanAllSimulated = sum(out$scaledResiduals == 0) / out$nSim
+  # outliers$fracHigherThanAllSimulated = sum(out$scaledResiduals == 1) / out$nSim
+  # 
+  # # see http://www.di.fc.ul.pt/~jpn/r/prob/range.html 
+  # # 1−(1−x)n
+  # outliers$samplingDist = ecdf(replicate(10000, min(runif(out$nSim))))
+  #   
+  # outliers$plowerThanAllSimulated = 1-outliers$samplingDist(fracLowerThanAllSimulated)
+  # outliers$plowerThanAllSimulated = 1-outliers$samplingDist(fracLowerThanAllSimulated)
+
   out$time = proc.time() - ptm
   out$randomState = randomState
 
@@ -301,6 +311,7 @@ recalculateResiduals <- function(simulationOutput, group = NULL, aggregateBy = s
   if(!is.null(simulationOutput$original)) simulationOutput = simulationOutput$original
 
   out = list()
+  out$original = simulationOutput
 
   if(is.null(group)) return(simulationOutput)
   else group =as.factor(group)
@@ -311,37 +322,35 @@ recalculateResiduals <- function(simulationOutput, group = NULL, aggregateBy = s
   out$observedResponse = aggregateByGroup(simulationOutput$observedResponse)
   out$fittedPredictedResponse = aggregateByGroup(simulationOutput$fittedPredictedResponse)
   out$simulatedResponse = apply(simulationOutput$simulatedResponse, 2, aggregateByGroup)
-  out$scaledResiduals = rep(NA, out$nGroups)
-
+
   if (simulationOutput$refit == F){
-    if(simulationOutput$integerResponse == T){
-      for (i in 1:out$nGroups) out$scaledResiduals[i] <- ecdf(out$simulatedResponse[i,] + runif(out$nGroups, -0.5, 0.5))(out$observedResponse[i] + runif(1, -0.5, 0.5))
-    } else {
-      for (i in 1:out$nGroups) out$scaledResiduals[i] <- ecdf(out$simulatedResponse[i,])(out$observedResponse[i])
-    } 
+
+    out$scaledResiduals = getQuantile(simulations = out$simulatedResponse , observed = out$observedResponse , n = out$nGroups, nSim = simulationOutput$nSim, integerResponse = simulationOutput$integerResponse)
+
   ######## refit = T ##################   
   } else {
 
     out$refittedPredictedResponse <- apply(simulationOutput$refittedPredictedResponse, 2, aggregateByGroup)
     out$fittedResiduals = aggregateByGroup(simulationOutput$fittedResiduals)
     out$refittedResiduals = apply(simulationOutput$refittedResiduals, 2, aggregateByGroup)
     out$refittedPearsonResiduals = apply(simulationOutput$refittedPearsonResiduals, 2, aggregateByGroup)
-
-    if(simulationOutput$integerResponse == T){
-      for (i in 1:out$nGroups) out$scaledResiduals[i] <- ecdf(out$refittedResiduals[i,] + runif(out$nGroups, -0.5, 0.5))(out$fittedResiduals[i] + runif(1, -0.5, 0.5))
-    } else {
-      for (i in 1:out$nGroups) out$scaledResiduals[i] <- ecdf(out$refittedResiduals[i,])(out$fittedResiduals[i])
-    } 
+
+    out$scaledResiduals = getQuantile(simulations = out$refittedResiduals , observed = out$fittedResiduals , n = out$nGroups, nSim = simulationOutput$nSim, integerResponse = simulationOutput$integerResponse)
+
   }
+
   # hack - the c here will result in both old and new outputs to be present resulting output, but a named access should refer to the new, grouped calculations
+  # question to myself - what's the use of that, why not erase the old outputs? they are anyway saved in the old object
+
   out$aggregateByGroup = aggregateByGroup
   out = c(out, simulationOutput)
-  out$original = simulationOutput
   class(out) = "DHARMa"
   return(out)
 }
 
-
+#' Quantile calculations
+#' 
+#' @keywords internal
 getQuantile <- function(simulations, observed, n, nSim, integerResponse){
 
   scaledResiduals = rep(NA, n)

diff --git a/DHARMa/R/tests.R b/DHARMa/R/tests.R
@@ -211,6 +211,8 @@ testGeneric <- function(simulationOutput, summary, alternative = c("two.sided",
 #' @example inst/examples/testTemporalAutocorrelationHelp.R
 #' @export
 testTemporalAutocorrelation <- function(simulationOutput, time = NULL , alternative = c("two.sided", "greater", "less"), plot = T){
+  # actually not sure if this is neccessary for dwtest, but seems better to aggregate
+  if(any(duplicated(time))) stop("testing for temporal autocorrelation requires unique time values - if you have several observations per location, use the recalculateResiduals function to aggregate residuals per location")
 
   alternative <- match.arg(alternative)
 
@@ -250,6 +252,8 @@ testTemporalAutocorrelation <- function(simulationOutput, time = NULL , alternat
 #' @export
 testSpatialAutocorrelation <- function(simulationOutput, x = NULL, y  = NULL, distMat = NULL, alternative = c("two.sided", "greater", "less"), plot = T){
 
+  if(any(duplicated(cbind(x,y)))) stop("testing for spatial autocorrelation requires unique x,y values - if you have several observations per location, use the recalculateResiduals function to aggregate residuals per location")
+
   alternative <- match.arg(alternative)
 
   if( !is.null(x) & !is.null(distMat) ) warning("coordinates and distMat provided, coordinates will only be used for plotting")

diff --git a/DHARMa/inst/examples/createDharmaHelp.R b/DHARMa/inst/examples/createDharmaHelp.R
@@ -1,3 +1,32 @@
+## READING IN HAND-CODED SIMULATIONS
+
+testData = createData(sampleSize = 50, randomEffectVariance = 0)
+fittedModel <- glm(observedResponse ~ Environment1, data = testData, family = "poisson")
+
+# in DHARMA, using the simulate.glm function of glm 
+sims = simulateResiduals(fittedModel)
+plot(sims, quantreg = FALSE)
+
+# Doing the same with a handcode simulate function. 
+# of course this code will only work with a 1-par glm model
+simulateMyfit <- function(n=10, fittedModel){
+  int = coef(fittedModel)[1]
+  slo = coef(fittedModel)[2]
+  pred = exp(int + slo * testData$Environment1)
+  predSim = replicate(n, rpois(length(pred), pred))
+  return(predSim)
+}
+
+sims = simulateMyfit(250, fittedModel)
+
+dharmaRes <- createDHARMa(simulatedResponse = sims, 
+                          observedResponse = testData$observedResponse, 
+                          fittedPredictedResponse = predict(fittedModel, type = "response"), 
+                          integer = TRUE)
+plot(dharmaRes, quantreg = FALSE)
+
+## A BAYESIAN EXAMPLE
+
 \dontrun{
 
   # This example shows how to check the residuals for a 

diff --git a/DHARMa/inst/examples/testSpatialAutocorrelationHelp.R b/DHARMa/inst/examples/testSpatialAutocorrelationHelp.R
@@ -11,3 +11,43 @@ testSpatialAutocorrelation(res)
 # Alternatively, one can provide a distance matrix
 dM = as.matrix(dist(cbind(testData$x, testData$y)))
 testSpatialAutocorrelation(res, distMat = dM)
+
+# carefull with clustered data and conditional / unconditional simulations
+# this originates from https://github.com/florianhartig/DHARMa/issues/81
+
+# Assume our data is divided into clusters, and we use a RE to take out cluster effects
+
+clusters = 100
+subsamples = 10
+size = clusters * subsamples
+
+testData = createData(sampleSize = size, family = gaussian(), numGroups = clusters )
+testData$x  = rnorm(clusters)[testData$group] + rnorm(size, sd = 0.01)
+testData$y  = rnorm(clusters)[testData$group] + rnorm(size, sd = 0.01)
+
+library(lme4)
+fittedModel <- lmer(observedResponse ~ Environment1 + (1|group), data = testData)
+
+# DHARMa default is to re-simulted REs - this means spatial pattern remains
+# because residuals are still clustered
+
+res = simulateResiduals(fittedModel)
+testSpatialAutocorrelation(res, x =  testData$x, y = testData$y)
+
+# However, it should disappear if you just calculate an aggregate residuals per cluster
+# Because at least how the data are simualted, cluster are spatially independent
+
+res2 = recalculateResiduals(res, group = testData$group)
+testSpatialAutocorrelation(res2, 
+                           x =  aggregate(testData$x, list(testData$group), mean)$x, 
+                           y = aggregate(testData$y, list(testData$group), mean)$x)
+
+# For lme4, possible to simulated residuals conditional on fitted REs (re.form)
+# This takes out most of the RSA - a remainder is probably due the shrinkage
+# of the REs
+
+res = simulateResiduals(fittedModel, re.form = NULL)
+testSpatialAutocorrelation(res, x =  testData$x, y = testData$y)
+
+
+
diff --git a/DHARMa/man/createDHARMa.Rd b/DHARMa/man/createDHARMa.Rd
diff --git a/DHARMa/man/getQuantile.Rd b/DHARMa/man/getQuantile.Rd
diff --git a/DHARMa/man/testSpatialAutocorrelation.Rd b/DHARMa/man/testSpatialAutocorrelation.Rd
diff --git a/DHARMa/tests/testthat/testRecalculateResiduals.R b/DHARMa/tests/testthat/testRecalculateResiduals.R
@@ -0,0 +1,22 @@
+
+
+testData = createData(sampleSize = 200, family = poisson(), numGroups = 20, randomEffectVariance = 0)
+
+#len = sum(testData$group == 1)
+#testData$group[testData$group == 1] = sample(c(1,21),20, replace = T)
+
+fittedModel <- glm(observedResponse ~ Environment1, family = "poisson", data = testData)
+
+simulationOutput = simulateResiduals(fittedModel)
+plot(simulationOutput)
+dim(simulationOutput$simulatedResponse)
+
+simulationOutput = recalculateResiduals(simulationOutput, group = testData$group)
+plot(simulationOutput)
+
+
+testData = createData(sampleSize = 200, family = gaussian())
+fittedModel <- glm(observedResponse ~ Environment1, family = "gaussian", data = testData)
+simulationOutput = recalculateResiduals(simulationOutput, group = testData$group)
+
+