From 0b4ea7c9563d875d79526366378857f63f2a4144 Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Sun, 29 Oct 2023 21:56:16 +0100 Subject: [PATCH 1/4] Slight speedup in univariate_grid() --- R/utils_grid.R | 17 ++++++++++------- man/{H2.Rd => h2.Rd} | 2 +- man/{H2_overall.Rd => h2_overall.Rd} | 2 +- man/{H2_pairwise.Rd => h2_pairwise.Rd} | 2 +- man/{H2_threeway.Rd => h2_threeway.Rd} | 2 +- man/ice.Rd | 2 +- man/multivariate_grid.Rd | 2 +- man/partial_dep.Rd | 2 +- man/univariate_grid.Rd | 8 ++++---- 9 files changed, 21 insertions(+), 18 deletions(-) rename man/{H2.Rd => h2.Rd} (98%) rename man/{H2_overall.Rd => h2_overall.Rd} (98%) rename man/{H2_pairwise.Rd => h2_pairwise.Rd} (99%) rename man/{H2_threeway.Rd => h2_threeway.Rd} (98%) diff --git a/R/utils_grid.R b/R/utils_grid.R index 657a38ee..867408a6 100644 --- a/R/utils_grid.R +++ b/R/utils_grid.R @@ -14,7 +14,7 @@ #' If `strategy = "quantile"`, the evaluation points are quantiles over a regular grid #' of probabilities from `trim[1]` to `trim[2]`. #' -#' All quantiles are calculated via the inverse of the ECDF, i.e., via +#' Quantiles are calculated via the inverse of the ECDF, i.e., via #' `stats::quantile(..., type = 1`). #' #' @param z A vector or factor. @@ -24,7 +24,7 @@ #' of grid values. Set to `0:1` for no trimming. #' @param strategy How to find grid values of non-discrete numeric columns? #' Either "uniform" or "quantile", see description of [univariate_grid()]. -#' @param na.rm Should missing values be dropped from grid? Default is `TRUE`. +#' @param na.rm Should missing values be dropped from the grid? Default is `TRUE`. #' @returns A vector or factor of evaluation points. #' @seealso [multivariate_grid()] #' @export @@ -33,8 +33,8 @@ #' univariate_grid(rev(iris$Species)) # Same #' #' x <- iris$Sepal.Width -#' univariate_grid(x, grid_size = 5) # Quantile binning -#' univariate_grid(x, grid_size = 5, strategy = "uniform") # Uniform +#' univariate_grid(x, grid_size = 5) # Uniform binning +#' univariate_grid(x, grid_size = 5, strategy = "quantile") # Quantile univariate_grid <- function(z, grid_size = 49L, trim = c(0.01, 0.99), strategy = c("uniform", "quantile"), na.rm = TRUE) { strategy <- match.arg(strategy) @@ -50,9 +50,12 @@ univariate_grid <- function(z, grid_size = 49L, trim = c(0.01, 0.99), g <- stats::quantile(z, probs = p, names = FALSE, type = 1L, na.rm = TRUE) out <- unique(g) } else { - # strategy = "uniform" (could use range() if trim = 0:1) - r <- stats::quantile(z, probs = trim, names = FALSE, type = 1L, na.rm = TRUE) - # pretty(r, n = grid_size) # Until version 0.2.0 + # strategy = "uniform" + if (trim[1L] == 0 && trim[2L] == 1) { + r <- range(z, na.rm = TRUE) + } else { + r <- stats::quantile(z, probs = trim, names = FALSE, type = 1L, na.rm = TRUE) + } out <- seq(r[1L], r[2L], length.out = grid_size) } if (!na.rm && anyNA(z)) { diff --git a/man/H2.Rd b/man/h2.Rd similarity index 98% rename from man/H2.Rd rename to man/h2.Rd index 73280f70..cddcbe49 100644 --- a/man/H2.Rd +++ b/man/h2.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/H2.R +% Please edit documentation in R/h2.R \name{h2} \alias{h2} \alias{h2.default} diff --git a/man/H2_overall.Rd b/man/h2_overall.Rd similarity index 98% rename from man/H2_overall.Rd rename to man/h2_overall.Rd index 700af946..e5b0cf0d 100644 --- a/man/H2_overall.Rd +++ b/man/h2_overall.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/H2_overall.R +% Please edit documentation in R/h2_overall.R \name{h2_overall} \alias{h2_overall} \alias{h2_overall.default} diff --git a/man/H2_pairwise.Rd b/man/h2_pairwise.Rd similarity index 99% rename from man/H2_pairwise.Rd rename to man/h2_pairwise.Rd index 4f38c004..d2970c6a 100644 --- a/man/H2_pairwise.Rd +++ b/man/h2_pairwise.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/H2_pairwise.R +% Please edit documentation in R/h2_pairwise.R \name{h2_pairwise} \alias{h2_pairwise} \alias{h2_pairwise.default} diff --git a/man/H2_threeway.Rd b/man/h2_threeway.Rd similarity index 98% rename from man/H2_threeway.Rd rename to man/h2_threeway.Rd index fba0f84f..2a0540fe 100644 --- a/man/H2_threeway.Rd +++ b/man/h2_threeway.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/H2_threeway.R +% Please edit documentation in R/h2_threeway.R \name{h2_threeway} \alias{h2_threeway} \alias{h2_threeway.default} diff --git a/man/ice.Rd b/man/ice.Rd index df027fc9..b59330b3 100644 --- a/man/ice.Rd +++ b/man/ice.Rd @@ -107,7 +107,7 @@ of grid values. Set to \code{0:1} for no trimming.} \item{strategy}{How to find grid values of non-discrete numeric columns? Either "uniform" or "quantile", see description of \code{\link[=univariate_grid]{univariate_grid()}}.} -\item{na.rm}{Should missing values be dropped from grid? Default is \code{TRUE}.} +\item{na.rm}{Should missing values be dropped from the grid? Default is \code{TRUE}.} \item{n_max}{If \code{X} has more than \code{n_max} rows, a random sample of \code{n_max} rows is selected from \code{X}. In this case, set a random seed for reproducibility.} diff --git a/man/multivariate_grid.Rd b/man/multivariate_grid.Rd index 6db9bc47..42141030 100644 --- a/man/multivariate_grid.Rd +++ b/man/multivariate_grid.Rd @@ -25,7 +25,7 @@ of grid values. Set to \code{0:1} for no trimming.} \item{strategy}{How to find grid values of non-discrete numeric columns? Either "uniform" or "quantile", see description of \code{\link[=univariate_grid]{univariate_grid()}}.} -\item{na.rm}{Should missing values be dropped from grid? Default is \code{TRUE}.} +\item{na.rm}{Should missing values be dropped from the grid? Default is \code{TRUE}.} } \value{ A vector, matrix, or data.frame with evaluation points. diff --git a/man/partial_dep.Rd b/man/partial_dep.Rd index be96e6a2..96098d0d 100644 --- a/man/partial_dep.Rd +++ b/man/partial_dep.Rd @@ -121,7 +121,7 @@ of grid values. Set to \code{0:1} for no trimming.} \item{strategy}{How to find grid values of non-discrete numeric columns? Either "uniform" or "quantile", see description of \code{\link[=univariate_grid]{univariate_grid()}}.} -\item{na.rm}{Should missing values be dropped from grid? Default is \code{TRUE}.} +\item{na.rm}{Should missing values be dropped from the grid? Default is \code{TRUE}.} \item{n_max}{If \code{X} has more than \code{n_max} rows, a random sample of \code{n_max} rows is selected from \code{X}. In this case, set a random seed for reproducibility.} diff --git a/man/univariate_grid.Rd b/man/univariate_grid.Rd index 57925c89..ed33e1c7 100644 --- a/man/univariate_grid.Rd +++ b/man/univariate_grid.Rd @@ -24,7 +24,7 @@ of grid values. Set to \code{0:1} for no trimming.} \item{strategy}{How to find grid values of non-discrete numeric columns? Either "uniform" or "quantile", see description of \code{\link[=univariate_grid]{univariate_grid()}}.} -\item{na.rm}{Should missing values be dropped from grid? Default is \code{TRUE}.} +\item{na.rm}{Should missing values be dropped from the grid? Default is \code{TRUE}.} } \value{ A vector or factor of evaluation points. @@ -43,7 +43,7 @@ Set \code{trim = 0:1} for no trimming. If \code{strategy = "quantile"}, the evaluation points are quantiles over a regular grid of probabilities from \code{trim[1]} to \code{trim[2]}. -All quantiles are calculated via the inverse of the ECDF, i.e., via +Quantiles are calculated via the inverse of the ECDF, i.e., via \verb{stats::quantile(..., type = 1}). } \examples{ @@ -51,8 +51,8 @@ univariate_grid(iris$Species) univariate_grid(rev(iris$Species)) # Same x <- iris$Sepal.Width -univariate_grid(x, grid_size = 5) # Quantile binning -univariate_grid(x, grid_size = 5, strategy = "uniform") # Uniform +univariate_grid(x, grid_size = 5) # Uniform binning +univariate_grid(x, grid_size = 5, strategy = "quantile") # Quantile } \seealso{ \code{\link[=multivariate_grid]{multivariate_grid()}} From 0a1a7ddbbbc5c706efdbebcd23bdfb547fa19d8a Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Sun, 29 Oct 2023 22:29:25 +0100 Subject: [PATCH 2/4] draft of hist2() --- backlog/calibration.R | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/backlog/calibration.R b/backlog/calibration.R index 0c3174ed..2abf764f 100644 --- a/backlog/calibration.R +++ b/backlog/calibration.R @@ -281,3 +281,32 @@ plot.calibration <- function(x, } p } + +hist2 <- function(x, breaks = "Sturges", trim = c(0.01, 0.99), + include.lowest = TRUE, right = TRUE, na.rm = TRUE) { + if (!is.numeric(x)) { + g <- sort(unique(x), na.last = if (na.rm) NA else TRUE) + return(list(x = x, grid = g)) + } + + # Trim outliers before histogram construction? + if (trim[1L] == 0 && trim[2L] == 1) { + xx <- x + } else { + r <- stats::quantile(x, probs = trim, names = FALSE, type = 1L, na.rm = TRUE) + xx <- x[x >= r[1L] & x <= r[2L]] + } + h <- hist(xx, breaks = breaks, include.lowest = include.lowest, right = right) + g <- h[["mids"]] + ix <- findInterval( + x, + vec = h[["breaks"]], + left.open = right, + rightmost.closed = include.lowest, + all.inside = TRUE + ) + if (!na.rm && anyNA(x)) { + g <- c(g, NA) + } + list(x = g[ix], grid = g) +} From 2bbbc88fd28b4ccc99b26706623ad3fcc1673199 Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Mon, 30 Oct 2023 13:24:58 +0100 Subject: [PATCH 3/4] update gwColMeans --- R/average_loss.R | 2 +- R/utils_calculate.R | 37 ++++++++++++++--------------- backlog/calibration.R | 32 ++++++++++++++++--------- tests/testthat/test_calculate.R | 42 ++++++++++++++++----------------- 4 files changed, 61 insertions(+), 52 deletions(-) diff --git a/R/average_loss.R b/R/average_loss.R index bff65f9f..ee90ee92 100644 --- a/R/average_loss.R +++ b/R/average_loss.R @@ -93,7 +93,7 @@ average_loss.default <- function(object, X, y, # Real work L <- as.matrix(loss(y, pred_fun(object, X, ...))) - M <- gwColMeans(L, g = BY, w = w) + M <- gwColMeans(L, g = BY, w = w)[["M"]] if (agg_cols && ncol(M) > 1L) { M <- cbind(rowSums(M)) diff --git a/R/utils_calculate.R b/R/utils_calculate.R index aa581adc..1b836bdc 100644 --- a/R/utils_calculate.R +++ b/R/utils_calculate.R @@ -77,8 +77,9 @@ wcolMeans <- function(x, w = NULL) { #' Grouped wcolMeans() #' -#' Internal function used to calculate grouped column-wise weighted means. -#' +#' Internal function used to calculate grouped column-wise weighted means along with +#' corresponding (weighted) counts. +#' #' @noRd #' @keywords internal #' @@ -86,30 +87,28 @@ wcolMeans <- function(x, w = NULL) { #' @param g Optional grouping variable. #' @param w Optional case weights. #' @param reorder Should groups be ordered, see [rowsum()]. Default is `TRUE`. -#' @param mean_only If `TRUE`, only the means are returned. If `FALSE`, a list -#' with "mean", "num" and "denom" is returned. -#' @returns A matrix with one row per group (if `mean_only = TRUE`), or a list -#' with slots "mean", "num", and "denom". +#' @returns A list with two elements: "M" represents a matrix of grouped (column) +#' means, and "w" is a vector of corresponding group counts/weights. #' @examples #' with(iris, gwColMeans(Sepal.Width, g = Species, w = Sepal.Length)) -#' with(iris, gwColMeans(Sepal.Width, g = Species, w = Sepal.Length, mean_only = FALSE)) -gwColMeans <- function(x, g = NULL, w = NULL, reorder = TRUE, mean_only = TRUE) { +gwColMeans <- function(x, g = NULL, w = NULL, reorder = TRUE) { if (is.null(g)) { - return(rbind(wcolMeans(x, w = w))) + wg <- if (is.null(w)) NROW(x) else sum(w) + M <- rbind(wcolMeans(x, w = w)) + return(list(M = M, w = wg)) } + + # Now the interesting case if (is.null(w)) { - num <- rowsum(x, group = g, reorder = reorder) - denom <- rowsum(rep.int(1, NROW(x)), group = g, reorder = reorder) + w <- rep.int(1, NROW(x)) } else { - num <- rowsum(x * w, group = g, reorder = reorder) - denom <- rowsum(w, group = g, reorder = reorder) - } - out <- num / matrix(denom, nrow = nrow(num), ncol = ncol(num), byrow = FALSE) - - if (mean_only) { - return(out) + x <- x * w # w is correctly recycled over columns } - list(mean = out, num = num, denom = denom) + num <- rowsum(x, group = g, reorder = reorder) + wg <- rowsum(w, group = g, reorder = reorder) + p <- ncol(num) + denom <- if (p == 1L) wg else matrix(wg, nrow = nrow(num), ncol = p) + list(M = num / denom, w = as.numeric(wg)) } #' Weighted Mean Centering diff --git a/backlog/calibration.R b/backlog/calibration.R index 2abf764f..13b402b7 100644 --- a/backlog/calibration.R +++ b/backlog/calibration.R @@ -282,11 +282,24 @@ plot.calibration <- function(x, p } -hist2 <- function(x, breaks = "Sturges", trim = c(0.01, 0.99), +#' Histogram Bin Construction +#' +#' Creates histogram of vector/factor `x`. In the discrete case, no binning is done. +#' Otherwise, the values are optionally trimmed and then passed to [hist()]. Compared +#' with [hist()], the function also returns the binned values of `x`. +#' +#' @param x A vector or factor to be binned. +#' @inheritParams hist +#' @inheritParams univariate_grid +#' @returns A list with binned "x", vector of "breaks", bin midpoints "grid", and a +#' logical flag "discrete" indicating whether the values have not been binned. +#' @seealso See [calibration()] for examples. +hist2 <- function(x, breaks = 17L, trim = c(0.01, 0.99), include.lowest = TRUE, right = TRUE, na.rm = TRUE) { - if (!is.numeric(x)) { - g <- sort(unique(x), na.last = if (na.rm) NA else TRUE) - return(list(x = x, grid = g)) + g <- unique(x) + if (!is.numeric(x) || (length(breaks) == 1L && is.numeric(breaks) && length(g) <= breaks)) { + g <- sort(g, na.last = if (na.rm) NA else TRUE) + return(list(x = x, breaks = g, grid = g, discrete = TRUE)) } # Trim outliers before histogram construction? @@ -297,16 +310,13 @@ hist2 <- function(x, breaks = "Sturges", trim = c(0.01, 0.99), xx <- x[x >= r[1L] & x <= r[2L]] } h <- hist(xx, breaks = breaks, include.lowest = include.lowest, right = right) - g <- h[["mids"]] + b <- h$breaks ix <- findInterval( - x, - vec = h[["breaks"]], - left.open = right, - rightmost.closed = include.lowest, - all.inside = TRUE + x, vec = b, left.open = right, rightmost.closed = include.lowest, all.inside = TRUE ) + g <- h$mids if (!na.rm && anyNA(x)) { g <- c(g, NA) } - list(x = g[ix], grid = g) + list(x = g[ix], breaks = b, grid = g, discrete = FALSE) } diff --git a/tests/testthat/test_calculate.R b/tests/testthat/test_calculate.R index 77c67f1c..7d3893fa 100644 --- a/tests/testthat/test_calculate.R +++ b/tests/testthat/test_calculate.R @@ -71,31 +71,31 @@ test_that("gwcolMeans() works", { w2 <- 1:6 # Ungrouped - expect_equal(gwColMeans(x), rbind(wcolMeans(x))) - expect_equal(gwColMeans(x, w = w1), rbind(wcolMeans(x, w = w1))) - expect_equal(gwColMeans(x, w = w2), rbind(wcolMeans(x, w = w2))) + r <- gwColMeans(x) + expect_equal(r$M, rbind(wcolMeans(x))) + expect_equal(r$w, nrow(x)) - # Grouped - expect_equal(gwColMeans(x, g = g)[2L, ], wcolMeans(x[g == 2, ])) - expect_equal(gwColMeans(x, g = g, reorder = FALSE)[2L, ], wcolMeans(x[g == 1, ])) + r <- gwColMeans(x, w = w1) + expect_equal(r$M, rbind(wcolMeans(x, w = w1))) + expect_equal(r$w, sum(w1)) - g1 <- gwColMeans(x, g = g) - g2 <- gwColMeans(x, g = g, mean_only = FALSE) - g2_d <- matrix(g2$denom, nrow = 2, ncol = 2, byrow = FALSE) - expect_equal(g1, g2$num / g2_d) - expect_equal(g2$mean, g1) + r <- gwColMeans(x, w = w2) + expect_equal(r$M, rbind(wcolMeans(x, w = w2))) + expect_equal(r$w, sum(w2)) - # Grouped and weighted - expect_equal( - gwColMeans(x, g = g, w = w2)[2L, ], - wcolMeans(x[g == 2, ], w = w2[g == 2]) - ) + # Grouped + r <- gwColMeans(x, g = g) + expect_equal(r$M[2L, ], wcolMeans(x[g == 2, ])) + expect_equal(r$w, c(4, 2)) - g1 <- gwColMeans(x, g = g, w = w2) - g2 <- gwColMeans(x, g = g, w = w2, mean_only = FALSE) - g2_d <- matrix(g2$denom, nrow = 2, ncol = 2, byrow = FALSE) - expect_equal(g1, g2$num / g2_d) - expect_equal(g2$mean, g1) + r <- gwColMeans(x, g = g, reorder = FALSE) + expect_equal(r$M[2L, ], wcolMeans(x[g == 1, ])) + expect_equal(r$w, c(2, 4)) + + # Grouped and weighted + r <- gwColMeans(x, g = g, w = w2) + expect_equal(r$M[2L, ], wcolMeans(x[g == 2, ], w = w2[g == 2])) + expect_equal(r$w, c(sum(3:6), sum(1:2))) }) test_that("wcenter() works for matrices with > 1 columns", { From 590a001a6bb22bf9c4988011b3f5bd850df225fa Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Mon, 30 Oct 2023 17:08:52 +0100 Subject: [PATCH 4/4] simplified gwColMeans --- R/utils_calculate.R | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/R/utils_calculate.R b/R/utils_calculate.R index 1b836bdc..5be55bd0 100644 --- a/R/utils_calculate.R +++ b/R/utils_calculate.R @@ -93,9 +93,9 @@ wcolMeans <- function(x, w = NULL) { #' with(iris, gwColMeans(Sepal.Width, g = Species, w = Sepal.Length)) gwColMeans <- function(x, g = NULL, w = NULL, reorder = TRUE) { if (is.null(g)) { - wg <- if (is.null(w)) NROW(x) else sum(w) M <- rbind(wcolMeans(x, w = w)) - return(list(M = M, w = wg)) + denom <- if (is.null(w)) NROW(x) else sum(w) + return(list(M = M, w = denom)) } # Now the interesting case @@ -105,10 +105,8 @@ gwColMeans <- function(x, g = NULL, w = NULL, reorder = TRUE) { x <- x * w # w is correctly recycled over columns } num <- rowsum(x, group = g, reorder = reorder) - wg <- rowsum(w, group = g, reorder = reorder) - p <- ncol(num) - denom <- if (p == 1L) wg else matrix(wg, nrow = nrow(num), ncol = p) - list(M = num / denom, w = as.numeric(wg)) + denom <- as.numeric(rowsum(w, group = g, reorder = reorder)) + list(M = num / denom, w = denom) } #' Weighted Mean Centering