From 0b4ea7c9563d875d79526366378857f63f2a4144 Mon Sep 17 00:00:00 2001
From: Michael Mayer <mayermichael79@gmail.com>
Date: Sun, 29 Oct 2023 21:56:16 +0100
Subject: [PATCH 1/4] Slight speedup in univariate_grid()

---
 R/utils_grid.R                         | 17 ++++++++++-------
 man/{H2.Rd => h2.Rd}                   |  2 +-
 man/{H2_overall.Rd => h2_overall.Rd}   |  2 +-
 man/{H2_pairwise.Rd => h2_pairwise.Rd} |  2 +-
 man/{H2_threeway.Rd => h2_threeway.Rd} |  2 +-
 man/ice.Rd                             |  2 +-
 man/multivariate_grid.Rd               |  2 +-
 man/partial_dep.Rd                     |  2 +-
 man/univariate_grid.Rd                 |  8 ++++----
 9 files changed, 21 insertions(+), 18 deletions(-)
 rename man/{H2.Rd => h2.Rd} (98%)
 rename man/{H2_overall.Rd => h2_overall.Rd} (98%)
 rename man/{H2_pairwise.Rd => h2_pairwise.Rd} (99%)
 rename man/{H2_threeway.Rd => h2_threeway.Rd} (98%)

diff --git a/R/utils_grid.R b/R/utils_grid.R
index 657a38ee..867408a6 100644
--- a/R/utils_grid.R
+++ b/R/utils_grid.R
@@ -14,7 +14,7 @@
 #' If `strategy = "quantile"`, the evaluation points are quantiles over a regular grid 
 #' of probabilities from `trim[1]` to `trim[2]`.
 #' 
-#' All quantiles are calculated via the inverse of the ECDF, i.e., via
+#' Quantiles are calculated via the inverse of the ECDF, i.e., via
 #' `stats::quantile(..., type = 1`).
 #' 
 #' @param z A vector or factor.
@@ -24,7 +24,7 @@
 #'   of grid values. Set to `0:1` for no trimming.
 #' @param strategy How to find grid values of non-discrete numeric columns? 
 #'   Either "uniform" or "quantile", see description of [univariate_grid()].
-#' @param na.rm Should missing values be dropped from grid? Default is `TRUE`.
+#' @param na.rm Should missing values be dropped from the grid? Default is `TRUE`.
 #' @returns A vector or factor of evaluation points.
 #' @seealso [multivariate_grid()]
 #' @export
@@ -33,8 +33,8 @@
 #' univariate_grid(rev(iris$Species))                       # Same
 #' 
 #' x <- iris$Sepal.Width
-#' univariate_grid(x, grid_size = 5)                        # Quantile binning
-#' univariate_grid(x, grid_size = 5, strategy = "uniform")  # Uniform
+#' univariate_grid(x, grid_size = 5)                        # Uniform binning
+#' univariate_grid(x, grid_size = 5, strategy = "quantile")  # Quantile
 univariate_grid <- function(z, grid_size = 49L, trim = c(0.01, 0.99), 
                             strategy = c("uniform", "quantile"), na.rm = TRUE) {
   strategy <- match.arg(strategy)
@@ -50,9 +50,12 @@ univariate_grid <- function(z, grid_size = 49L, trim = c(0.01, 0.99),
     g <- stats::quantile(z, probs = p, names = FALSE, type = 1L, na.rm = TRUE)
     out <- unique(g)
   } else {
-    # strategy = "uniform" (could use range() if trim = 0:1)
-    r <- stats::quantile(z, probs = trim, names = FALSE, type = 1L, na.rm = TRUE)
-    # pretty(r, n = grid_size)  # Until version 0.2.0
+    # strategy = "uniform"
+    if (trim[1L] == 0 && trim[2L] == 1) {
+      r <- range(z, na.rm = TRUE)
+    } else {
+      r <- stats::quantile(z, probs = trim, names = FALSE, type = 1L, na.rm = TRUE)
+    }
     out <- seq(r[1L], r[2L], length.out = grid_size)  
   }
   if (!na.rm && anyNA(z)) {
diff --git a/man/H2.Rd b/man/h2.Rd
similarity index 98%
rename from man/H2.Rd
rename to man/h2.Rd
index 73280f70..cddcbe49 100644
--- a/man/H2.Rd
+++ b/man/h2.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/H2.R
+% Please edit documentation in R/h2.R
 \name{h2}
 \alias{h2}
 \alias{h2.default}
diff --git a/man/H2_overall.Rd b/man/h2_overall.Rd
similarity index 98%
rename from man/H2_overall.Rd
rename to man/h2_overall.Rd
index 700af946..e5b0cf0d 100644
--- a/man/H2_overall.Rd
+++ b/man/h2_overall.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/H2_overall.R
+% Please edit documentation in R/h2_overall.R
 \name{h2_overall}
 \alias{h2_overall}
 \alias{h2_overall.default}
diff --git a/man/H2_pairwise.Rd b/man/h2_pairwise.Rd
similarity index 99%
rename from man/H2_pairwise.Rd
rename to man/h2_pairwise.Rd
index 4f38c004..d2970c6a 100644
--- a/man/H2_pairwise.Rd
+++ b/man/h2_pairwise.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/H2_pairwise.R
+% Please edit documentation in R/h2_pairwise.R
 \name{h2_pairwise}
 \alias{h2_pairwise}
 \alias{h2_pairwise.default}
diff --git a/man/H2_threeway.Rd b/man/h2_threeway.Rd
similarity index 98%
rename from man/H2_threeway.Rd
rename to man/h2_threeway.Rd
index fba0f84f..2a0540fe 100644
--- a/man/H2_threeway.Rd
+++ b/man/h2_threeway.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/H2_threeway.R
+% Please edit documentation in R/h2_threeway.R
 \name{h2_threeway}
 \alias{h2_threeway}
 \alias{h2_threeway.default}
diff --git a/man/ice.Rd b/man/ice.Rd
index df027fc9..b59330b3 100644
--- a/man/ice.Rd
+++ b/man/ice.Rd
@@ -107,7 +107,7 @@ of grid values. Set to \code{0:1} for no trimming.}
 \item{strategy}{How to find grid values of non-discrete numeric columns?
 Either "uniform" or "quantile", see description of \code{\link[=univariate_grid]{univariate_grid()}}.}
 
-\item{na.rm}{Should missing values be dropped from grid? Default is \code{TRUE}.}
+\item{na.rm}{Should missing values be dropped from the grid? Default is \code{TRUE}.}
 
 \item{n_max}{If \code{X} has more than \code{n_max} rows, a random sample of \code{n_max} rows is
 selected from \code{X}. In this case, set a random seed for reproducibility.}
diff --git a/man/multivariate_grid.Rd b/man/multivariate_grid.Rd
index 6db9bc47..42141030 100644
--- a/man/multivariate_grid.Rd
+++ b/man/multivariate_grid.Rd
@@ -25,7 +25,7 @@ of grid values. Set to \code{0:1} for no trimming.}
 \item{strategy}{How to find grid values of non-discrete numeric columns?
 Either "uniform" or "quantile", see description of \code{\link[=univariate_grid]{univariate_grid()}}.}
 
-\item{na.rm}{Should missing values be dropped from grid? Default is \code{TRUE}.}
+\item{na.rm}{Should missing values be dropped from the grid? Default is \code{TRUE}.}
 }
 \value{
 A vector, matrix, or data.frame with evaluation points.
diff --git a/man/partial_dep.Rd b/man/partial_dep.Rd
index be96e6a2..96098d0d 100644
--- a/man/partial_dep.Rd
+++ b/man/partial_dep.Rd
@@ -121,7 +121,7 @@ of grid values. Set to \code{0:1} for no trimming.}
 \item{strategy}{How to find grid values of non-discrete numeric columns?
 Either "uniform" or "quantile", see description of \code{\link[=univariate_grid]{univariate_grid()}}.}
 
-\item{na.rm}{Should missing values be dropped from grid? Default is \code{TRUE}.}
+\item{na.rm}{Should missing values be dropped from the grid? Default is \code{TRUE}.}
 
 \item{n_max}{If \code{X} has more than \code{n_max} rows, a random sample of \code{n_max} rows is
 selected from \code{X}. In this case, set a random seed for reproducibility.}
diff --git a/man/univariate_grid.Rd b/man/univariate_grid.Rd
index 57925c89..ed33e1c7 100644
--- a/man/univariate_grid.Rd
+++ b/man/univariate_grid.Rd
@@ -24,7 +24,7 @@ of grid values. Set to \code{0:1} for no trimming.}
 \item{strategy}{How to find grid values of non-discrete numeric columns?
 Either "uniform" or "quantile", see description of \code{\link[=univariate_grid]{univariate_grid()}}.}
 
-\item{na.rm}{Should missing values be dropped from grid? Default is \code{TRUE}.}
+\item{na.rm}{Should missing values be dropped from the grid? Default is \code{TRUE}.}
 }
 \value{
 A vector or factor of evaluation points.
@@ -43,7 +43,7 @@ Set \code{trim = 0:1} for no trimming.
 If \code{strategy = "quantile"}, the evaluation points are quantiles over a regular grid
 of probabilities from \code{trim[1]} to \code{trim[2]}.
 
-All quantiles are calculated via the inverse of the ECDF, i.e., via
+Quantiles are calculated via the inverse of the ECDF, i.e., via
 \verb{stats::quantile(..., type = 1}).
 }
 \examples{
@@ -51,8 +51,8 @@ univariate_grid(iris$Species)
 univariate_grid(rev(iris$Species))                       # Same
 
 x <- iris$Sepal.Width
-univariate_grid(x, grid_size = 5)                        # Quantile binning
-univariate_grid(x, grid_size = 5, strategy = "uniform")  # Uniform
+univariate_grid(x, grid_size = 5)                        # Uniform binning
+univariate_grid(x, grid_size = 5, strategy = "quantile")  # Quantile
 }
 \seealso{
 \code{\link[=multivariate_grid]{multivariate_grid()}}

From 0a1a7ddbbbc5c706efdbebcd23bdfb547fa19d8a Mon Sep 17 00:00:00 2001
From: Michael Mayer <mayermichael79@gmail.com>
Date: Sun, 29 Oct 2023 22:29:25 +0100
Subject: [PATCH 2/4] draft of hist2()

---
 backlog/calibration.R | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/backlog/calibration.R b/backlog/calibration.R
index 0c3174ed..2abf764f 100644
--- a/backlog/calibration.R
+++ b/backlog/calibration.R
@@ -281,3 +281,32 @@ plot.calibration <- function(x,
   }
   p
 }
+
+hist2 <- function(x, breaks = "Sturges", trim = c(0.01, 0.99), 
+                  include.lowest = TRUE, right = TRUE, na.rm = TRUE) {
+  if (!is.numeric(x)) {
+    g <- sort(unique(x), na.last = if (na.rm) NA else TRUE)
+    return(list(x = x, grid = g))
+  }
+  
+  # Trim outliers before histogram construction?
+  if (trim[1L] == 0 && trim[2L] == 1) {
+    xx <- x
+  } else {
+    r <- stats::quantile(x, probs = trim, names = FALSE, type = 1L, na.rm = TRUE)
+    xx <- x[x >= r[1L] & x <= r[2L]]
+  }
+  h <- hist(xx, breaks = breaks, include.lowest = include.lowest, right = right)
+  g <- h[["mids"]]
+  ix <- findInterval(
+    x, 
+    vec = h[["breaks"]], 
+    left.open = right, 
+    rightmost.closed = include.lowest, 
+    all.inside = TRUE
+  )
+  if (!na.rm && anyNA(x)) {
+    g <- c(g, NA)
+  }
+  list(x = g[ix], grid = g)
+}

From 2bbbc88fd28b4ccc99b26706623ad3fcc1673199 Mon Sep 17 00:00:00 2001
From: Michael Mayer <mayermichael79@gmail.com>
Date: Mon, 30 Oct 2023 13:24:58 +0100
Subject: [PATCH 3/4] update gwColMeans

---
 R/average_loss.R                |  2 +-
 R/utils_calculate.R             | 37 ++++++++++++++---------------
 backlog/calibration.R           | 32 ++++++++++++++++---------
 tests/testthat/test_calculate.R | 42 ++++++++++++++++-----------------
 4 files changed, 61 insertions(+), 52 deletions(-)

diff --git a/R/average_loss.R b/R/average_loss.R
index bff65f9f..ee90ee92 100644
--- a/R/average_loss.R
+++ b/R/average_loss.R
@@ -93,7 +93,7 @@ average_loss.default <- function(object, X, y,
   
   # Real work
   L <- as.matrix(loss(y, pred_fun(object, X, ...)))
-  M <- gwColMeans(L, g = BY, w = w)
+  M <- gwColMeans(L, g = BY, w = w)[["M"]]
   
   if (agg_cols && ncol(M) > 1L) {
     M <- cbind(rowSums(M))
diff --git a/R/utils_calculate.R b/R/utils_calculate.R
index aa581adc..1b836bdc 100644
--- a/R/utils_calculate.R
+++ b/R/utils_calculate.R
@@ -77,8 +77,9 @@ wcolMeans <- function(x, w = NULL) {
 
 #' Grouped wcolMeans()
 #' 
-#' Internal function used to calculate grouped column-wise weighted means.
-#' 
+#' Internal function used to calculate grouped column-wise weighted means along with
+#' corresponding (weighted) counts.
+#'
 #' @noRd
 #' @keywords internal
 #' 
@@ -86,30 +87,28 @@ wcolMeans <- function(x, w = NULL) {
 #' @param g Optional grouping variable.
 #' @param w Optional case weights.
 #' @param reorder Should groups be ordered, see [rowsum()]. Default is `TRUE`.
-#' @param mean_only If `TRUE`, only the means are returned. If `FALSE`, a list
-#'   with "mean", "num" and "denom" is returned.
-#' @returns A matrix with one row per group (if `mean_only = TRUE`), or a list
-#'   with slots "mean", "num", and "denom".
+#' @returns A list with two elements: "M" represents a matrix of grouped (column) 
+#'   means, and "w" is a vector of corresponding group counts/weights.
 #' @examples 
 #' with(iris, gwColMeans(Sepal.Width, g = Species, w = Sepal.Length))
-#' with(iris, gwColMeans(Sepal.Width, g = Species, w = Sepal.Length, mean_only = FALSE))
-gwColMeans <- function(x, g = NULL, w = NULL, reorder = TRUE, mean_only = TRUE) {
+gwColMeans <- function(x, g = NULL, w = NULL, reorder = TRUE) {
   if (is.null(g)) {
-    return(rbind(wcolMeans(x, w = w)))
+    wg <- if (is.null(w)) NROW(x) else sum(w)
+    M <- rbind(wcolMeans(x, w = w))
+    return(list(M = M, w = wg))
   }
+  
+  # Now the interesting case
   if (is.null(w)) {
-    num <- rowsum(x, group = g, reorder = reorder)
-    denom <- rowsum(rep.int(1, NROW(x)), group = g, reorder = reorder)
+    w <- rep.int(1, NROW(x))
   } else {
-    num <- rowsum(x * w, group = g, reorder = reorder)
-    denom <- rowsum(w, group = g, reorder = reorder)
-  }
-  out <- num / matrix(denom, nrow = nrow(num), ncol = ncol(num), byrow = FALSE)
-  
-  if (mean_only) {
-    return(out)
+    x <- x * w  # w is correctly recycled over columns
   }
-  list(mean = out, num = num, denom = denom)
+  num <- rowsum(x, group = g, reorder = reorder)
+  wg <- rowsum(w, group = g, reorder = reorder)
+  p <- ncol(num)
+  denom <- if (p == 1L) wg else matrix(wg, nrow = nrow(num), ncol = p)
+  list(M = num / denom, w = as.numeric(wg))
 }
 
 #' Weighted Mean Centering
diff --git a/backlog/calibration.R b/backlog/calibration.R
index 2abf764f..13b402b7 100644
--- a/backlog/calibration.R
+++ b/backlog/calibration.R
@@ -282,11 +282,24 @@ plot.calibration <- function(x,
   p
 }
 
-hist2 <- function(x, breaks = "Sturges", trim = c(0.01, 0.99), 
+#' Histogram Bin Construction
+#' 
+#' Creates histogram of vector/factor `x`. In the discrete case, no binning is done.
+#' Otherwise, the values are optionally trimmed and then passed to [hist()]. Compared
+#' with [hist()], the function also returns the binned values of `x`.
+#' 
+#' @param x A vector or factor to be binned.
+#' @inheritParams hist
+#' @inheritParams univariate_grid
+#' @returns A list with binned "x", vector of "breaks", bin midpoints "grid", and a
+#'   logical flag "discrete" indicating whether the values have not been binned.
+#' @seealso See [calibration()] for examples.
+hist2 <- function(x, breaks = 17L, trim = c(0.01, 0.99), 
                   include.lowest = TRUE, right = TRUE, na.rm = TRUE) {
-  if (!is.numeric(x)) {
-    g <- sort(unique(x), na.last = if (na.rm) NA else TRUE)
-    return(list(x = x, grid = g))
+  g <- unique(x)
+  if (!is.numeric(x) || (length(breaks) == 1L && is.numeric(breaks) && length(g) <= breaks)) {
+    g <- sort(g, na.last = if (na.rm) NA else TRUE)
+    return(list(x = x, breaks = g, grid = g, discrete = TRUE))
   }
   
   # Trim outliers before histogram construction?
@@ -297,16 +310,13 @@ hist2 <- function(x, breaks = "Sturges", trim = c(0.01, 0.99),
     xx <- x[x >= r[1L] & x <= r[2L]]
   }
   h <- hist(xx, breaks = breaks, include.lowest = include.lowest, right = right)
-  g <- h[["mids"]]
+  b <- h$breaks
   ix <- findInterval(
-    x, 
-    vec = h[["breaks"]], 
-    left.open = right, 
-    rightmost.closed = include.lowest, 
-    all.inside = TRUE
+    x, vec = b, left.open = right, rightmost.closed = include.lowest, all.inside = TRUE
   )
+  g <- h$mids
   if (!na.rm && anyNA(x)) {
     g <- c(g, NA)
   }
-  list(x = g[ix], grid = g)
+  list(x = g[ix], breaks = b, grid = g, discrete = FALSE)
 }
diff --git a/tests/testthat/test_calculate.R b/tests/testthat/test_calculate.R
index 77c67f1c..7d3893fa 100644
--- a/tests/testthat/test_calculate.R
+++ b/tests/testthat/test_calculate.R
@@ -71,31 +71,31 @@ test_that("gwcolMeans() works", {
   w2 <- 1:6
   
   # Ungrouped
-  expect_equal(gwColMeans(x), rbind(wcolMeans(x)))
-  expect_equal(gwColMeans(x, w = w1), rbind(wcolMeans(x, w = w1)))
-  expect_equal(gwColMeans(x, w = w2), rbind(wcolMeans(x, w = w2)))
+  r <- gwColMeans(x)
+  expect_equal(r$M, rbind(wcolMeans(x)))
+  expect_equal(r$w, nrow(x))
   
-  # Grouped
-  expect_equal(gwColMeans(x, g = g)[2L, ], wcolMeans(x[g == 2, ]))
-  expect_equal(gwColMeans(x, g = g, reorder = FALSE)[2L, ], wcolMeans(x[g == 1, ]))
+  r <- gwColMeans(x, w = w1)
+  expect_equal(r$M, rbind(wcolMeans(x, w = w1)))
+  expect_equal(r$w, sum(w1))
   
-  g1 <- gwColMeans(x, g = g)
-  g2 <- gwColMeans(x, g = g, mean_only = FALSE)
-  g2_d <- matrix(g2$denom, nrow = 2, ncol = 2, byrow = FALSE)
-  expect_equal(g1, g2$num / g2_d)
-  expect_equal(g2$mean, g1)
+  r <- gwColMeans(x, w = w2)
+  expect_equal(r$M, rbind(wcolMeans(x, w = w2)))
+  expect_equal(r$w, sum(w2))
   
-  # Grouped and weighted
-  expect_equal(
-    gwColMeans(x, g = g, w = w2)[2L, ], 
-    wcolMeans(x[g == 2, ], w = w2[g == 2])
-  )
+  # Grouped
+  r <- gwColMeans(x, g = g)
+  expect_equal(r$M[2L, ], wcolMeans(x[g == 2, ]))
+  expect_equal(r$w, c(4, 2))
   
-  g1 <- gwColMeans(x, g = g, w = w2)
-  g2 <- gwColMeans(x, g = g, w = w2, mean_only = FALSE)
-  g2_d <- matrix(g2$denom, nrow = 2, ncol = 2, byrow = FALSE)
-  expect_equal(g1, g2$num / g2_d)
-  expect_equal(g2$mean, g1)
+  r <- gwColMeans(x, g = g, reorder = FALSE)
+  expect_equal(r$M[2L, ], wcolMeans(x[g == 1, ]))
+  expect_equal(r$w, c(2, 4))
+
+  # Grouped and weighted
+  r <- gwColMeans(x, g = g, w = w2)
+  expect_equal(r$M[2L, ], wcolMeans(x[g == 2, ], w = w2[g == 2]))
+  expect_equal(r$w, c(sum(3:6), sum(1:2)))
 })
 
 test_that("wcenter() works for matrices with > 1 columns", {

From 590a001a6bb22bf9c4988011b3f5bd850df225fa Mon Sep 17 00:00:00 2001
From: Michael Mayer <mayermichael79@gmail.com>
Date: Mon, 30 Oct 2023 17:08:52 +0100
Subject: [PATCH 4/4] simplified gwColMeans

---
 R/utils_calculate.R | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/R/utils_calculate.R b/R/utils_calculate.R
index 1b836bdc..5be55bd0 100644
--- a/R/utils_calculate.R
+++ b/R/utils_calculate.R
@@ -93,9 +93,9 @@ wcolMeans <- function(x, w = NULL) {
 #' with(iris, gwColMeans(Sepal.Width, g = Species, w = Sepal.Length))
 gwColMeans <- function(x, g = NULL, w = NULL, reorder = TRUE) {
   if (is.null(g)) {
-    wg <- if (is.null(w)) NROW(x) else sum(w)
     M <- rbind(wcolMeans(x, w = w))
-    return(list(M = M, w = wg))
+    denom <- if (is.null(w)) NROW(x) else sum(w)
+    return(list(M = M, w = denom))
   }
   
   # Now the interesting case
@@ -105,10 +105,8 @@ gwColMeans <- function(x, g = NULL, w = NULL, reorder = TRUE) {
     x <- x * w  # w is correctly recycled over columns
   }
   num <- rowsum(x, group = g, reorder = reorder)
-  wg <- rowsum(w, group = g, reorder = reorder)
-  p <- ncol(num)
-  denom <- if (p == 1L) wg else matrix(wg, nrow = nrow(num), ncol = p)
-  list(M = num / denom, w = as.numeric(wg))
+  denom <- as.numeric(rowsum(w, group = g, reorder = reorder))
+  list(M = num / denom, w = denom)
 }
 
 #' Weighted Mean Centering