Merge branch 'master' into weights_ii
mb706 committed Dec 19, 2024
2 parents 89202f5 + 2c2c16d commit 7bb73c1
Showing 121 changed files with 1,536 additions and 632 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pkgdown.yml
@@ -44,7 +44,7 @@ jobs:

- name: Deploy
if: github.event_name != 'pull_request'
uses: JamesIves/github-pages-deploy-action@v4.6.3
uses: JamesIves/github-pages-deploy-action@v4.7.2
with:
clean: false
branch: gh-pages
4 changes: 3 additions & 1 deletion .gitignore
@@ -24,7 +24,8 @@
.LSOverride

# Icon must end with two \r
Icon
Icon


# Thumbnails
._*
@@ -180,3 +181,4 @@ revdep/

# misc
Meta/
Rplots.pdf
11 changes: 5 additions & 6 deletions DESCRIPTION
@@ -1,6 +1,6 @@
Package: mlr3
Title: Machine Learning in R - Next Generation
Version: 0.20.2.9000
Version: 0.22.1.9000
Authors@R:
c(
person("Michel", "Lang", , "[email protected]", role = "aut",
@@ -52,7 +52,7 @@ Imports:
future.apply (>= 1.5.0),
lgr (>= 0.3.4),
mlbench,
mlr3measures (>= 0.6.0),
mlr3measures (>= 1.0.0),
mlr3misc (>= 0.15.0),
parallelly,
palmerpenguins,
@@ -69,9 +69,7 @@ Suggests:
remotes,
RhpcBLASctl,
rpart,
testthat (>= 3.1.0)
Remotes:
mlr-org/mlr3measures
testthat (>= 3.2.0)
Encoding: UTF-8
Config/testthat/edition: 3
Config/testthat/parallel: false
@@ -158,7 +156,7 @@ Collate:
'TaskGeneratorSpirals.R'
'TaskGeneratorXor.R'
'TaskRegr.R'
'TaskRegr_boston_housing.R'
'TaskRegr_california_housing.R'
'TaskRegr_mtcars.R'
'TaskUnsupervised.R'
'as_benchmark_result.R'
@@ -181,6 +179,7 @@ Collate:
'benchmark.R'
'benchmark_grid.R'
'bibentries.R'
'default_fallback.R'
'default_measures.R'
'fix_factor_levels.R'
'helper.R'
9 changes: 9 additions & 0 deletions NAMESPACE
@@ -74,6 +74,9 @@ S3method(col_info,DataBackend)
S3method(col_info,data.table)
S3method(create_empty_prediction_data,TaskClassif)
S3method(create_empty_prediction_data,TaskRegr)
S3method(default_fallback,Learner)
S3method(default_fallback,LearnerClassif)
S3method(default_fallback,LearnerRegr)
S3method(default_values,Learner)
S3method(default_values,LearnerClassifRpart)
S3method(default_values,LearnerRegrRpart)
@@ -108,6 +111,11 @@ S3method(set_threads,list)
S3method(set_validate,Learner)
S3method(summary,Task)
S3method(tail,Task)
S3method(task_check_col_roles,Task)
S3method(task_check_col_roles,TaskClassif)
S3method(task_check_col_roles,TaskRegr)
S3method(task_check_col_roles,TaskSupervised)
S3method(task_check_col_roles,TaskUnsupervised)
S3method(unmarshal_model,classif.debug_model_marshaled)
S3method(unmarshal_model,default)
S3method(unmarshal_model,learner_state_marshaled)
@@ -241,6 +249,7 @@ export(rsmp)
export(rsmps)
export(set_threads)
export(set_validate)
export(task_check_col_roles)
export(tgen)
export(tgens)
export(tsk)
51 changes: 39 additions & 12 deletions NEWS.md
@@ -5,25 +5,52 @@
The weights used during training by the Learner are now taken from the renamed column role `weights_learner`; the previous column role `weight` no longer has any effect.
Additionally, it is now possible to disable the use of weights via the new hyperparameter `use_weights`.
Note that this is a breaking change, but it appears to be the less error-prone solution in the long run (a usage sketch follows this changelog diff).
* refactor: Deprecated `data_format` and `data_formats` for Learners, Tasks, and DataBackends.
* feat: The `partition()` function creates training, test and validation sets.
* refactor: Optimize runtime of fixing factor levels.
* refactor: Optimize runtime of setting row roles.
* refactor: Optimize runtime of marshalling.
* refactor: Optimize runtime of `Task$col_info`.
* fix: Column info is now checked for compatibility during `Learner$predict` (#943).

# mlr3 0.22.1

* fix: Extend `assert_measure()` with checks for trained models in `assert_scorable()`.

# mlr3 0.22.0

* fix: Quantiles must not ascend with probabilities.
* refactor: Replace `tsk("boston_housing")` with `tsk("california_housing")`.
* feat: Require unique learner ids in `benchmark_grid()`.
* BREAKING CHANGE: Remove ``$loglik()`` method from all learners.
* fix: Ignore `future.globals.maxSize` when `future::plan("sequential")` is used.
* feat: Add `$characteristics` field to `Task` to store additional information.

# mlr3 0.21.1

* feat: Throw warning when prediction and measure type do not match.
* fix: The `mlr_reflections` were broken when an extension package was not loaded on the workers.
Extension packages must now register themselves in the `mlr_reflections$loaded_packages` field.

# mlr3 0.21.0

* BREAKING CHANGE: Deprecated `data_format` and `data_formats` for `Learner`, `Task`, and `DataBackend` classes.
* feat: The `partition()` function creates training, test and validation sets now.
* perf: Optimize the runtime of fixing factor levels.
* perf: Optimize the runtime of setting row roles.
* perf: Optimize the runtime of marshalling.
* perf: Optimize the runtime of `Task$col_info`.
* fix: column info is now checked for compatibility during `Learner$predict` (#943).
* BREAKING CHANGE: The predict time of the learner now stores the cumulative duration for all predict sets (#992).
* feat: `$internal_valid_task` can now be set to an `integer` vector.
* feat: Measures can now have an empty `$predict_sets` (#1094).
This is relevant for measures that only extract information from the model of a learner (such as internal validation scores or AIC / BIC).
* refactor: Deprecated the `$divide()` method
* fix: `Task$cbind()` now works with non-standard primary keys for `data.frames` (#961).
* BREAKING CHANGE: Deprecated the `$divide()` method
* fix: `Task$cbind()` now works with non-standard primary keys for `data.frames` (#961).
* fix: Triggering of fallback learner now has log-level `"info"` instead of `"debug"` (#972).
* feat: Added new measure `pinballs `.
* feat: Added new measure `mu_auc`.
* feat: Added new measure `regr.pinball` here and in mlr3measures.
* feat: Added new measure `mu_auc` here and in mlr3measures.
* feat: Add option to calculate the mean of the true values on the train set in `msr("regr.rsq")`.
* feat: Default fallback learner is set when encapsulation is activated.
* feat: Learners classif.debug and regr.debug have new methods `$importance()` and `$selected_features()` for testing, also in downstream packages
* feat: Learners `classif.debug` and `regr.debug` have new methods `$importance()` and `$selected_features()` for testing, also in downstream packages.
* feat: Create default fallback learner with `default_fallback()`.
* feat: Check column roles when using `$set_col_roles()` and `$col_roles`.
* fix: Add predict set to learner hash.
* BREAKING CHANGE: Encapsulation and the fallback learner are now set with the `$encapsulate(method, fallback)` method.
The `$fallback` field is read-only now and the encapsulate status can be retrieved from the `$encapsulation` field.

# mlr3 0.20.2

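For the `weights_learner` change noted at the top of this changelog, here is a minimal sketch of how the new role might be used. The weight column `w`, the toy data, and setting `use_weights` through the learner's `$param_set` are assumptions for illustration; the exact API may differ in the merged branch.

```r
library(mlr3)

# toy data with a case-weight column "w" (illustrative only)
df = iris
df$w = runif(nrow(df))

task = as_task_classif(df, target = "Species")
# assign the new column role described in the changelog entry above
task$set_col_roles("w", roles = "weights_learner")

learner = lrn("classif.rpart")
# assumption: the new `use_weights` hyperparameter can be set to disable weights
learner$param_set$values$use_weights = FALSE

learner$train(task)
```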
14 changes: 11 additions & 3 deletions R/BenchmarkResult.R
@@ -19,7 +19,7 @@
#' @template param_measures
#'
#' @section S3 Methods:
#' * `as.data.table(rr, ..., reassemble_learners = TRUE, convert_predictions = TRUE, predict_sets = "test")`\cr
#' * `as.data.table(rr, ..., reassemble_learners = TRUE, convert_predictions = TRUE, predict_sets = "test", task_characteristics = FALSE)`\cr
#' [BenchmarkResult] -> [data.table::data.table()]\cr
#' Returns a tabular view of the internal data.
#' * `c(...)`\cr
@@ -545,9 +545,17 @@ BenchmarkResult = R6Class("BenchmarkResult",
)

#' @export
as.data.table.BenchmarkResult = function(x, ..., hashes = FALSE, predict_sets = "test") { # nolint
as.data.table.BenchmarkResult = function(x, ..., hashes = FALSE, predict_sets = "test", task_characteristics = FALSE) { # nolint
assert_flag(task_characteristics)
tab = get_private(x)$.data$as_data_table(view = NULL, predict_sets = predict_sets)
tab[, c("uhash", "task", "learner", "resampling", "iteration", "prediction"), with = FALSE]
tab = tab[, c("uhash", "task", "learner", "resampling", "iteration", "prediction"), with = FALSE]

if (task_characteristics) {
set(tab, j = "characteristics", value = map(tab$task, "characteristics"))
tab = unnest(tab, "characteristics")
}

tab[]
}

#' @export
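A short sketch of the new `task_characteristics` argument added above. It assumes `$characteristics` accepts a named list (per the 0.22.0 changelog entry); the task, learners, and resampling are illustrative.

```r
library(mlr3)

task = tsk("penguins")
# assumption: $characteristics is a settable field storing additional task information
task$characteristics = list(domain = "biology")

design = benchmark_grid(
  tasks = task,
  learners = lrns(c("classif.rpart", "classif.featureless")),
  resamplings = rsmp("cv", folds = 3)
)
bmr = benchmark(design)

# with task_characteristics = TRUE, the stored characteristics are unnested into extra columns
tab = as.data.table(bmr, task_characteristics = TRUE)
```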
3 changes: 2 additions & 1 deletion R/DataBackend.R
@@ -90,7 +90,8 @@ DataBackend = R6Class("DataBackend", cloneable = FALSE,
#' This is deprecated and will be removed in the future.
data_formats = deprecated_binding("DataBackend$data_formats", "data.table"),

#' @template field_hash
#' @field hash (`character(1)`)\cr
#' Hash (unique identifier) for this object.
hash = function(rhs) {
if (missing(rhs)) {
if (is.na(private$.hash)) {
129 changes: 74 additions & 55 deletions R/Learner.R
@@ -59,9 +59,6 @@
#' * `oob_error(...)`: Returns the out-of-bag error of the model as `numeric(1)`.
#' The learner must be tagged with property `"oob_error"`.
#'
#' * `loglik(...)`: Extracts the log-likelihood (c.f. [stats::logLik()]).
#' This can be used in measures like [mlr_measures_aic] or [mlr_measures_bic].
#'
#' * `internal_valid_scores`: Returns the internal validation score(s) of the model as a named `list()`.
#' Only available for [`Learner`]s with the `"validation"` property.
#' If the learner is not trained yet, this returns `NULL`.
@@ -463,6 +460,65 @@ Learner = R6Class("Learner",
} else {
self
}
},

#' @description
#' Sets the encapsulation method and fallback learner for the train and predict steps.
#' There are currently four different methods implemented:
#'
#' * `"none"`: Just runs the learner in the current session and measures the elapsed time.
#' Does not keep a log, output is printed directly to the console.
#' Works well together with [traceback()].
#' * `"try"`: Similar to `"none"`, but catches error.
#' Output is printed to the console and not logged.
#' * `"evaluate"`: Uses the package \CRANpkg{evaluate} to call the learner, measure time and do the logging.
#' * `"callr"`: Uses the package \CRANpkg{callr} to call the learner, measure time and do the logging.
#' This encapsulation spawns a separate R session in which the learner is called.
#' While this comes with a considerable overhead, it also guards your session from being torn down by segfaults.
#'
#' The fallback learner is fitted to create valid predictions in case either the model fitting or the prediction of the original learner fails.
#' If the training step or the predict step of the original learner fails entirely, the fallback is used to create the predictions for all predict sets.
#' If the original learner fails only partially during the predict step (usually by failing to predict some observations or by producing `NA` predictions), the missing predictions are imputed by the fallback.
#' Note that the fallback is always trained, as we do not know in advance whether prediction will fail.
#' If the training step fails, the `$model` field of the original learner is `NULL`.
#'
#' Also see the section on error handling in the mlr3book:
#' \url{https://mlr3book.mlr-org.com/chapters/chapter10/advanced_technical_aspects_of_mlr3.html#sec-error-handling}
#'
#' @param method `character(1)`\cr
#' One of `"none"`, `"try"`, `"evaluate"` or `"callr"`.
#' See the description for details.
#' @param fallback [Learner]\cr
#' The fallback learner for failed predictions.
#'
#' @return `self` (invisibly).
encapsulate = function(method, fallback = NULL) {
assert_choice(method, c("none", "try", "evaluate", "callr"))

if (method != "none") {
assert_learner(fallback, task_type = self$task_type)

if (!identical(self$predict_type, fallback$predict_type)) {
warningf("The fallback learner '%s' and the base learner '%s' have different predict types: '%s' != '%s'.",
fallback$id, self$id, fallback$predict_type, self$predict_type)
}

# check properties
properties = intersect(self$properties, c("twoclass", "multiclass", "missings", "importance", "selected_features"))
missing_properties = setdiff(properties, fallback$properties)

if (length(missing_properties)) {
warningf("The fallback learner '%s' does not have the following properties of the learner '%s': %s.",
fallback$id, self$id, str_collapse(missing_properties))
}
} else if (method == "none" && !is.null(fallback)) {
stop("Fallback learner must be `NULL` if encapsulation is set to `none`.")
}

private$.encapsulation = c(train = method, predict = method)
private$.fallback = fallback

return(invisible(self))
}
),

Expand Down Expand Up @@ -540,16 +596,17 @@ Learner = R6Class("Learner",
},


#' @template field_hash
#' @field hash (`character(1)`)\cr
#' Hash (unique identifier) for this object.
#' The hash is calculated based on the learner id, the parameter settings, the predict type, the fallback hash, the parallel predict setting, the validate setting, and the predict sets.
hash = function(rhs) {
assert_ro_binding(rhs)
calculate_hash(class(self), self$id, self$param_set$values, private$.predict_type,
self$fallback$hash, self$parallel_predict, get0("validate", self))
self$fallback$hash, self$parallel_predict, get0("validate", self), self$predict_sets)
},

#' @field phash (`character(1)`)\cr
#' Hash (unique identifier) for this partial object, excluding some components
#' which are varied systematically during tuning (parameter values).
#' Hash (unique identifier) for this partial object, excluding some components which are varied systematically during tuning (parameter values).
phash = function(rhs) {
assert_ro_binding(rhs)
calculate_hash(class(self), self$id, private$.predict_type,
@@ -580,58 +637,20 @@ Learner = R6Class("Learner",
private$.param_set
},

#' @field encapsulate (named `character()`)\cr
#' Controls how to execute the code in internal train and predict methods.
#' Must be a named character vector with names `"train"` and `"predict"`.
#' Possible values are `"none"`, `"try"`, `"evaluate"` (requires package \CRANpkg{evaluate}) and `"callr"` (requires package \CRANpkg{callr}).
#' When encapsulation is activated, a fallback learner must be set,
# to ensure that some form of valid model / predictions are created,
# after an error of the original learner is caught via encapsulation.
#' If no learner is set in `$fallback`, the default fallback learner is used (see `mlr_reflections$task_types`).
#' See [mlr3misc::encapsulate()] for more details.
encapsulate = function(rhs) {
default = c(train = "none", predict = "none")

if (missing(rhs)) {
return(insert_named(default, private$.encapsulate))
}

assert_character(rhs)
assert_names(names(rhs), subset.of = c("train", "predict"))
private$.encapsulate = insert_named(default, rhs)

if (is.null(private$.fallback)) {
# if there is no fallback, we get a default one from the reflections table
fallback_id = mlr_reflections$learner_fallback[[self$task_type]]
if (!is.null(fallback_id)) {
self$fallback = lrn(mlr_reflections$learner_fallback[[self$task_type]], predict_type = self$predict_type)
}
}
},

#' @field fallback ([Learner])\cr
#' Learner which is fitted to impute predictions in case that either the model fitting or the prediction of the top learner is not successful.
#' Requires encapsulation, otherwise errors are not caught and the execution is terminated before the fallback learner kicks in.
#' If you have not set encapsulation manually before, setting the fallback learner automatically
#' activates encapsulation using the \CRANpkg{evaluate} package.
#' Also see the section on error handling the mlr3book:
#' \url{https://mlr3book.mlr-org.com/chapters/chapter10/advanced_technical_aspects_of_mlr3.html#sec-error-handling}
#' Returns the fallback learner set with `$encapsulate()`.
fallback = function(rhs) {
if (missing(rhs)) {
return(private$.fallback)
}
assert_ro_binding(rhs)
return(private$.fallback)
},

if (!is.null(rhs)) {
assert_learner(rhs, task_type = self$task_type)
if (!identical(self$predict_type, rhs$predict_type)) {
warningf("The fallback learner '%s' and the base learner '%s' have different predict types: '%s' != '%s'.",
rhs$id, self$id, rhs$predict_type, self$predict_type)
}
if (is.null(private$.encapsulate)) {
private$.encapsulate = c(train = "evaluate", predict = "evaluate")
}
}
private$.fallback = rhs
#' @field encapsulation (`character(2)`)\cr
#' Returns the encapsulation settings set with `$encapsulate()`.
encapsulation = function(rhs) {
assert_ro_binding(rhs)
return(private$.encapsulation)
},

#' @field hotstart_stack ([HotstartStack])\cr.
@@ -647,7 +666,7 @@

private = list(
.use_weights = NULL,
.encapsulate = NULL,
.encapsulation = c(train = "none", predict = "none"),
.fallback = NULL,
.predict_type = NULL,
.param_set = NULL,
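A usage sketch for the `$encapsulate(method, fallback)` method documented above, which replaces the old `$encapsulate` and `$fallback` active bindings. The choice of `classif.rpart` with a `classif.featureless` fallback is illustrative; the changelog also mentions `default_fallback()` as a way to construct a matching fallback.

```r
library(mlr3)

learner = lrn("classif.rpart")

# encapsulate train/predict via the evaluate package and fall back to a featureless learner
learner$encapsulate("evaluate", fallback = lrn("classif.featureless"))

learner$encapsulation  # c(train = "evaluate", predict = "evaluate")
learner$fallback       # read-only: the fallback learner set above

rr = resample(tsk("penguins"), learner, rsmp("holdout"))
rr$errors              # data.table of errors caught during training/prediction, if any
```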
3 changes: 2 additions & 1 deletion R/LearnerRegrDebug.R
@@ -108,7 +108,8 @@ LearnerRegrDebug = R6Class("LearnerRegrDebug", inherit = LearnerRegr,
return(prediction)
}

prediction = setdiff(named_list(mlr_reflections$learner_predict_types[["regr"]][[self$predict_type]]), "quantiles")
predict_types = setdiff(self$predict_type, "quantiles")
prediction = named_list(mlr_reflections$learner_predict_types[["regr"]][[predict_types]])
missing_type = pv$predict_missing_type %??% "na"

for (pt in names(prediction)) {