diff --git a/NEWS.md b/NEWS.md index 8d99661225..ce17cd42b0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # dplyr (development version) +* `dplyr_reconstruct()`'s default method has been rewritten to avoid + materializing duckplyr queries too early (#6947). + * Updated the `storms` data to include 2022 data (#6937, @steveharoz). * Updated the `starwars` data to use a new API, because the old one is defunct. diff --git a/R/generics.R b/R/generics.R index e31073c72a..a9155f1a44 100644 --- a/R/generics.R +++ b/R/generics.R @@ -192,6 +192,7 @@ dplyr_col_modify.rowwise_df <- function(data, cols) { dplyr_reconstruct <- function(data, template) { # Strip attributes before dispatch to make it easier to implement # methods and prevent unexpected leaking of irrelevant attributes. + # This also enforces that `data` is a well-formed data frame. data <- dplyr_new_data_frame(data) return(dplyr_reconstruct_dispatch(data, template)) UseMethod("dplyr_reconstruct", template) @@ -202,12 +203,7 @@ dplyr_reconstruct_dispatch <- function(data, template) { #' @export dplyr_reconstruct.data.frame <- function(data, template) { - attrs <- attributes(template) - attrs$names <- names(data) - attrs$row.names <- .row_names_info(data, type = 0L) - - attributes(data) <- attrs - data + .Call(ffi_dplyr_reconstruct, data, template) } #' @export diff --git a/R/zzz.R b/R/zzz.R index 9f2c20854f..ad24248048 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -40,13 +40,16 @@ setHook(packageEvent("plyr", "attach"), NULL, "replace") } -import_vctrs <- function(name) { - import_from(name, "vctrs") +import_vctrs <- function(name, optional = FALSE) { + import_from(name, "vctrs", optional = optional) } -import_from <- function(name, package) { +import_from <- function(name, package, optional = FALSE) { ns <- getNamespace(package) if (!exists(name, mode = "function", envir = ns, inherits = FALSE)) { + if (optional) { + return(NULL) + } abort(sprintf("No such '%s' function: `%s()`.", package, name)) } diff --git a/src/dplyr.h b/src/dplyr.h index 8c0035c29c..fc0359095d 100644 --- a/src/dplyr.h +++ b/src/dplyr.h @@ -96,6 +96,10 @@ inline bool obj_is_list(SEXP x) { } +SEXP ffi_dplyr_reconstruct(SEXP data, SEXP template_); +SEXP ffi_test_dplyr_attributes(SEXP x); +SEXP ffi_test_dplyr_set_attributes(SEXP x, SEXP attributes); + SEXP dplyr_expand_groups(SEXP old_groups, SEXP positions, SEXP s_nr); SEXP dplyr_cumall(SEXP x); SEXP dplyr_cumany(SEXP x); diff --git a/src/init.cpp b/src/init.cpp index 09e699f5f5..11f797fdce 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -95,6 +95,10 @@ SEXP dplyr_init_library(SEXP ns_dplyr, SEXP ns_vctrs, SEXP ns_rlang) { static const R_CallMethodDef CallEntries[] = { {"dplyr_init_library", (DL_FUNC)& dplyr_init_library, 3}, + {"ffi_dplyr_reconstruct", (DL_FUNC)& ffi_dplyr_reconstruct, 2}, + {"ffi_test_dplyr_attributes", (DL_FUNC)& ffi_test_dplyr_attributes, 1}, + {"ffi_test_dplyr_set_attributes", (DL_FUNC)& ffi_test_dplyr_set_attributes, 2}, + {"dplyr_expand_groups", (DL_FUNC)& dplyr_expand_groups, 3}, {"dplyr_cumall", (DL_FUNC)& dplyr_cumall, 1}, {"dplyr_cumany", (DL_FUNC)& dplyr_cumany, 1}, diff --git a/src/reconstruct.cpp b/src/reconstruct.cpp new file mode 100644 index 0000000000..f9862c6395 --- /dev/null +++ b/src/reconstruct.cpp @@ -0,0 +1,138 @@ +#include "dplyr.h" + +// Essentially, a C implementation of: +// +// ``` +// attributes <- attributes(template) +// attributes$names <- names(data) +// attributes$row.names <- .row_names_info(data, type = 0L) +// attributes(data) <- attributes +// ``` +// +// The problem with that is that: +// - `attributes()` ends up calling `Rf_getAttrib()`, which tries to check +// for internal `row.names` in `template` so they aren't leaked to the user. +// Unfortunately this materializes lazy ALTREP `row.names`, like those used +// by duckplyr. +// - `attributes<-()` ends up calling `Rf_setAttrib()`, which tries to check +// if it can make efficient internal `row.names`. Again, this materializes +// lazy ALTREP `row.names`, like those used by duckplyr. +// +// So we bypass that here by carefully manipulating the attribute pairlists. +// +// We expect that at this point, both `data` and `template_` are S3 data +// frames, both of which have `names` and `row.names` attributes. If this isn't +// true, we error. +// - For `data`, we enforce this in `dplyr_reconstruct()`'s generic by calling +// `dplyr_new_data_frame()` (ideally no intermediate method invalidates this). +// - For `template_`, we assume this since we got here through the S3 method +// `dplyr_reconstruct.data.frame()`, which dispatched off `template_`. A +// well-formed S3 data frame must have `names` and `row.names` attributes. +// +// https://github.com/tidyverse/dplyr/pull/6947 +// https://github.com/tidyverse/dplyr/issues/6525#issuecomment-1303619152 +// https://github.com/wch/r-source/blob/69b94f0c8ce9b2497f6d7a81922575f6c585b713/src/main/attrib.c#L176-L177 +// https://github.com/wch/r-source/blob/69b94f0c8ce9b2497f6d7a81922575f6c585b713/src/main/attrib.c#L57 +SEXP ffi_dplyr_reconstruct(SEXP data, SEXP template_) { + if (TYPEOF(data) != VECSXP) { + Rf_errorcall(R_NilValue, "Internal error: `data` must be a list."); + } + if (TYPEOF(template_) != VECSXP) { + Rf_errorcall(R_NilValue, "Internal error: `template` must be a list."); + } + if (!OBJECT(data)) { + Rf_errorcall(R_NilValue, "Internal error: `data` must be an object."); + } + if (!OBJECT(template_)) { + Rf_errorcall(R_NilValue, "Internal error: `template` must be an object."); + } + + bool seen_names = false; + bool seen_row_names = false; + + // Pull the `names` and `row.names` off `data`. + // These are the only 2 attributes from `data` that persist. + SEXP names = R_NilValue; + SEXP row_names = R_NilValue; + + for (SEXP node = ATTRIB(data); node != R_NilValue; node = CDR(node)) { + SEXP tag = TAG(node); + + if (tag == R_NamesSymbol) { + names = CAR(node); + MARK_NOT_MUTABLE(names); + seen_names = true; + } + if (tag == R_RowNamesSymbol) { + row_names = CAR(node); + MARK_NOT_MUTABLE(row_names); + seen_row_names = true; + } + } + + if (!seen_names) { + Rf_errorcall(R_NilValue, "Internal error: `data` must have a `names` attribute."); + } + if (!seen_row_names) { + Rf_errorcall(R_NilValue, "Internal error: `data` must have a `row.names` attribute."); + } + + seen_names = false; + seen_row_names = false; + + // Now replace the `names` and `row.names` attributes in the `template_` + // attributes with the ones from `data`. This attribute set becomes the final + // one we set on `data`. + SEXP attributes = ATTRIB(template_); + attributes = PROTECT(Rf_shallow_duplicate(attributes)); + + for (SEXP node = attributes; node != R_NilValue; node = CDR(node)) { + SEXP tag = TAG(node); + + if (tag == R_NamesSymbol) { + SETCAR(node, names); + seen_names = true; + } + if (tag == R_RowNamesSymbol) { + SETCAR(node, row_names); + seen_row_names = true; + } + } + + if (!seen_names) { + Rf_errorcall(R_NilValue, "Internal error: `template` must have a `names` attribute."); + } + if (!seen_row_names) { + Rf_errorcall(R_NilValue, "Internal error: `template` must have a `row.names` attribute."); + } + + // Make an ALTREP wrapper if possible, since the underlying data doesn't change. + // Won't actually make an ALTREP wrapper unless there are >64 columns + // (internally controlled by R). +#if R_VERSION >= R_Version(3, 6, 0) + data = PROTECT(R_shallow_duplicate_attr(data)); +#else + data = PROTECT(Rf_shallow_duplicate(data)); +#endif + + SET_ATTRIB(data, attributes); + + UNPROTECT(2); + return data; +} + +// Very unsafe wrappers needed for testing. +// Bypass `Rf_getAttrib()` and `Rf_setAttrib()` calls to avoid forcing ALTREP +// `row.names`. +SEXP ffi_test_dplyr_attributes(SEXP x) { + return ATTRIB(x); +} +SEXP ffi_test_dplyr_set_attributes(SEXP x, SEXP attributes) { + if (TYPEOF(attributes) != LISTSXP) { + Rf_errorcall(R_NilValue, "`attributes` must be a pairlist."); + } + x = PROTECT(Rf_shallow_duplicate(x)); + SET_ATTRIB(x, attributes); + UNPROTECT(1); + return x; +} diff --git a/tests/testthat/helper-lazy.R b/tests/testthat/helper-lazy.R new file mode 100644 index 0000000000..ea086992d6 --- /dev/null +++ b/tests/testthat/helper-lazy.R @@ -0,0 +1,22 @@ +skip_if_no_lazy_character <- function() { + skip_if(getRversion() <= "3.5.0") + + new_lazy_character <- import_vctrs("new_lazy_character", optional = TRUE) + lazy_character_is_materialized <- import_vctrs("lazy_character_is_materialized", optional = TRUE) + + if (is.null(new_lazy_character) || is.null(lazy_character_is_materialized)) { + skip("Lazy character helpers from vctrs are not available.") + } + + invisible() +} + +new_lazy_character <- function(fn) { + f <- import_vctrs("new_lazy_character") + f(fn) +} + +lazy_character_is_materialized <- function(x) { + f <- import_vctrs("lazy_character_is_materialized") + f(x) +} diff --git a/tests/testthat/test-generics.R b/tests/testthat/test-generics.R index a3c43a5b8a..2453420d44 100644 --- a/tests/testthat/test-generics.R +++ b/tests/testthat/test-generics.R @@ -137,3 +137,54 @@ test_that("dplyr_reconstruct() strips attributes before dispatch", { dplyr_reconstruct(df, df) expect_identical(out, data.frame(x = 1, row.names = "a")) }) + +test_that("`dplyr_reconstruct()` retains attribute ordering of `template`", { + df <- vctrs::data_frame(x = 1) + expect_identical( + attributes(dplyr_reconstruct(df, df)), + attributes(df) + ) +}) + +test_that("`dplyr_reconstruct()` doesn't modify the original `data` in place", { + data <- new_data_frame(list(x = 1), foo = "bar") + template <- vctrs::data_frame(x = 1) + + out <- dplyr_reconstruct(data, template) + + expect_null(attr(out, "foo")) + expect_identical(attr(data, "foo"), "bar") +}) + +test_that("`dplyr_reconstruct()`, which gets and sets attributes, doesn't touch `row.names` (#6525)", { + skip_if_no_lazy_character() + + dplyr_attributes <- function(x) { + .Call(ffi_test_dplyr_attributes, x) + } + dplyr_set_attributes <- function(x, attributes) { + .Call(ffi_test_dplyr_set_attributes, x, attributes) + } + + df <- vctrs::data_frame(x = 1) + + attributes <- attributes(df) + attributes$row.names <- new_lazy_character(function() "a") + attributes <- as.pairlist(attributes) + + df_with_lazy_row_names <- dplyr_set_attributes(df, attributes) + + # Ensure `data` row names aren't materialized + x <- dplyr_reconstruct(df_with_lazy_row_names, df) + attributes <- dplyr_attributes(df_with_lazy_row_names) + expect_false(lazy_character_is_materialized(attributes$row.names)) + + # `data` row names should also propagate into the result unmaterialized + attributes <- dplyr_attributes(x) + expect_false(lazy_character_is_materialized(attributes$row.names)) + + # Ensure `template` row names aren't materialized + x <- dplyr_reconstruct(df, df_with_lazy_row_names) + attributes <- dplyr_attributes(df_with_lazy_row_names) + expect_false(lazy_character_is_materialized(attributes$row.names)) +})