Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add method #307

Merged
merged 4 commits into from
Jul 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: tidybulk
Title: Brings transcriptomics to the tidyverse
Version: 1.17.2
Version: 1.17.3
Authors@R: c(person("Stefano", "Mangiola", email = "[email protected]",
role = c("aut", "cre")),
person("Maria", "Doyle", email = "[email protected]",
Expand Down
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ export(quantile_normalise_abundance)
export(reduce_dimensions)
export(remove_redundancy)
export(rename)
export(resolve_complete_confounders_of_non_interest)
export(right_join)
export(rotate_dimensions)
export(rowwise)
Expand Down Expand Up @@ -141,9 +142,11 @@ importFrom(purrr,map2_dfc)
importFrom(purrr,map2_dfr)
importFrom(purrr,map_chr)
importFrom(purrr,map_dfr)
importFrom(purrr,map_int)
importFrom(purrr,map_lgl)
importFrom(purrr,reduce)
importFrom(purrr,when)
importFrom(rlang,"!!")
importFrom(rlang,":=")
importFrom(rlang,dots_list)
importFrom(rlang,dots_values)
Expand All @@ -159,6 +162,7 @@ importFrom(rlang,quo_is_symbol)
importFrom(rlang,quo_is_symbolic)
importFrom(rlang,quo_name)
importFrom(rlang,quo_squash)
importFrom(rlang,set_names)
importFrom(scales,extended_breaks)
importFrom(scales,label_scientific)
importFrom(scales,log_breaks)
Expand Down
24 changes: 13 additions & 11 deletions R/functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -465,26 +465,28 @@ get_differential_transcript_abundance_bulk <- function(.data,
data = df_for_edgeR %>% select(!!.sample, any_of(parse_formula(.formula))) %>% distinct %>% arrange(!!.sample)
)

# # Print the design column names in case I want contrasts
# message(
# sprintf(
# "tidybulk says: The design column names are \"%s\"",
# design %>% colnames %>% paste(collapse = ", ")
# )
# )

# Replace `:` with ___ because it creates error with edgeR
if(design |> colnames() |> str_detect(":") |> any()) {
message("tidybulk says: the interaction term `:` has been replaced with `___` in the design matrix, in order to work with edgeR.")
colnames(design) = design |> colnames() |> str_replace(":", "___")
}

# Print the design column names in case I want contrasts
message(
sprintf(
"tidybulk says: The design column names are \"%s\"",
design %>% colnames %>% paste(collapse = ", ")
)
)


# Specify the design column tested
if(is.null(.contrasts))
message(
sprintf(
"tidybulk says: The design column being tested is %s",
design %>% colnames %>% .[1]
design %>% colnames %>% .[2]
)
)

Expand Down Expand Up @@ -770,7 +772,7 @@ get_differential_transcript_abundance_glmmSeq <- function(.data,
object = .formula |> lme4::nobars(),
data = metadata
)

if(quo_is_symbolic(.dispersion))
dispersion = .data |> pivot_transcript(!!.transcript) |> select(!!.transcript, !!.dispersion) |> deframe()
else
Expand All @@ -787,8 +789,8 @@ get_differential_transcript_abundance_glmmSeq <- function(.data,

# Scaling
sizeFactors <- counts |> edgeR::calcNormFactors(method = scaling_method)


glmmSeq_object =
glmmSeq( .formula,
countdata = counts ,
Expand Down
91 changes: 87 additions & 4 deletions R/functions_SE.R
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,7 @@ get_reduced_dimensions_TSNE_bulk_SE <-
#' @param of_samples A boolean
#' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity
#' @param calculate_for_pca_dimensions An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered
#' @param ... Further parameters passed to the function uwot
#' @param ... Further parameters passed to the function uwot::tumap
#'
#' @return A tibble with additional columns
#'
Expand Down Expand Up @@ -1145,7 +1145,7 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data,
object = .formula |> lme4::nobars(),
data = metadata
)

if(quo_is_symbolic(.dispersion))
dispersion = rowData(.data)[,quo_name(.dispersion),drop=FALSE] |> as_tibble(rownames = feature__$name) |> deframe()
else
Expand All @@ -1162,8 +1162,8 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data,

# Scaling
sizeFactors <- counts |> edgeR::calcNormFactors(method = scaling_method)


glmmSeq_object =
glmmSeq( .formula,
countdata = counts ,
Expand Down Expand Up @@ -1496,3 +1496,86 @@ univariable_differential_tissue_composition_SE = function(

unnest(surv_test, keep_empty = TRUE)
}


#' Resolve Complete Confounders of Non-Interest
#'
#' This function processes a SummarizedExperiment object to handle confounders
#' that are not of interest in the analysis. It deals with two factors, adjusting
#' the data by nesting and summarizing over these factors.
#'
#' @importFrom rlang enquo
#' @importFrom rlang quo_name
#' @importFrom rlang !!
#' @importFrom tidyr unnest
#' @importFrom tidyr nest
#' @importFrom dplyr mutate
#' @importFrom dplyr arrange
#' @importFrom dplyr slice
#' @importFrom dplyr pull
#' @importFrom dplyr select
#' @importFrom purrr map_int
#' @importFrom dplyr if_else
#'
#' @param se A SummarizedExperiment object that contains the data to be processed.
#' @param .factor_1 A symbol or quosure representing the first factor variable in `se`.
#' @param .factor_2 A symbol or quosure representing the second factor variable in `se`.
#' @return A modified SummarizedExperiment object with confounders resolved.
#' @examples
#' # Not run:
#' # se is a SummarizedExperiment object
#' resolve_complete_confounders_of_non_interest(se, .factor_1 = factor1, .factor_2 = factor2)
#' @noRd
resolve_complete_confounders_of_non_interest_pair_SE <- function(se, .factor_1, .factor_2){

.factor_1 <- enquo(.factor_1)
.factor_2 <- enquo(.factor_2)

cd =
colData(se) |>
as_tibble() |>
rowid_to_column() |>
distinct(rowid, !!.factor_1, !!.factor_2) |>

nest(se_data = -c(!!.factor_1, !!.factor_2)) |>
nest(data = -!!.factor_1) |>
mutate(n1 = map_int(data, ~ .x |> distinct(!!.factor_2) |> nrow())) |>
unnest(data) |>

# Nest data excluding .factor_2 and count distinct .factor_1 values
nest(data = -!!.factor_2) |>
mutate(n2 = map_int(data, ~ .x |> distinct(!!.factor_1) |> nrow())) |>
unnest(data)

# Choose a dummy value for .factor_2 based on sorting by n1 + n2
dummy_factor_2 <- cd |> arrange(desc(n1 + n2)) |> slice(1) |> pull(!!.factor_2)

# Messages if I have confounders
if(cd |> filter(n1 + n2 < 3) |> nrow() > 0){

message(sprintf("tidybulk says: IMPORTANT! the columns %s and %s, have been corrected for complete confounders and now are NOT interpretable, and cannot be used in hypothesis testing.", quo_name(.factor_1), quo_name(.factor_2)))

message(sprintf(
"tidybulk says: The value(s) %s in column %s from sample(s) %s, has been changed to %s.",
cd |> filter(n1 + n2 < 3) |> pull(!!.factor_2),
quo_name(.factor_2),
cd |> filter(n1 + n2 < 3) |> pull(se_data) |> map(colnames) |> unlist(),
dummy_factor_2
))

# Replace .factor_2 with dummy_factor_2 where n1 + n2 is less than 3 and unnest
cd = cd |>
mutate(!!.factor_2 := if_else(n1 + n2 < 3, dummy_factor_2, !!.factor_2))
}

colData(se)[,c(quo_name(.factor_1), quo_name(.factor_2))] =
cd |>
unnest(se_data) |>
arrange(rowid) |>
select(!!.factor_1, !!.factor_2) |>
DataFrame()

se

}

65 changes: 46 additions & 19 deletions R/methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -1025,7 +1025,7 @@ setMethod("cluster_elements", "tidybulk", .cluster_elements)
#' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity
#' @param scale A boolean for method="PCA", this will be passed to the `prcomp` function. It is not included in the ... argument because although the default for `prcomp` if FALSE, it is advisable to set it as TRUE.
#' @param action A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get).
#' @param ... Further parameters passed to the function prcomp if you choose method="PCA" or Rtsne if you choose method="tSNE"
#' @param ... Further parameters passed to the function prcomp if you choose method="PCA" or Rtsne if you choose method="tSNE", or uwot::tumap if you choose method="umap"
#'
#' @param log_transform DEPRECATED - A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)
#'
Expand Down Expand Up @@ -1154,14 +1154,14 @@ setGeneric("reduce_dimensions", function(.data,
# adjust top for the max number of features I have
if(top > .data |> distinct(!!.feature) |> nrow()){
warning(sprintf(
"tidybulk says: the \"top\" argument %s is higher than the number of features %s",
top,
"tidybulk says: the \"top\" argument %s is higher than the number of features %s",
top,
.data |> distinct(!!.feature) |> nrow()
))

top = min(top, .data |> distinct(!!.feature) |> nrow())
}

# Validate data frame
if(do_validate()) {
validation(.data, !!.element, !!.feature, !!.abundance)
Expand Down Expand Up @@ -2611,7 +2611,7 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol)
#' All methods use raw counts, irrespective of if scale_abundance or adjust_abundance have been calculated, therefore it is essential to add covariates such as batch effects (if applicable) in the formula.
#'
#' Underlying method for edgeR framework:
#'
#'
#' .data |>
#'
#' # Filter
Expand All @@ -2638,7 +2638,7 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol)
#'
#'
#' Underlying method for DESeq2 framework:
#'
#'
#' keep_abundant(
#' factor_of_interest = !!as.symbol(parse_formula(.formula)[[1]]),
#' minimum_counts = minimum_counts,
Expand All @@ -2657,25 +2657,25 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol)
#' counts =
#' .data %>%
#' assay(my_assay)
#'
#'
#' # Create design matrix for dispersion, removing random effects
#' design =
#' model.matrix(
#' object = .formula |> lme4::nobars(),
#' data = metadata
#' )
#'
#'
#' dispersion = counts |> edgeR::estimateDisp(design = design) %$% tagwise.dispersion |> setNames(rownames(counts))
#'
#'
#' glmmSeq( .formula,
#' countdata = counts ,
#' metadata = metadata |> as.data.frame(),
#' dispersion = dispersion,
#' progress = TRUE,
#' method = method |> str_remove("(?i)^glmmSeq_" ),
#' )
#'
#'
#'
#'
#' @return A consistent object (to the input) with additional columns for the statistics from the test (e.g., log fold change, p-value and false discovery rate).
#'
#'
Expand Down Expand Up @@ -3980,12 +3980,12 @@ setGeneric("test_gene_rank", function(.data,

# DEPRECATION OF reference function
if (is_present(.sample) & !is.null(.sample)) {

# Signal the deprecation to the user
deprecate_warn("1.13.2", "tidybulk::test_gene_rank(.sample = )", details = "The argument .sample is now deprecated and not needed anymore.")

}

# Get column names
.arrange_desc = enquo(.arrange_desc)
.entrez = enquo(.entrez)
Expand Down Expand Up @@ -4023,14 +4023,14 @@ setGeneric("test_gene_rank", function(.data,

.data |>
select(!!.entrez, !!.arrange_desc) |>
distinct() |>
distinct() |>

# Select one entrez - NEEDED?
with_groups(c(!!.entrez,!!.arrange_desc ), slice, 1) |>
with_groups(c(!!.entrez,!!.arrange_desc ), slice, 1) |>

# arrange
# arrange
arrange(desc(!!.arrange_desc)) |>

# Format
deframe() |>
entrez_rank_to_gsea(species, gene_collections = gene_sets ) |>
Expand Down Expand Up @@ -4967,3 +4967,30 @@ as_matrix <- function(tbl,
}


#' Resolve Complete Confounders of Non-Interest
#'
#' This generic function processes a SummarizedExperiment object to handle confounders
#' that are not of interest in the analysis. It dynamically handles combinations
#' of provided factors, adjusting the data by nesting and summarizing over these factors.
#'
#'
#' @param se A SummarizedExperiment object that contains the data to be processed.
#' @param ... Arbitrary number of factor variables represented as symbols or quosures
#' to be considered for resolving confounders. These factors are processed
#' in combinations of two.
#'
#' @rdname resolve_complete_confounders_of_non_interest-methods
#'
#' @return A modified SummarizedExperiment object with confounders resolved.
#'
#' @examples
#' # Not run:
#' # se is a SummarizedExperiment object
#' # resolve_complete_confounders_of_non_interest(se, factor1, factor2, factor3)
#' @export
setGeneric("resolve_complete_confounders_of_non_interest", function(se, ...) {
standardGeneric("resolve_complete_confounders_of_non_interest")
})



Loading
Loading