+pbDD

HelenaLC · Sep 30, 2024 · d7725a2 · d7725a2
1 parent b9fcf7f
commit d7725a2
Show file tree

Hide file tree

Showing 13 changed files with 525 additions and 63 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -16,6 +16,9 @@ Authors@R: c(
 	person("Pierre-Luc", "Germain", role="aut"),
 	person("Charlotte", "Soneson", role="aut"),
 	person("Anthony", "Sonrel", role="aut"),
+	person("Jeroen", "Gilis", role="aut"),
+	person("Davide", "Risso", role="aut"),
+	person("Lieven", "Clement", role="aut"),
 	person("Mark D.", "Robinson", role=c("aut", "fnd"), 
 	    email="[email protected]"))
 Imports: 
@@ -33,22 +36,23 @@ Imports:
 Suggests: 
 	BiocStyle,
 	countsimQC,
-	cowplot, 
 	ExperimentHub, 
 	iCOBRA,
 	knitr, 
+	patchwork,
 	phylogram,
 	RColorBrewer,
 	reshape2, 
 	rmarkdown,
 	statmod,
+	stageR,
 	testthat, 
 	UpSetR
 biocViews: ImmunoOncology, DifferentialExpression, Sequencing, 
     SingleCell, Software, StatisticalMethod, Visualization
 License: GPL-3
 VignetteBuilder: knitr
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
 Encoding: UTF-8
 URL: https://github.com/HelenaLC/muscat
 BugReports: https://github.com/HelenaLC/muscat/issues
diff --git a/NAMESPACE b/NAMESPACE
@@ -3,6 +3,7 @@
 export(aggregateData)
 export(calcExprFreqs)
 export(mmDS)
+export(pbDD)
 export(pbDS)
 export(pbFlatten)
 export(pbHeatmap)
@@ -11,6 +12,7 @@ export(prepSCE)
 export(prepSim)
 export(resDS)
 export(simData)
+export(stagewise_DS_DD)
 import(ggplot2)
 importFrom(BiocParallel,MulticoreParam)
 importFrom(BiocParallel,SerialParam)
@@ -101,6 +103,7 @@ importFrom(lmerTest,contest)
 importFrom(lmerTest,lmer)
 importFrom(matrixStats,colAnys)
 importFrom(matrixStats,rowAnyNAs)
+importFrom(matrixStats,rowMedians)
 importFrom(matrixStats,rowMins)
 importFrom(matrixStats,rowQuantiles)
 importFrom(matrixStats,rowSds)

diff --git a/R/pbDS.R b/R/pbDS.R
@@ -20,7 +20,7 @@
 #'   Can be a list for multiple, independent comparisons.
 #' @param min_cells a numeric. Specifies the minimum number of cells in a given 
 #'   cluster-sample required to consider the sample for differential testing.
-#' @param filter characterstring specifying whether
+#' @param filter character string specifying whether
 #'   to filter on genes, samples, both or neither.
 #' @param treat logical specifying whether empirical Bayes moderated-t 
 #'   p-values should be computed relative to a minimum fold-change threshold. 
@@ -82,16 +82,16 @@
 #' @export
 
 pbDS <- function(pb, 
-    method = c("edgeR", "DESeq2", "limma-trend", "limma-voom"),
-    design = NULL, coef  = NULL, contrast = NULL, min_cells = 10, 
-    filter = c("both", "genes", "samples", "none"), treat = FALSE, 
-    verbose = TRUE, BPPARAM = SerialParam(progressbar = verbose)) {
+    method=c("edgeR", "DESeq2", "limma-trend", "limma-voom", "DD"),
+    design=NULL, coef=NULL, contrast=NULL, min_cells=10, 
+    filter=c("both", "genes", "samples", "none"), treat=FALSE, 
+    verbose=TRUE, BPPARAM=SerialParam(progressbar=verbose)) {
 
     # check validity of input arguments
     args <- as.list(environment())
     method <- match.arg(method)
     filter <- match.arg(filter)
-    .check_pbs(pb, check_by = TRUE)
+    .check_pbs(pb, check_by=TRUE)
     .check_args_pbDS(args)
     stopifnot(is(BPPARAM, "BiocParallelParam"))
 
@@ -104,7 +104,7 @@ pbDS <- function(pb,
     }
     if (is.null(coef) & is.null(contrast)) {
         c <- colnames(design)[ncol(design)]
-        contrast <- makeContrasts(contrasts = c, levels = design)
+        contrast <- makeContrasts(contrasts=c, levels=design)
         args$contrast <- contrast
     }
 
@@ -117,18 +117,19 @@ pbDS <- function(pb,
         if (!is.list(coef)) 
             coef <- list(coef)
         cs <- vapply(coef, function(i)
-            paste(colnames(design)[i], collapse = "-"),
+            paste(colnames(design)[i], collapse="-"),
             character(1))
         names(cs) <- names(coef) <- cs
     }
     ct <- ifelse(is.null(coef), "contrast", "coef")
 
     if (!is.function(method)) {
         fun <- switch(method,
-            "DESeq2" = .DESeq2,
-            "edgeR" = .edgeR, 
-            "limma-trend" = .limma_trend, 
-            "limma-voom" = .limma_voom)
+            "DD"=.edgeR_NB,
+            "edgeR"=.edgeR, 
+            "DESeq2"=.DESeq2,
+            "limma-voom"=.limma_voom,
+            "limma-trend"=.limma_trend)
     } else {
         fun_call <- 1
     }
@@ -139,28 +140,30 @@ pbDS <- function(pb,
     n_cells <- .n_cells(pb)
     names(kids) <- kids <- assayNames(pb)
     res <- bplapply(
-        BPPARAM = BPPARAM, 
+        BPPARAM=BPPARAM, 
         kids, function (k) {
         rmv <- n_cells[k, ] < min_cells
-        d <- design[colnames(y <- pb[ , !rmv]), , drop = FALSE]
+        d <- design[colnames(y <- pb[ , !rmv]), , drop=FALSE]
         if (filter %in% c("samples", "both")) {
             ls <- colSums(assay(y, k))
-            ol <- isOutlier(ls, log = TRUE, type = "lower", nmads = 3)
-            d <- d[colnames(y <- y[, !ol]), , drop = FALSE]
+            ol <- isOutlier(ls, log=TRUE, type="lower", nmads=3)
+            d <- d[colnames(y <- y[, !ol]), , drop=FALSE]
         }
         if (any(tabulate(y$group_id) < 2) 
-            || qr(d)$rank == nrow(d) 
+            || qr(d)$rank== nrow(d) 
             || qr(d)$rank < ncol(d)) 
             return(NULL)
-        y <- y[rowSums(assay(y, k)) != 0, , drop = FALSE]
+        y <- y[rowSums(assay(y, k)) != 0, , drop=FALSE]
         if (filter %in% c("genes", "both") & max(assay(y, k)) > 100) 
-            y <- y[filterByExpr(assay(y, k), d), , drop = FALSE]
+            y <- y[filterByExpr(assay(y, k), d), , drop=FALSE]
         # drop samples without any detected features
         keep <- colAnys(assay(y, k) > 0)
-        y <- y[, keep, drop = FALSE]
-        d <- d[keep, , drop = FALSE]
-        args <- list(x = y, k = k, design = d, coef = coef, 
-            contrast = contrast, ct = ct, cs = cs, treat = treat)
+        y <- y[, keep, drop=FALSE]
+        d <- d[keep, , drop=FALSE]
+        args <- list(
+            x=y, k=k, design=d, coef=coef, 
+            contrast=contrast, ct=ct, cs=cs,
+            treat=treat, nc=n_cells[k, !rmv])
         args <- args[intersect(names(args), fun_args)]
         suppressWarnings(do.call(fun, args))
     })
@@ -169,14 +172,24 @@ pbDS <- function(pb,
     rmv <- vapply(res, is.null, logical(1))
     res <- res[!rmv]
 
-    if (length(res) == 0) stop(
+    if (length(res)== 0) stop(
       "Specified filtering options result in no genes in any clusters ",
       "being tested. To force testing, consider modifying arguments ",
       "'min_cells' and/or 'filter'. See '?pbDS' for details.")
 
     # reorganize & do global p-value adjustment
     names(i) <- i <- c("table", "data", "fit")
-    res <- lapply(i, map, .x = res)
+    res <- lapply(i, map, .x=res)
     res$table <- .p_adj_global(res$table)
-    return(c(res, list(args = args)))
+    return(c(res, list(args=args)))
+}
+
+#' @rdname pbDS
+#' @export
+pbDD <- function(pb, design=NULL, coef=NULL, contrast=NULL, 
+    min_cells=10, filter=c("both", "genes", "samples", "none"), 
+    verbose=TRUE, BPPARAM=SerialParam(progressbar=verbose)) 
+{
+    args <- as.list(environment())
+    do.call(pbDS, c(args, list(method="DD")))
 }
diff --git a/R/stagewiseDD.R b/R/stagewiseDD.R
@@ -0,0 +1,121 @@
+#' @importFrom dplyr bind_rows
+.res_DX <- function(res_DS, res_DD) {
+    # for each contrast...
+    names(cts) <- cts <- names(res_DS$table) 
+    lapply(cts, function(ct) { 
+        # for each cluster...
+        names(kids) <- kids <- names(res_DS$table[[ct]]) 
+        lapply(kids, function(kid) { 
+            # get DS/DD results
+            DS <- res_DS$table[[ct]][[kid]]
+            DD <- res_DD$table[[ct]][[kid]]
+            # add missing genes
+            DS <- bind_rows(DS, data.frame(gene=setdiff(DD$gene, DS$gene)))
+            DD <- bind_rows(DD, data.frame(gene=setdiff(DS$gene, DD$gene)))
+            # reorder & return both
+            DD <- DD[match(DS$gene, DD$gene), ]
+            return(list(DS=DS, DD=DD))
+        })
+    })
+}
+
+#' @rdname stagewise_DS_DD
+#' @title Perform two-stage testing on DS and DD
+#'
+#' @param res_DS a list of DS testing results as returned 
+#'   by \code{\link{pbDS}} or \code{\link{mmDS}}.
+#' @param res_DD a list of DD testing results as returned 
+#'   by \code{\link{pbDD}} (or \code{\link{pbDS}} with \code{method="DD"}).
+#' @param sce (optional) \code{SingleCellExperiment} object containing the data 
+#'   that underlies testing, prior to summarization with \code{\link{aggregateData}}. 
+#'   Used for validation of inputs in order to prevent unexpected failure/results.
+#'
+#' @return 
+#' A list of \code{DFrame}s containing results for each contrast and cluster.
+#' Each table contains DS and DD results for genes shared between analyses,
+#' as well as results from stagewise testing analysis, namely:
+#' \itemize{
+#' \item{\code{p_adj}: FDR adjusted p-values for the
+#'   screening hypothesis that a gene is neither DS nor DD
+#'   (see \code{?stageR::getAdjustedPValues} for details)}
+#' \item{\code{p_val.DS/D}: confirmation stage p-values for DS/D}}
+#'
+#' @examples
+#' data(example_sce)
+#' 
+#' pbs_sum <- aggregateData(example_sce, assay="counts", fun="sum")
+#' pbs_det <- aggregateData(example_sce, assay="counts", fun="num.detected")
+#'   
+#' res_DS <- pbDS(pbs_sum, min_cells=0, filter="none", verbose=FALSE)
+#' res_DD <- pbDD(pbs_det, min_cells=0, filter="none", verbose=FALSE)
+#' 
+#' res <- stagewise_DS_DD(res_DS, res_DD)
+#' head(res[[1]][[1]]) # results for 1st cluster
+#'
+#' @importFrom S4Vectors DataFrame
+#' @importFrom purrr map_depth
+#' @export
+
+stagewise_DS_DD <- function(res_DS, res_DD, sce=NULL, verbose=FALSE) {
+    if (!requireNamespace("stageR", quietly=TRUE))
+        stop("Install 'stageR' to use this function.")
+
+    # validity checks
+    # TODO: helper to check validity of 'res_DS/D'
+    # against each other and, optionally, 'sce'
+    stopifnot(
+        # same coefs/constrasts
+        names(x <- res_DS$table) == 
+        names(y <- res_DD$table), 
+        # any shared clusters
+        sum(mapply(\(i, j) 
+            length(intersect(i, j)), 
+            i=lapply(x, names), 
+            j=lapply(y, names))) > 0)
+    if (!is.null(sce)) {
+        .check_sce(sce)
+        . <- map_depth(list(x, y), 3, \(df) df$gene %in% rownames(sce))
+        stopifnot("gene(s) present in 'res_DS/D' not found in 'sce'"=unlist(.))
+        . <- map_depth(list(x, y), 3, \(df) df$cluster_id %in% sce$cluster_id)
+        stopifnot("cluster(s) present in 'res_DS/D' not found in 'sce'"=unlist(.))
+    }
+
+    # assure that results contain same set of genes, in the same order
+    # (indepedent of different filtering criteria for the two analyses)
+    res_DX <- .res_DX(res_DS=res_DS, res_DD=res_DD)
+
+    # perform harmonic mean p-value aggregation according to
+    # (https://www.pnas.org/doi/full/10.1073/pnas.1814092116)
+    .mu <- \(x) 1/mean(1/x, na.rm=TRUE)
+
+    # perform stagewise testing
+    res <- map_depth(res_DX, 2, \(x) {
+        ps <- data.frame(
+            p_val.DS=x$DS$p_val,
+            p_val.DD=x$DD$p_val,
+            row.names=x$DS$gene)
+        qs <- apply(ps, 1, .mu); names(qs) <- x$DS$gene
+        obj <- stageR::stageR(qs, as.matrix(ps), FALSE)
+        eva <- expression({
+            obj <- stageR::stageWiseAdjustment(obj, 
+                method="none", alpha=0.05, allowNA=TRUE)
+            res <- stageR::getAdjustedPValues(obj, 
+                onlySignificantGenes=FALSE, order=FALSE)
+        })
+        res <- if (verbose) eval(eva) else suppressMessages({eval(eva)})
+        # TODO: communicate this better with the user?
+        if (is.null(res)) res <- NA
+        colnames(res)[1] <- "p_adj"
+        return(res)
+    })
+    names(cs) <- cs <- names(res)
+    lapply(cs, \(c) {
+        names(ks) <- ks <- names(res[[c]])
+        lapply(ks, \(k) {
+            gs <- rownames(df <- res[[c]][[k]])
+            res_DS <- I((. <- res_DS$table[[c]][[k]])[match(gs, .$gene), ])
+            res_DD <- I((. <- res_DD$table[[c]][[k]])[match(gs, .$gene), ])
+            DataFrame(gene=gs, df, cluster_id=k, contrast=c, res_DS, res_DD)
+        })
+    })
+}
diff --git a/R/utils-pbDS.R b/R/utils-pbDS.R
@@ -89,6 +89,40 @@
     list(table = tbl, data = y, fit = fit)
 }
 
+#' @importFrom matrixStats rowMedians
+.edgeR_NB <- \(x, k, design, coef, contrast, ct, cs, nc) {
+    y <- assay(x, k)
+    # Gene_level filtering to remove genes detected in
+    # almost all cells of almost all pseudobulk samples
+    med_detection <- rowMedians(sweep(y, 2, nc, "/"))
+    gene_filter <- med_detection < 0.9
+    # Normalization offset to remove systematic differences between pseudobulk 
+    # samples that are due to technical or nuisance biological variability. 
+    # Idea obtained from cellular detection rate (CDR) normalization from MAST. 
+    # Note that this normalization is used instead of 'edgeR::calcNormFactors()'.
+    of <- colMeans(sweep(y[gene_filter, ], 2, nc, "/"))
+    # construct 'DGEList'
+    y <- suppressMessages(DGEList(
+        counts = y[gene_filter, ],
+        group = x$group_id[colnames(y)],
+        remove.zeros = TRUE))
+    # add offsets to 'DGEList'
+    y$offset <- log(nc * of) 
+    # run an 'edgeR' analysis
+    y <- estimateDisp(y, design)
+    fit <- glmQLFit(y, design, robust = TRUE)
+    tbl <- lapply(cs, function(c) {
+        fit <- glmQLFTest(fit,
+            coef[[c]],
+            contrast[, c],
+            poisson.bound = FALSE) 
+        tbl <- topTags(fit, n = Inf, sort.by = "none")
+        tbl <- rename(tbl$table, p_val = "PValue", p_adj.loc = "FDR") 
+        tbl <- .res_df(tbl, k, ct, c)
+    })
+    list(table = tbl, data = y, fit = fit)
+}
+
 #' @importFrom dplyr rename
 #' @importFrom edgeR calcNormFactors DGEList
 #' @importFrom limma contrasts.fit eBayes lmFit topTable topTreat voom treat