From 6b510b6a253a485300aeb48672afa9675cbc5285 Mon Sep 17 00:00:00 2001 From: jmzeng1314 Date: Thu, 28 Mar 2019 15:07:44 +0800 Subject: [PATCH] compare --- compare-4-packages/README.md | 2 + compare-4-packages/compare.Rmd | 883 +++++++++++++++++++ compare-4-packages/compare.Rproj | 13 + compare-4-packages/compare.html | 1369 ++++++++++++++++++++++++++++++ 4 files changed, 2267 insertions(+) create mode 100644 compare-4-packages/README.md create mode 100644 compare-4-packages/compare.Rmd create mode 100644 compare-4-packages/compare.Rproj create mode 100644 compare-4-packages/compare.html diff --git a/compare-4-packages/README.md b/compare-4-packages/README.md new file mode 100644 index 0000000..83d625b --- /dev/null +++ b/compare-4-packages/README.md @@ -0,0 +1,2 @@ +### 比较4个R包 + diff --git a/compare-4-packages/compare.Rmd b/compare-4-packages/compare.Rmd new file mode 100644 index 0000000..84e6889 --- /dev/null +++ b/compare-4-packages/compare.Rmd @@ -0,0 +1,883 @@ +--- +title: "compare 4 R packages for scRNAseq" +author: "jmzeng1314@163.com" +date: "`r format(Sys.time(), '%d %B, %Y')`" +output: + html_document: + toc: yes +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +knitr::opts_chunk$set(warning = F) +knitr::opts_chunk$set(message = F) +``` + +## 安装并且加载必要的包 + +需要自行下载安装一些必要的R包,包括我们的测试数据集来源的R包`scRNAseq` ,以及4个单细胞转录组数据处理包! + +因为大量学员在中国大陆,所以需要特别的R包安装方法,就是切换镜像后再下载R包。参考:http://www.bio-info-trainee.com/3727.html + +```{r} +options()$repos ## 查看使用install.packages安装时的默认镜像 +options()$BioC_mirror ##查看使用bioconductor的默认镜像 +options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/") ##指定镜像,这个是中国科技大学镜像 +options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/")) ##指定install.packages安装镜像,这个是清华镜像 +options()$repos +options()$BioC_mirror +if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") +if(!require('Seurat')){ + BiocManager::install('Seurat',ask = F,update = F) +} +if(!require('scater')){ + BiocManager::install(c( 'scater'),ask = F,update = F) +} +if(!require('monocle')){ + BiocManager::install(c( 'monocle'),ask = F,update = F) +} +if(!require('scRNAseq')){ + BiocManager::install(c( 'scRNAseq'),ask = F,update = F) +} +if(!require('SC3')){ + BiocManager::install(c( 'SC3'),ask = F,update = F) +} +if(!require('M3Drop')){ + BiocManager::install(c( 'M3Drop'),ask = F,update = F) +} +if(!require('ggpubr')){ + BiocManager::install(c( 'ggpubr'),ask = F,update = F) +} +``` + +安装成功后就可以加载R包: + +```{r, message=FALSE} +rm(list = ls()) +options(warn=-1) +suppressMessages(library(scater)) +suppressMessages(library(Seurat)) +suppressMessages(library(monocle)) +suppressMessages(library(scRNAseq)) +suppressMessages(library(SC3)) +suppressMessages(library(M3Drop)) +``` + +## 简单了解数据集 + +这个包内置的是 Pollen et al. 2014 数据集,人类单细胞细胞,分成**4类**,分别是 pluripotent stem cells 分化而成的 neural progenitor cells (“NPC”) ,还有 “GW16” and “GW21” ,“GW21+3” 这种孕期细胞,理解这些需要一定的生物学背景知识,如果不感兴趣,可以略过。 + +这个数据集很出名,截止2019年1月已经有近400的引用了,后面的人开发`R包算法`都会在其上面做测试,本例子只使用了数据集的**4种细胞类型**而已,因为 scRNAseq 这个R包就提供了这些,完整的数据是 23730 features, +301 samples, 地址为 , 这个网站非常值得推荐,简直是一个宝藏。 + + +这里面的表达矩阵是由 RSEM (Li and Dewey 2011) 软件根据 hg38 RefSeq transcriptome 得到的,总是130个文库,每个细胞测了两次,测序深度不一样(非常的不一样) + +```{r} +library(scRNAseq) +## ----- Load Example Data ----- +data(fluidigm) +ct <- floor(assays(fluidigm)$rsem_counts) +ct[1:4,1:4] +sample_ann <- as.data.frame(colData(fluidigm)) +DT::datatable(sample_ann, + + rownames= FALSE,extensions = c('Scroller'), + options = list( + pageLength = 10, + lengthMenu = list(c(10, 50, 100,-1), c('10', '50','100', 'All')), + columnDefs = list(list(className = 'dt-center', targets = 0 :8)), + scrollX = TRUE, + fixedHeader = TRUE, + fixedColumns = TRUE , + deferRender = TRUE +), +filter = 'top', +escape = FALSE +) +``` + +### 先探索表型信息 + +前面说到,这个数据集是130个文库,每个细胞测了两次,测序深度不一样,这130个细胞,分成4类,分别是: pluripotent stem cells 分化而成的 neural progenitor cells (“NPC”) ,还有 “GW16” and “GW21” ,“GW21+3” 这种孕期细胞。 + +批量,粗略的看一看各个细胞的一些统计学指标的分布情况 + +```{r fig.width=10, fig.height=15} +library(ggpubr) +box <- lapply(colnames(sample_ann[,1:19]),function(i) { + dat <- sample_ann[,i,drop=F] + dat$sample=rownames(dat) + dat$group='all cells' + ## 画boxplot + p <- ggboxplot(dat, x = "group", y = i, + add = "jitter" ) + p +}) +plot_grid(plotlist=box, ncol=5 ) +# ggsave(file="stat_all_cells.pdf") +``` + +很明显,他们可以有根据来进行分组,这里不再演示。 不过通常的文章并不会考虑如此多的细节,这里重点是批量,代码技巧非常值得你们学校。 + +因为进行了简单探索,对表型数据就有了把握,接下来可以进行一定程度的过滤,因为细节太多,这里重点是批量,代码技巧非常值得你们学校。 + +```{r} +pa <- colnames(sample_ann[,c(1:9,11:16,18,19)]) +tf <- lapply(pa,function(i) { + # i=pa[1] + dat <- sample_ann[,i] + dat <- abs(log10(dat)) + fivenum(dat) + (up <- mean(dat)+2*sd(dat)) + (down <- mean(dat)- 2*sd(dat) ) + valid <- ifelse(dat > down & dat < up, 1,0 ) +}) + +tf <- do.call(cbind,tf) +choosed_cells <- apply(tf,1,function(x) all(x==1)) +table(sample_ann$Biological_Condition) +sample_ann=sample_ann[choosed_cells,] +table(sample_ann$Biological_Condition) +ct <- ct[,choosed_cells] +``` + + +### 再探索基因表达情况 + +```{r} +ct[1:4,1:4] +counts <- ct +fivenum(apply(counts,1,function(x) sum(x>0) )) +boxplot(apply(counts,1,function(x) sum(x>0) )) +fivenum(apply(counts,2,function(x) sum(x>0) )) +hist(apply(counts,2,function(x) sum(x>0) )) +choosed_genes=apply(counts,1,function(x) sum(x>0) )>0 +table(choosed_genes) +counts <- counts[choosed_genes,] +``` + +## 单细胞转录组分析流程介绍 + +注意,这里的 seurat是2.x版本,同理,monocle也是2版本。 + + +| 用法 | Seurat | scater | monocle | +| ------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | +| 创建R包要求的对象 | CreateSeuratObject | SingleCellExperiment | newCellDataSet | +| QC and selecting cell | 创建矩阵的同时,可以选择过滤参数min.cell,min.gene等。还有FilterCells函数可以去除不合格的细胞。 | calculateQCMetrics()函数,其中的feature_controls参数可以指定过滤指标,然后有一系列的可视化函数 | 用基础R函数进行初步过滤,还可以用detectGenes()函数加上subset()过滤 | +| 表达量的标准化或者归一化 | NormalizeData | calculateCPM()等系列函数 | estimateSizeFactors()还有estimateDispersions | +| 寻找重要的基因 | FindVariableGenes | 没有看到专门的函数,可以借用R基础函数 | differentialGeneTest()函数 | +| 去除干扰因素 | ScaleData | 借用limma包的 removeBatchEffect 函数 | 去除干扰因素的功能被包装在降维函数中 | +| 降维 | RunPCA或者RunTSNE | runPCA或者runTSNE,runDiffusionMap | reduceDimension函数,可以选择多种参数 | +| 降维后可视化 | VizPCA和PCElbowPlot;PCAPlot或者TSNEPlot | plotPCA和plotTSNE等等 | plot_cell_trajectory()或plot_genes_in_pseudotime | +| 分类群cluster | FindClusters | 并没有包装聚类函数,而且辅助其它R包,或者R基础函数 | clusterCells | +| 聚类后找每个细胞亚群的标志基因 | FindMarkers和FindAllMarkers函数 | 借助SC3包 | newCellTypeHierarchy classifyCells +calculateMarkerSpecificity | + + +### step1: 创建对象 + +首先为 scater 包创建对象 + +```{r} +pheno_data <- as.data.frame(colData(fluidigm)) +ct <- floor(assays(fluidigm)$rsem_counts) +## 这里需要把Pollen的表达矩阵做成我们的 scater 要求的对象 +# data("sc_example_counts") +# data("sc_example_cell_info") +# 你也可以尝试该R包自带的数据集。 +# 参考 https://bioconductor.org/packages/release/bioc/vignettes/scater/inst/doc/vignette-intro.R +sce_scater <- SingleCellExperiment( + assays = list(counts = ct), + colData = pheno_data + ) +sce_scater +# 后面所有的分析都是基于 sce_scater 这个变量 +# 是一个 SingleCellExperiment 对象,被很多单细胞R包采用。 +``` + +然后为 seurat 包创建对象 + +```{r} +meta <- as.data.frame(colData(fluidigm)) +ct <- floor(assays(fluidigm)$rsem_counts) +counts <- ct +sce_seurat <- CreateSeuratObject(raw.data = counts, + meta.data =meta, + min.cells = 3, + min.genes = 200, + project = "Pollen") +sce_seurat +## 后续所有的分析都基于这个 sce_seurat 变量,是一个对象 +``` + +最后为 monocle 包创建对象 + +```{r} +ct <- floor(assays(fluidigm)$rsem_counts) +gene_ann <- data.frame( + gene_short_name = row.names(ct), + row.names = row.names(ct) +) +sample_ann=as.data.frame(colData(fluidigm)) +pd <- new("AnnotatedDataFrame", + data=sample_ann) +fd <- new("AnnotatedDataFrame", + data=gene_ann) +sce_monocle <- newCellDataSet( + ct, + phenoData = pd, + featureData =fd, + expressionFamily = negbinomial.size(), + lowerDetectionLimit=1) +sce_monocle +``` + +再次回顾一下这3个对象。 + +```{r} +sce_seurat +sce_scater +sce_monocle +``` + + +### step2: 质量控制 + +在 `seurat` 包, + +```{r} +mito.genes <- grep(pattern = "^MT-", + x = rownames(sce_seurat@data), + value = TRUE) +# 恰好这个例子的表达矩阵里面没有线粒体基因 +percent.mito <- Matrix::colSums(sce_seurat@raw.data[mito.genes, ]) / Matrix::colSums(sce_seurat@raw.data) +## 也可以加入很多其它属性,比如 ERCC 等。 + +# AddMetaData adds columns to object@meta.data, and is a great place to stash QC stats +sce_seurat <- AddMetaData(object = sce_seurat, + metadata = percent.mito, + col.name = "percent.mito") +VlnPlot(object = sce_seurat, + features.plot = c("nGene", "nUMI", "percent.mito"), + group.by = 'Biological_Condition', nCol = 3) +GenePlot(object = sce_seurat, gene1 = "nUMI", gene2 = "nGene") +CellPlot(sce_seurat, + sce_seurat@cell.names[3], + sce_seurat@cell.names[4], + do.ident = FALSE) +# FilterCells函数 +# sce_seurat +# sce_seurat <- FilterCells(object = sce_seurat, +# subset.names = c("nGene", "percent.mito"), +# low.thresholds = c(200, -Inf), +# high.thresholds = c(2500, 0.05)) +# sce_seurat +``` + + +在scater包, + +这里仅仅是演示 scater 包最简单的质量控制代码,详细代码见:https://bioconductor.org/packages/release/bioc/vignettes/scater/inst/doc/vignette-qc.html + +```{r} +genes <- rownames(rowData(sce_scater)) +genes[grepl('^MT-',genes)] +genes[grepl('^ERCC-',genes)] +exprs(sce_scater) <- log2( + calculateCPM(sce_scater ) + 1) +keep_feature <- rowSums(exprs(sce_scater) > 0) > 0 +table(keep_feature) +sce_scater <- sce_scater[keep_feature,] +sce_scater + +sce_scater <- calculateQCMetrics(sce_scater) +plotHighestExprs(sce_scater, exprs_values = "counts") +plotExprsFreqVsMean(sce_scater) +``` + +在`monocle`包, + +```{r} +sce_monocle +## 起初是 +sce_monocle <- detectGenes(sce_monocle, min_expr = 0.1) +print(head(fData(sce_monocle))) +expressed_genes <- row.names(subset(fData(sce_monocle), + num_cells_expressed >= 5)) +length(expressed_genes) +sce_monocle <- sce_monocle[expressed_genes,] +sce_monocle +# 过滤基因后是 +tmp=pData(sce_monocle) +fivenum(tmp[,1]) +fivenum(tmp[,30]) +# 这里并不需要过滤细胞,如果想过滤,就自己摸索阈值,然后挑选细胞即可。 +# 这里留下来了所有的细胞。 +valid_cells <- row.names(pData(sce_monocle) ) +sce_monocle <- sce_monocle[,valid_cells] +sce_monocle +``` + + +### step3: 表达量的标准化和归一化 + +在 seurat包需要先假定平均细胞测序文库大小,这里是 10000 + +```{r} +sce_seurat <- NormalizeData(object = sce_seurat, + normalization.method = "LogNormalize", + scale.factor = 10000, + display.progress = F) +``` + +在scater包: + +```{r} +assays(sce_scater) + +counts(sce_scater) <- assays(sce_scater)$counts +norm_exprs(sce_scater) <- log2(calculateCPM(sce_scater, use_size_factors = FALSE) + 1) + +stand_exprs(sce_scater) <- log2(calculateCPM(sce_scater, use_size_factors = FALSE) + 1) + +tpm(sce_scater) <- calculateTPM(sce_scater, effective_length = 5e4) + +cpm(sce_scater) <- calculateCPM(sce_scater, use_size_factors = FALSE) + +assays(sce_scater) +``` + +详细理论见:https://hemberg-lab.github.io/scRNA.seq.course/cleaning-the-expression-matrix.html#normalization-theory + +- 7.8.1 Raw +- 7.8.2 CPM +- 7.8.3 Size-factor (RLE) +- 7.8.4 Upperquantile +- 7.8.5 TMM +- 7.8.6 scran +- 7.8.7 Downsampling + +在`monocle`包, + +```{r} +colnames(phenoData(sce_monocle)@data) +sce_monocle <- estimateSizeFactors(sce_monocle) +sce_monocle <- estimateDispersions(sce_monocle) +colnames(phenoData(sce_monocle)@data) +``` + + +### step4: 去除干扰因素 + +在 seurat 包,去除一些文库大小,线粒体基因含量,ERCC含量等因素的功能被包装在 ScaleData 函数里面,前提是需要被去除的因素提供 AddMetaData 函数添加到了对象。 + +```{r } +sce_seurat <- ScaleData(object = sce_seurat, + vars.to.regress = c("nUMI"), + display.progress = F) +## 所有放在 vars.to.regress 参数的变量均可以被去除 +``` + +在scater包, 主要是可视化那些干扰因素: + +```{r} +sce_scater <- runPCA(sce_scater) +# colnames(colData(sce_scater)) +plotPCA( + sce_scater, + colour_by = "Biological_Condition", + size_by = "NALIGNED" +) +# 还有 plotQC 函数。 + +``` + +如果需要去除干扰因素,可以借用limma包的 removeBatchEffect 函数 + +```{r,eval=F} +library(limma) +batch <- rep(1:2, each=20) +corrected <- removeBatchEffect(logcounts(example_sce), block=batch) +assay(example_sce, "corrected_logcounts") <- corrected +``` + + + +在monocle包,去除干扰因素的功能被包装在降维函数中,示例如下: + +```{r,eval=FALSE} +# 放在 residualModelFormulaStr 里面的是需要去除的 +cds <- reduceDimension(cds, max_components = 2, num_dim = 2, + reduction_method = 'tSNE', + residualModelFormulaStr = "~Biological_Condition + num_genes_expressed", + verbose = T) +cds <- clusterCells(cds, num_clusters = 2) +plot_cell_clusters(cds, 1, 2, color = "Biological_Condition") + +## 上面去除了Biological_Condition,所以接下来聚类它们就被打散了。 +``` + + + +### step5: 判断重要的基因 + +寻找波动比较明显的基因,后续用这些基因而非全部基因进行分析,主要为了降低计算量。 + +在 `seurat` 包,必须要先 `normalization` ,然后才能进行`FindVariableGenes` 计算,代码如下: + +```{r} +sce_seurat <- FindVariableGenes(object = sce_seurat, + mean.function = ExpMean, + dispersion.function = LogVMR, + x.low.cutoff = 0.0125, + x.high.cutoff = 3, + y.cutoff = 0.5) +# 通过调整参数可以得到不同数量的 var.genes +length(sce_seurat@var.genes) +``` + +在`scater`包, 没有看到专门的函数,可以借用R基础函数。 + + +在 `monocle` 包中,同样也不是所有的基因都有作用,所以先进行挑选,合适的基因才在后续分析中用来降维聚类。 + +```{r} +disp_table <- dispersionTable(sce_monocle) +# 也可以先挑选差异基因 +unsup_clustering_genes <- subset(disp_table, mean_expression >= 0.1) +dim(unsup_clustering_genes) +sce_monocle <- setOrderingFilter(sce_monocle, + unsup_clustering_genes$gene_id) +plot_ordering_genes(sce_monocle) +plot_pc_variance_explained(sce_monocle, return_all = F) +# norm_method='log' +``` + +后面做降维的时候的 `num_dim` 参数选择基于上面的PCA图 + + +### step6: 多种降维算法 + +在 `seurat` 包, 降维之前必须要先 `Run ScaleData()` , 每个降维算法都被单独包装为函数了。 + +```{r} +sce_seurat <- RunPCA(object = sce_seurat, + pc.genes = sce_seurat@var.genes, + do.print = TRUE, + pcs.print = 1:5, + genes.print = 5) +sce_seurat@dr +tmp <- sce_seurat@dr$pca@gene.loadings +sce_seurat <- RunTSNE(object = sce_seurat, + dims.use = 1:10, + do.fast = TRUE, + perplexity=10) +sce_seurat@dr +``` + + +在`scater`包, + +```{r} +sce <- runPCA(sce_scater) +# 这里并没有进行任何基因的挑选,就直接进行了PCA,与 seurat包不一样。 +reducedDimNames(sce) +sce <- runPCA(sce, ncomponents=20) +# Perplexity of 10 just chosen here arbitrarily. +set.seed(1000) +# 这里的这个 perplexity 参数很重要 +sce <- runTSNE(sce, perplexity=30) +sce <- runDiffusionMap(sce) +reducedDimNames(sce) +sce_scater=sce +``` + + +在`monocle`包,降维函数就是 `reduceDimension` , 它包装这一个去除干扰因素的功能, 可供选择的降维算法包括: +`"DDRTree", + "ICA", "tSNE", "SimplePPT", "L1-graph", "SGL-tree"` + +```{r } +# 放在 residualModelFormulaStr 参数里面的是需要去除的 +sce_monocle <- reduceDimension(sce_monocle, + max_components = 2, num_dim = 2, + reduction_method = 'tSNE', + verbose = T) + +``` + + +### step7: 可视化降维结果 + +在 seurat 包, 两个降维算法被单独包装为两个函数,所以可视化也是两个函数。 + +```{r} +sce_seurat@dr +PCAPlot(sce_seurat, dim.1 = 1, dim.2 = 2, + group.by = 'Biological_Condition') +TSNEPlot(sce_seurat,group.by = 'Biological_Condition') + +VizPCA( sce_seurat, pcs.use = 1:2) +PCElbowPlot(object = sce_seurat) +sce_seurat <- ProjectPCA(sce_seurat, do.print = FALSE) +PCHeatmap(object = sce_seurat, + pc.use = 1, + cells.use = ncol(sce_seurat@data), + do.balanced = TRUE, + label.columns = FALSE) +``` + + +在scater包, 同样的是多个降维函数和多个可视化函数 + +```{r} +plotTSNE(sce_scater, + colour_by = "Biological_Condition" ) +plotPCA(sce_scater, + colour_by = "Biological_Condition" ) +plotPCA(sce_scater, ncomponents = 4, + colour_by = "Biological_Condition" ) + +plotDiffusionMap(sce_scater, + colour_by = "Biological_Condition" ) + +``` + +在monocle包,有趣的是降维后必须先分群才能进行可视化。 + +```{r} +sce_monocle <- clusterCells(sce_monocle, num_clusters = 4) +plot_cell_clusters(sce_monocle, 1, 2, color = "Biological_Condition") +``` + + +### step8: 多种聚类算法 + +聚类后就可以根据阈值进行分群 + +在 `seurat` 包, **重点**: 需要搞懂这里的 resolution 参数,而且降维算法可以选PCA或者ICA , 分群算法也可以选择。 + +```{r} +sce_seurat <- FindClusters(object = sce_seurat, + reduction.type = "pca", + dims.use = 1:10, force.recalc = T, + resolution = 0.9, print.output = 0, + save.SNN = TRUE) +PrintFindClustersParams(sce_seurat) +table(sce_seurat@meta.data$res.0.9) +``` + +在`scater`包, 并没有包装聚类函数,而且辅助其它R包,或者R基础函数: + +- SC3 +- pcaReduce +- tSNE + kmeans +- SNN-Cliq +- SINCERA + +最常用的是无缝连接 `SC3` 包: + +```{r} +library(SC3) # BiocManager::install('SC3') +sce <- sc3_estimate_k(sce_scater) +metadata(sce)$sc3$k_estimation +rowData(sce)$feature_symbol <- rownames(rowData(sce)) +``` + +一步运行sc3的所有分析, 相当耗费时间 + +这里`kn`表示的预估聚类数, 考虑到数据集是已知的,我们强行设置为4组, 具体数据要具体考虑。 + +```{r} +# 耗费时间 +kn <- 4 ## 这里可以选择 3:5 看多种分类结果。 +sc3_cluster <- "sc3_4_clusters" +Sys.time() +sce <- sc3(sce, ks = kn, biology = TRUE) +Sys.time() +``` + + + +在`monocle`包,如下: + +```{r} +sce_monocle <- clusterCells(sce_monocle, num_clusters = 4) +plot_cell_clusters(sce_monocle) +plot_cell_clusters(sce_monocle, 1, 2, color = "Biological_Condition") +``` + +可供选择的聚类算法包括:` densityPeak, louvian and DDRTree` + + +### step9: 聚类后找每个细胞亚群的标志基因 + +在`seurat`包, + +```{r} +sce.markers <- FindAllMarkers(object = sce_seurat, only.pos = TRUE, min.pct = 0.25, + thresh.use = 0.25) +# DT::datatable(sce.markers) +library(dplyr) +sce.markers %>% group_by(cluster) %>% top_n(2, avg_logFC) +top10 <- sce.markers %>% group_by(cluster) %>% top_n(10, avg_logFC) + +``` + +可以单独可视化这些标志基因。 + +```{r fig.width=10, fig.height=10} +# setting slim.col.label to TRUE will print just the cluster IDS instead of# every cell name +DoHeatmap(object = sce_seurat, + genes.use = top10$gene, + slim.col.label = TRUE, + remove.key = TRUE) + +``` + + +在`scater`包,可视化展示部分, kn就是聚类数,就能看到标志基因了。 + +热图: 比较先验分类和SC3的聚类的一致性 + +```{r} +sc3_plot_consensus(sce, k = kn, show_pdata = c("Biological_Condition",sc3_cluster)) +``` + +展示表达量信息 + +```{r} +sc3_plot_expression(sce, k = kn, show_pdata = c("Biological_Condition",sc3_cluster)) +``` + +展示可能的标记基因 + +```{r} +sc3_plot_markers(sce, k = kn, show_pdata = c("Biological_Condition",sc3_cluster)) +``` + +在PCA上展示SC3的聚类结果 + +```{r} +plotPCA(sce, colour_by = sc3_cluster ) +# sc3_interactive(sce) +``` + + + +在`monocle`包,应该是没有找标志基因的函数,但是有推断差异基因的函数,而且它多一个功能,就是进行发育轨迹的推断。 +推断发育轨迹才是monocle的拿手好戏,也是它荣升为3大R包的核心竞争力。 + + +第一步: 挑选合适的基因. 有多个方法,例如提供已知的基因集,这里选取统计学显著的差异基因列表 + +```{r} +Sys.time() +cds=sce_monocle +diff_test_res <- differentialGeneTest(cds, + fullModelFormulaStr = "~Biological_Condition") +Sys.time() +# 可以看到运行耗时 + +# Select genes that are significant at an FDR < 10% +sig_genes <- subset(diff_test_res, qval < 0.1) +head(sig_genes[,c("gene_short_name", "pval", "qval")] ) + +ordering_genes <- row.names (subset(diff_test_res, qval < 0.01)) +cds <- setOrderingFilter(cds, ordering_genes) +plot_ordering_genes(cds) + +``` + +第二步: 降维。降维的目的是为了更好的展示数据。函数里提供了很多种方法, 不同方法的最后展示的图都不太一样, 其中“DDRTree”是Monocle2使用的默认方法 + +```{r} +cds <- reduceDimension(cds, max_components = 2, + method = 'DDRTree') +``` + +第三步: 对细胞进行排序 + +```{r} +cds <- orderCells(cds) + +``` + +最后两个可视化函数,对结果进行可视化 + +```{r} +plot_cell_trajectory(cds, color_by = "Biological_Condition") +``` + +可以很明显看到细胞的发育轨迹,正好对应 pluripotent stem cells 分化而成的 neural progenitor cells (“NPC”) ,还有 “GW16” and “GW21” ,“GW21+3” 这时间进展的孕期细胞。 + + +`plot_genes_in_pseudotime`可以展现marker基因 + +最开始挑选合适基因,除了我们演示的找统计学显著的差异表达基因这个方法外,还可以于已知的标记基因,主要是基于生物学背景知识。 + +如果是已知基因列表,就需要自己读取外包文件,导入R里面来分析。 + + +### step10: 继续分类 + +只需要挑选前面步骤分类好的细胞,去表达矩阵里面进行筛选细胞,重新走一遍上面的流程即可。 + + +### 一些总结: + +首先是:seurat总结 + +`counts`矩阵进来后被包装为对象,方便操作。 + +然后一定要经过 `NormalizeData` 和 `ScaleData` 的操作 + +函数 `FindVariableGenes` 可以挑选适合进行下游分析的基因集。 + +函数 `RunPCA` 和 `RunTSNE` 进行降维 + + +函数 `FindClusters` 直接就分群了,非常方便 +函数 `FindAllMarkers` 可以对分群后各个亚群找标志基因。 + +函数 `FeaturePlot` 可以展示不同基因在所有细胞的表达量 +函数 `VlnPlot` 可以展示不同基因在不同分群的表达量差异情况 +函数 `DoHeatmap` 可以选定基因集后绘制热图 + + +## 使用M3Drop包 + +### 首先构建M3Drop需要的对象 + +```{r,message=F,warning=F} +library(M3Drop) +Normalized_data <- M3DropCleanData(counts, + labels = sample_ann$Biological_Condition , + is.counts=TRUE, min_detected_genes=2000) +dim(Normalized_data$data) +length(Normalized_data$labels) +class(Normalized_data) +str(Normalized_data) +``` + +这个包设计比较简单,并没有构建S4对象,只是一个简单的list而已。 + +### 统计学算法 Michaelis-Menten + +需要深入读该文章,了解其算法,这里略过,总之它对单细胞转录组的表达矩阵进行了一系列的统计检验。 + +```{r} +fits <- M3DropDropoutModels(Normalized_data$data) + +# Sum absolute residuals +data.frame(MM=fits$MMFit$SAr, Logistic=fits$LogiFit$SAr, + DoubleExpo=fits$ExpoFit$SAr) +# Sum squared residuals +data.frame(MM=fits$MMFit$SSr, Logistic=fits$LogiFit$SSr, + DoubleExpo=fits$ExpoFit$SSr) +``` + + +### 找差异基因 + +```{r} +DE_genes <- M3DropDifferentialExpression(Normalized_data$data, + mt_method="fdr", mt_threshold=0.01) +dim(DE_genes) +head(DE_genes) +``` + +这里是针对上面的统计结果来的 + +### 针对差异基因画热图 + +```{r fig.width=10, fig.height=10} +par(mar=c(1,1,1,1)) +heat_out <- M3DropExpressionHeatmap(DE_genes$Gene, Normalized_data$data, + cell_labels = Normalized_data$labels) +``` + +可视化了解一下找到的差异基因在不同的细胞类型的表达分布情况。 + +### 聚类 + +这里可以重新聚类后,针对自己找到的类别来分别找marker基因,不需要使用测试数据自带的表型信息。 + +```{r} +cell_populations <- M3DropGetHeatmapCellClusters(heat_out, k=4) +library("ROCR") +marker_genes <- M3DropGetMarkers(Normalized_data$data, cell_populations) +table(cell_populations,Normalized_data$labels) +``` + +### 每个类别的marker genes + +```{r} +head(marker_genes[marker_genes$Group==4,],20) +marker_genes[rownames(marker_genes)=="FOS",] +``` + +也可以针对这些 marker genes去画热图,当然,得根据AUC和P值来挑选符合要求的差异基因去绘图。 + +```{r fig.width=10, fig.height=10} +par(mar=c(1,1,1,1)) +choosed_marker_genes=as.character(unlist(lapply( + split(marker_genes,marker_genes$Group), + function(x) (rownames(head(x,20))) + ))) +heat_out <- M3DropExpressionHeatmap(choosed_marker_genes, + Normalized_data$data, + cell_labels = cell_populations) +``` + +如果遇到`Error in plot.new() : figure margins too large`报错,则单独将`heat_out`这行命令复制出来运行 + +## 对感兴趣基因集进行注释 + +通常是GO/KEGG等数据库,通常是超几何分布,GSEA,GSVA等算法。 + +拿到基因集后走我GitHub代码即可:https://github.com/jmzeng1314/GEO 简单的例子如下: + +```{r,eval=FALSE} +library(ggplot2) +library(clusterProfiler) +library(org.Hs.eg.db) +# 下面的 gene_up 是一个 entrez ID的向量,约 500左右的 自定义的基因集 +## 下面的 gene_all 也是一个 entrez ID的向量,约10000左右的背景基因,就我们的scRNA检测到的全部基因。 + ### over-representation test + kk.up <- enrichKEGG(gene = gene_up, + organism = 'hsa', + universe = gene_all, + pvalueCutoff = 0.9, + qvalueCutoff =0.9) + head(kk.up)[,1:6] + dotplot(kk.up );ggsave('kk.up.dotplot.png') +``` + + + + +## 其它单细胞R包 + +包括: + +- scran +- SINCERA +- SC3 + +不一一讲解,具体有需求,就仔细研读说明书,其实最后都是R语言熟练与否。 + +## 显示运行环境 + +```{r} +sessionInfo() +``` + + + + + + diff --git a/compare-4-packages/compare.Rproj b/compare-4-packages/compare.Rproj new file mode 100644 index 0000000..8e3c2eb --- /dev/null +++ b/compare-4-packages/compare.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/compare-4-packages/compare.html b/compare-4-packages/compare.html new file mode 100644 index 0000000..097fc66 --- /dev/null +++ b/compare-4-packages/compare.html @@ -0,0 +1,1369 @@ + + + + + + + + + + + + + + +compare 4 R packages for scRNAseq + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + +
+

安装并且加载必要的包

+

需要自行下载安装一些必要的R包,包括我们的测试数据集来源的R包scRNAseq ,以及4个单细胞转录组数据处理包!

+

因为大量学员在中国大陆,所以需要特别的R包安装方法,就是切换镜像后再下载R包。参考:http://www.bio-info-trainee.com/3727.html

+
options()$repos  ## 查看使用install.packages安装时的默认镜像
+
##     CRAN 
+## "@CRAN@"
+
options()$BioC_mirror ##查看使用bioconductor的默认镜像
+
## NULL
+
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/") ##指定镜像,这个是中国科技大学镜像
+options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/")) ##指定install.packages安装镜像,这个是清华镜像
+options()$repos 
+
##                                         CRAN 
+## "https://mirrors.tuna.tsinghua.edu.cn/CRAN/"
+
options()$BioC_mirror
+
## [1] "https://mirrors.ustc.edu.cn/bioc/"
+
if (!requireNamespace("BiocManager", quietly = TRUE))
+    install.packages("BiocManager") 
+if(!require('Seurat')){
+  BiocManager::install('Seurat',ask = F,update = F)
+}
+if(!require('scater')){
+  BiocManager::install(c( 'scater'),ask = F,update = F)
+}
+if(!require('monocle')){
+  BiocManager::install(c( 'monocle'),ask = F,update = F)
+}
+if(!require('scRNAseq')){
+  BiocManager::install(c( 'scRNAseq'),ask = F,update = F)
+}
+if(!require('SC3')){
+  BiocManager::install(c( 'SC3'),ask = F,update = F)
+}
+if(!require('M3Drop')){
+  BiocManager::install(c( 'M3Drop'),ask = F,update = F)
+}
+if(!require('ggpubr')){
+  BiocManager::install(c( 'ggpubr'),ask = F,update = F)
+}
+

安装成功后就可以加载R包:

+
rm(list = ls())  
+options(warn=-1)  
+suppressMessages(library(scater))
+suppressMessages(library(Seurat))
+suppressMessages(library(monocle))
+suppressMessages(library(scRNAseq)) 
+suppressMessages(library(SC3)) 
+suppressMessages(library(M3Drop)) 
+
+
+

简单了解数据集

+

这个包内置的是 Pollen et al. 2014 数据集,人类单细胞细胞,分成4类,分别是 pluripotent stem cells 分化而成的 neural progenitor cells (“NPC”) ,还有 “GW16” and “GW21” ,“GW21+3” 这种孕期细胞,理解这些需要一定的生物学背景知识,如果不感兴趣,可以略过。

+

这个数据集很出名,截止2019年1月已经有近400的引用了,后面的人开发R包算法都会在其上面做测试,本例子只使用了数据集的4种细胞类型而已,因为 scRNAseq 这个R包就提供了这些,完整的数据是 23730 features, 301 samples, 地址为https://hemberg-lab.github.io/scRNA.seq.datasets/human/tissues/ , 这个网站非常值得推荐,简直是一个宝藏。

+

这里面的表达矩阵是由 RSEM (Li and Dewey 2011) 软件根据 hg38 RefSeq transcriptome 得到的,总是130个文库,每个细胞测了两次,测序深度不一样(非常的不一样)

+
library(scRNAseq)
+## ----- Load Example Data -----
+data(fluidigm) 
+ct <- floor(assays(fluidigm)$rsem_counts)
+ct[1:4,1:4] 
+
##          SRR1275356 SRR1274090 SRR1275251 SRR1275287
+## A1BG              0          0          0          0
+## A1BG-AS1          0          0          0          0
+## A1CF              0          0          0          0
+## A2M               0          0          0         33
+
sample_ann <- as.data.frame(colData(fluidigm))
+DT::datatable(sample_ann,
+              
+            rownames= FALSE,extensions = c('Scroller'), 
+            options = list(  
+  pageLength = 10, 
+  lengthMenu = list(c(10, 50, 100,-1), c('10', '50','100', 'All')),
+  columnDefs = list(list(className = 'dt-center', targets = 0 :8)),
+  scrollX = TRUE,
+  fixedHeader = TRUE,
+  fixedColumns = TRUE ,
+  deferRender = TRUE 
+),
+filter = 'top',
+escape = FALSE
+)
+
+ +
+

先探索表型信息

+

前面说到,这个数据集是130个文库,每个细胞测了两次,测序深度不一样,这130个细胞,分成4类,分别是: pluripotent stem cells 分化而成的 neural progenitor cells (“NPC”) ,还有 “GW16” and “GW21” ,“GW21+3” 这种孕期细胞。

+

批量,粗略的看一看各个细胞的一些统计学指标的分布情况

+
library(ggpubr)
+box <- lapply(colnames(sample_ann[,1:19]),function(i) {
+    dat <-  sample_ann[,i,drop=F] 
+    dat$sample=rownames(dat)
+    dat$group='all cells'
+    ## 画boxplot 
+   p <- ggboxplot(dat, x = "group", y = i, 
+                add = "jitter" )
+ p
+})
+plot_grid(plotlist=box, ncol=5 )
+

+
# ggsave(file="stat_all_cells.pdf")
+

很明显,他们可以有根据来进行分组,这里不再演示。 不过通常的文章并不会考虑如此多的细节,这里重点是批量,代码技巧非常值得你们学校。

+

因为进行了简单探索,对表型数据就有了把握,接下来可以进行一定程度的过滤,因为细节太多,这里重点是批量,代码技巧非常值得你们学校。

+
pa <- colnames(sample_ann[,c(1:9,11:16,18,19)])
+tf <- lapply(pa,function(i) {
+ # i=pa[1]
+  dat <-  sample_ann[,i]  
+  dat <- abs(log10(dat))
+  fivenum(dat)
+  (up <- mean(dat)+2*sd(dat))
+  (down <- mean(dat)- 2*sd(dat) ) 
+  valid <- ifelse(dat > down & dat < up, 1,0 ) 
+})
+
+tf <- do.call(cbind,tf)
+choosed_cells <- apply(tf,1,function(x) all(x==1))
+table(sample_ann$Biological_Condition)
+
## 
+##   GW16   GW21 GW21+3    NPC 
+##     52     16     32     30
+
sample_ann=sample_ann[choosed_cells,]
+table(sample_ann$Biological_Condition)
+
## 
+##   GW16   GW21 GW21+3    NPC 
+##     36     11     23     29
+
ct <- ct[,choosed_cells]
+
+
+

再探索基因表达情况

+
ct[1:4,1:4] 
+
##          SRR1274090 SRR1275287 SRR1275364 SRR1275269
+## A1BG              0          0          0          0
+## A1BG-AS1          0          0          0          0
+## A1CF              0          0          0          0
+## A2M               0         33          0         51
+
counts <- ct
+fivenum(apply(counts,1,function(x) sum(x>0) ))
+
##      A1CF     OR8G1 LINC01003    MRPS36     YWHAZ 
+##         0         0         4        26        99
+
boxplot(apply(counts,1,function(x) sum(x>0) ))
+

+
fivenum(apply(counts,2,function(x) sum(x>0) ))
+
## SRR1275365 SRR1275345 SRR1275248 SRR1275273 SRR1274125 
+##     1566.0     3043.0     4002.0     5706.5     8024.0
+
hist(apply(counts,2,function(x) sum(x>0) ))
+

+
choosed_genes=apply(counts,1,function(x) sum(x>0) )>0
+table(choosed_genes)
+
## choosed_genes
+## FALSE  TRUE 
+##  9496 16759
+
counts <- counts[choosed_genes,]
+
+
+
+

单细胞转录组分析流程介绍

+

注意,这里的 seurat是2.x版本,同理,monocle也是2版本。

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
用法Seuratscatermonocle
创建R包要求的对象CreateSeuratObjectSingleCellExperimentnewCellDataSet
QC and selecting cell创建矩阵的同时,可以选择过滤参数min.cell,min.gene等。还有FilterCells函数可以去除不合格的细胞。calculateQCMetrics()函数,其中的feature_controls参数可以指定过滤指标,然后有一系列的可视化函数用基础R函数进行初步过滤,还可以用detectGenes()函数加上subset()过滤
表达量的标准化或者归一化NormalizeDatacalculateCPM()等系列函数estimateSizeFactors()还有estimateDispersions
寻找重要的基因FindVariableGenes没有看到专门的函数,可以借用R基础函数differentialGeneTest()函数
去除干扰因素ScaleData借用limma包的 removeBatchEffect 函数去除干扰因素的功能被包装在降维函数中
降维RunPCA或者RunTSNErunPCA或者runTSNE,runDiffusionMapreduceDimension函数,可以选择多种参数
降维后可视化VizPCA和PCElbowPlot;PCAPlot或者TSNEPlotplotPCA和plotTSNE等等plot_cell_trajectory()或plot_genes_in_pseudotime
分类群clusterFindClusters并没有包装聚类函数,而且辅助其它R包,或者R基础函数clusterCells
聚类后找每个细胞亚群的标志基因FindMarkers和FindAllMarkers函数借助SC3包newCellTypeHierarchy classifyCells
calculateMarkerSpecificity
+
+

step1: 创建对象

+

首先为 scater 包创建对象

+
pheno_data <- as.data.frame(colData(fluidigm))
+ct <- floor(assays(fluidigm)$rsem_counts)
+## 这里需要把Pollen的表达矩阵做成我们的 scater 要求的对象
+# data("sc_example_counts")
+# data("sc_example_cell_info") 
+# 你也可以尝试该R包自带的数据集。
+# 参考 https://bioconductor.org/packages/release/bioc/vignettes/scater/inst/doc/vignette-intro.R
+sce_scater <- SingleCellExperiment(
+    assays = list(counts = ct), 
+    colData = pheno_data
+    )
+sce_scater
+
## class: SingleCellExperiment 
+## dim: 26255 130 
+## metadata(0):
+## assays(1): counts
+## rownames(26255): A1BG A1BG-AS1 ... ZZEF1 ZZZ3
+## rowData names(0):
+## colnames(130): SRR1275356 SRR1274090 ... SRR1275366 SRR1275261
+## colData names(28): NREADS NALIGNED ... Cluster1 Cluster2
+## reducedDimNames(0):
+## spikeNames(0):
+
# 后面所有的分析都是基于 sce_scater 这个变量
+# 是一个 SingleCellExperiment 对象,被很多单细胞R包采用。
+

然后为 seurat 包创建对象

+
meta <- as.data.frame(colData(fluidigm))
+ct <- floor(assays(fluidigm)$rsem_counts)
+counts <- ct
+sce_seurat <- CreateSeuratObject(raw.data = counts, 
+                             meta.data =meta,
+                             min.cells = 3, 
+                             min.genes = 200, 
+                             project = "Pollen")
+sce_seurat
+
## An object of class seurat in project Pollen 
+##  14656 genes across 130 samples.
+
## 后续所有的分析都基于这个 sce_seurat 变量,是一个对象 
+

最后为 monocle 包创建对象

+
ct <- floor(assays(fluidigm)$rsem_counts)
+gene_ann <- data.frame(
+  gene_short_name = row.names(ct), 
+  row.names = row.names(ct)
+)
+sample_ann=as.data.frame(colData(fluidigm))
+pd <- new("AnnotatedDataFrame",
+          data=sample_ann)
+fd <- new("AnnotatedDataFrame",
+          data=gene_ann)
+sce_monocle <- newCellDataSet(
+  ct, 
+  phenoData = pd,
+  featureData =fd,
+  expressionFamily = negbinomial.size(),
+  lowerDetectionLimit=1)
+sce_monocle
+
## CellDataSet (storageMode: environment)
+## assayData: 26255 features, 130 samples 
+##   element names: exprs 
+## protocolData: none
+## phenoData
+##   sampleNames: SRR1275356 SRR1274090 ... SRR1275261 (130 total)
+##   varLabels: NREADS NALIGNED ... Size_Factor (29 total)
+##   varMetadata: labelDescription
+## featureData
+##   featureNames: A1BG A1BG-AS1 ... ZZZ3 (26255 total)
+##   fvarLabels: gene_short_name
+##   fvarMetadata: labelDescription
+## experimentData: use 'experimentData(object)'
+## Annotation:
+

再次回顾一下这3个对象。

+
sce_seurat
+
## An object of class seurat in project Pollen 
+##  14656 genes across 130 samples.
+
sce_scater
+
## class: SingleCellExperiment 
+## dim: 26255 130 
+## metadata(0):
+## assays(1): counts
+## rownames(26255): A1BG A1BG-AS1 ... ZZEF1 ZZZ3
+## rowData names(0):
+## colnames(130): SRR1275356 SRR1274090 ... SRR1275366 SRR1275261
+## colData names(28): NREADS NALIGNED ... Cluster1 Cluster2
+## reducedDimNames(0):
+## spikeNames(0):
+
sce_monocle
+
## CellDataSet (storageMode: environment)
+## assayData: 26255 features, 130 samples 
+##   element names: exprs 
+## protocolData: none
+## phenoData
+##   sampleNames: SRR1275356 SRR1274090 ... SRR1275261 (130 total)
+##   varLabels: NREADS NALIGNED ... Size_Factor (29 total)
+##   varMetadata: labelDescription
+## featureData
+##   featureNames: A1BG A1BG-AS1 ... ZZZ3 (26255 total)
+##   fvarLabels: gene_short_name
+##   fvarMetadata: labelDescription
+## experimentData: use 'experimentData(object)'
+## Annotation:
+
+
+

step2: 质量控制

+

seurat 包,

+
mito.genes <- grep(pattern = "^MT-", 
+                   x = rownames(sce_seurat@data), 
+                   value = TRUE)
+# 恰好这个例子的表达矩阵里面没有线粒体基因
+percent.mito <- Matrix::colSums(sce_seurat@raw.data[mito.genes, ]) / Matrix::colSums(sce_seurat@raw.data)
+## 也可以加入很多其它属性,比如 ERCC 等。
+
+# AddMetaData adds columns to object@meta.data, and is a great place to stash QC stats
+sce_seurat <- AddMetaData(object = sce_seurat, 
+                          metadata = percent.mito,
+                         col.name = "percent.mito")
+VlnPlot(object = sce_seurat, 
+        features.plot = c("nGene", "nUMI", "percent.mito"), 
+        group.by = 'Biological_Condition', nCol = 3)
+

+
GenePlot(object = sce_seurat, gene1 = "nUMI", gene2 = "nGene")
+

+
CellPlot(sce_seurat,
+         sce_seurat@cell.names[3],
+         sce_seurat@cell.names[4],
+         do.ident = FALSE)
+

+
# FilterCells函数
+# sce_seurat
+# sce_seurat <- FilterCells(object = sce_seurat, 
+#                     subset.names = c("nGene", "percent.mito"), 
+#                     low.thresholds = c(200, -Inf), 
+#                     high.thresholds = c(2500, 0.05))
+# sce_seurat
+

在scater包,

+

这里仅仅是演示 scater 包最简单的质量控制代码,详细代码见:https://bioconductor.org/packages/release/bioc/vignettes/scater/inst/doc/vignette-qc.html

+
genes <- rownames(rowData(sce_scater))
+genes[grepl('^MT-',genes)]
+
## character(0)
+
genes[grepl('^ERCC-',genes)] 
+
## character(0)
+
exprs(sce_scater) <- log2(
+    calculateCPM(sce_scater ) + 1)
+keep_feature <- rowSums(exprs(sce_scater) > 0) > 0
+table(keep_feature)
+
## keep_feature
+## FALSE  TRUE 
+##  9159 17096
+
sce_scater <- sce_scater[keep_feature,]
+sce_scater 
+
## class: SingleCellExperiment 
+## dim: 17096 130 
+## metadata(0):
+## assays(2): counts logcounts
+## rownames(17096): A1BG A1BG-AS1 ... ZZEF1 ZZZ3
+## rowData names(0):
+## colnames(130): SRR1275356 SRR1274090 ... SRR1275366 SRR1275261
+## colData names(28): NREADS NALIGNED ... Cluster1 Cluster2
+## reducedDimNames(0):
+## spikeNames(0):
+
sce_scater <- calculateQCMetrics(sce_scater)
+plotHighestExprs(sce_scater, exprs_values = "counts")
+

+
plotExprsFreqVsMean(sce_scater)
+

+

monocle包,

+
sce_monocle
+
## CellDataSet (storageMode: environment)
+## assayData: 26255 features, 130 samples 
+##   element names: exprs 
+## protocolData: none
+## phenoData
+##   sampleNames: SRR1275356 SRR1274090 ... SRR1275261 (130 total)
+##   varLabels: NREADS NALIGNED ... Size_Factor (29 total)
+##   varMetadata: labelDescription
+## featureData
+##   featureNames: A1BG A1BG-AS1 ... ZZZ3 (26255 total)
+##   fvarLabels: gene_short_name
+##   fvarMetadata: labelDescription
+## experimentData: use 'experimentData(object)'
+## Annotation:
+
## 起初是 
+sce_monocle <- detectGenes(sce_monocle, min_expr = 0.1)
+print(head(fData(sce_monocle)))
+
##          gene_short_name num_cells_expressed
+## A1BG                A1BG                  10
+## A1BG-AS1        A1BG-AS1                   2
+## A1CF                A1CF                   1
+## A2M                  A2M                  21
+## A2M-AS1          A2M-AS1                   3
+## A2ML1              A2ML1                   9
+
expressed_genes <- row.names(subset(fData(sce_monocle),
+                                    num_cells_expressed >= 5))
+length(expressed_genes)
+
## [1] 13385
+
sce_monocle <- sce_monocle[expressed_genes,]
+sce_monocle
+
## CellDataSet (storageMode: environment)
+## assayData: 13385 features, 130 samples 
+##   element names: exprs 
+## protocolData: none
+## phenoData
+##   sampleNames: SRR1275356 SRR1274090 ... SRR1275261 (130 total)
+##   varLabels: NREADS NALIGNED ... num_genes_expressed (30 total)
+##   varMetadata: labelDescription
+## featureData
+##   featureNames: A1BG A2M ... ZZZ3 (13385 total)
+##   fvarLabels: gene_short_name num_cells_expressed
+##   fvarMetadata: labelDescription
+## experimentData: use 'experimentData(object)'
+## Annotation:
+
# 过滤基因后是  
+tmp=pData(sce_monocle)
+fivenum(tmp[,1])
+
## [1]    91616   232899   892209  8130850 14477100
+
fivenum(tmp[,30])
+
## [1] 1418.0 2961.0 3841.5 5381.0 8221.0
+
# 这里并不需要过滤细胞,如果想过滤,就自己摸索阈值,然后挑选细胞即可。
+# 这里留下来了所有的细胞。
+valid_cells <- row.names(pData(sce_monocle) )
+sce_monocle <- sce_monocle[,valid_cells]
+sce_monocle 
+
## CellDataSet (storageMode: environment)
+## assayData: 13385 features, 130 samples 
+##   element names: exprs 
+## protocolData: none
+## phenoData
+##   sampleNames: SRR1275356 SRR1274090 ... SRR1275261 (130 total)
+##   varLabels: NREADS NALIGNED ... num_genes_expressed (30 total)
+##   varMetadata: labelDescription
+## featureData
+##   featureNames: A1BG A2M ... ZZZ3 (13385 total)
+##   fvarLabels: gene_short_name num_cells_expressed
+##   fvarMetadata: labelDescription
+## experimentData: use 'experimentData(object)'
+## Annotation:
+
+
+

step3: 表达量的标准化和归一化

+

在 seurat包需要先假定平均细胞测序文库大小,这里是 10000

+
sce_seurat <- NormalizeData(object = sce_seurat, 
+                     normalization.method = "LogNormalize", 
+                     scale.factor = 10000,
+                     display.progress = F)
+

在scater包:

+
assays(sce_scater)
+
## List of length 2
+## names(2): counts logcounts
+
counts(sce_scater) <- assays(sce_scater)$counts
+norm_exprs(sce_scater) <- log2(calculateCPM(sce_scater, use_size_factors = FALSE) + 1)
+
+stand_exprs(sce_scater) <- log2(calculateCPM(sce_scater, use_size_factors = FALSE) + 1)
+
+tpm(sce_scater) <- calculateTPM(sce_scater, effective_length = 5e4)
+
+cpm(sce_scater) <- calculateCPM(sce_scater, use_size_factors = FALSE)
+
+assays(sce_scater)
+
## List of length 6
+## names(6): counts logcounts norm_exprs stand_exprs tpm cpm
+

详细理论见:https://hemberg-lab.github.io/scRNA.seq.course/cleaning-the-expression-matrix.html#normalization-theory

+
    +
  • 7.8.1 Raw
  • +
  • 7.8.2 CPM
  • +
  • 7.8.3 Size-factor (RLE)
  • +
  • 7.8.4 Upperquantile
  • +
  • 7.8.5 TMM
  • +
  • 7.8.6 scran
  • +
  • 7.8.7 Downsampling
  • +
+

monocle包,

+
colnames(phenoData(sce_monocle)@data)
+
##  [1] "NREADS"                       "NALIGNED"                    
+##  [3] "RALIGN"                       "TOTAL_DUP"                   
+##  [5] "PRIMER"                       "INSERT_SZ"                   
+##  [7] "INSERT_SZ_STD"                "COMPLEXITY"                  
+##  [9] "NDUPR"                        "PCT_RIBOSOMAL_BASES"         
+## [11] "PCT_CODING_BASES"             "PCT_UTR_BASES"               
+## [13] "PCT_INTRONIC_BASES"           "PCT_INTERGENIC_BASES"        
+## [15] "PCT_MRNA_BASES"               "MEDIAN_CV_COVERAGE"          
+## [17] "MEDIAN_5PRIME_BIAS"           "MEDIAN_3PRIME_BIAS"          
+## [19] "MEDIAN_5PRIME_TO_3PRIME_BIAS" "sample_id.x"                 
+## [21] "Lane_ID"                      "LibraryName"                 
+## [23] "avgLength"                    "spots"                       
+## [25] "Biological_Condition"         "Coverage_Type"               
+## [27] "Cluster1"                     "Cluster2"                    
+## [29] "Size_Factor"                  "num_genes_expressed"
+
sce_monocle <- estimateSizeFactors(sce_monocle)
+sce_monocle <- estimateDispersions(sce_monocle)
+colnames(phenoData(sce_monocle)@data)
+
##  [1] "NREADS"                       "NALIGNED"                    
+##  [3] "RALIGN"                       "TOTAL_DUP"                   
+##  [5] "PRIMER"                       "INSERT_SZ"                   
+##  [7] "INSERT_SZ_STD"                "COMPLEXITY"                  
+##  [9] "NDUPR"                        "PCT_RIBOSOMAL_BASES"         
+## [11] "PCT_CODING_BASES"             "PCT_UTR_BASES"               
+## [13] "PCT_INTRONIC_BASES"           "PCT_INTERGENIC_BASES"        
+## [15] "PCT_MRNA_BASES"               "MEDIAN_CV_COVERAGE"          
+## [17] "MEDIAN_5PRIME_BIAS"           "MEDIAN_3PRIME_BIAS"          
+## [19] "MEDIAN_5PRIME_TO_3PRIME_BIAS" "sample_id.x"                 
+## [21] "Lane_ID"                      "LibraryName"                 
+## [23] "avgLength"                    "spots"                       
+## [25] "Biological_Condition"         "Coverage_Type"               
+## [27] "Cluster1"                     "Cluster2"                    
+## [29] "Size_Factor"                  "num_genes_expressed"
+
+
+

step4: 去除干扰因素

+

在 seurat 包,去除一些文库大小,线粒体基因含量,ERCC含量等因素的功能被包装在 ScaleData 函数里面,前提是需要被去除的因素提供 AddMetaData 函数添加到了对象。

+
sce_seurat <- ScaleData(object = sce_seurat, 
+                 vars.to.regress = c("nUMI"),
+                 display.progress = F)
+## 所有放在 vars.to.regress 参数的变量均可以被去除
+

在scater包, 主要是可视化那些干扰因素:

+
sce_scater <- runPCA(sce_scater)
+# colnames(colData(sce_scater))
+plotPCA(
+    sce_scater, 
+    colour_by = "Biological_Condition",
+    size_by = "NALIGNED"
+)
+

+
# 还有 plotQC 函数。
+

如果需要去除干扰因素,可以借用limma包的 removeBatchEffect 函数

+
library(limma)
+batch <- rep(1:2, each=20)
+corrected <- removeBatchEffect(logcounts(example_sce), block=batch)
+assay(example_sce, "corrected_logcounts") <- corrected
+

在monocle包,去除干扰因素的功能被包装在降维函数中,示例如下:

+
# 放在 residualModelFormulaStr 里面的是需要去除的
+cds <- reduceDimension(cds, max_components = 2, num_dim = 2,
+                        reduction_method = 'tSNE',
+                        residualModelFormulaStr = "~Biological_Condition + num_genes_expressed",
+                        verbose = T)
+cds <- clusterCells(cds, num_clusters = 2)
+plot_cell_clusters(cds, 1, 2, color = "Biological_Condition")
+
+## 上面去除了Biological_Condition,所以接下来聚类它们就被打散了。
+
+
+

step5: 判断重要的基因

+

寻找波动比较明显的基因,后续用这些基因而非全部基因进行分析,主要为了降低计算量。

+

seurat 包,必须要先 normalization ,然后才能进行FindVariableGenes 计算,代码如下:

+
sce_seurat <- FindVariableGenes(object = sce_seurat, 
+                                mean.function = ExpMean,
+                                dispersion.function = LogVMR, 
+                         x.low.cutoff = 0.0125, 
+                         x.high.cutoff = 3, 
+                         y.cutoff = 0.5)
+

+
# 通过调整参数可以得到不同数量的 var.genes
+length(sce_seurat@var.genes)
+
## [1] 4944
+

scater包, 没有看到专门的函数,可以借用R基础函数。

+

monocle 包中,同样也不是所有的基因都有作用,所以先进行挑选,合适的基因才在后续分析中用来降维聚类。

+
disp_table <- dispersionTable(sce_monocle)
+# 也可以先挑选差异基因
+unsup_clustering_genes <- subset(disp_table, mean_expression >= 0.1)
+dim(unsup_clustering_genes)
+
## [1] 12911     4
+
sce_monocle <- setOrderingFilter(sce_monocle,
+                                 unsup_clustering_genes$gene_id)
+plot_ordering_genes(sce_monocle) 
+

+
plot_pc_variance_explained(sce_monocle, return_all = F) 
+

+
# norm_method='log'
+

后面做降维的时候的 num_dim 参数选择基于上面的PCA图

+
+
+

step6: 多种降维算法

+

seurat 包, 降维之前必须要先 Run ScaleData() , 每个降维算法都被单独包装为函数了。

+
sce_seurat <- RunPCA(object = sce_seurat, 
+              pc.genes = sce_seurat@var.genes, 
+              do.print = TRUE, 
+              pcs.print = 1:5, 
+              genes.print = 5)
+
## [1] "PC1"
+## [1] "MLLT11"    "KIDINS220" "SLA"       "CALM1"     "FAM110B"  
+## [1] ""
+## [1] "HMGB2"  "LIX1"   "LDHA"   "TRIM59" "ENO1"  
+## [1] ""
+## [1] ""
+## [1] "PC2"
+## [1] "CENPF"    "SHISA2"   "CDK1"     "HIST1H4C" "BARD1"   
+## [1] ""
+## [1] "ECSCR"  "MYCT1"  "ELK3"   "IMPG2"  "ADGRF5"
+## [1] ""
+## [1] ""
+## [1] "PC3"
+## [1] "ADGRV1"  "GPX3"    "FAM107A" "HOPX"    "PTN"    
+## [1] ""
+## [1] "BCAT1"  "EFNA5"  "DLK1"   "TUBA1C" "CXADR" 
+## [1] ""
+## [1] ""
+## [1] "PC4"
+## [1] "S100A6"   "LOX"      "ADAMTS16" "CA2"      "DKK3"    
+## [1] ""
+## [1] "FAM50A" "DDX54"  "ARFIP2" "IPP"    "PXMP2" 
+## [1] ""
+## [1] ""
+## [1] "PC5"
+## [1] "ASB16-AS1"  "CENPL"      "ATP1A1-AS1" "CHD9"       "KLHL31"    
+## [1] ""
+## [1] "CD44"  "CA2"   "SYNJ2" "LRP10" "TTC38"
+## [1] ""
+## [1] ""
+
sce_seurat@dr
+
## $pca
+## A dimensional reduction object with key PC 
+##  Number of dimensions: 20 
+##  Projected dimensional reduction calculated: FALSE 
+##  Jackstraw run: FALSE
+
tmp <- sce_seurat@dr$pca@gene.loadings
+sce_seurat <- RunTSNE(object = sce_seurat, 
+               dims.use = 1:10, 
+               do.fast = TRUE, 
+               perplexity=10)
+sce_seurat@dr 
+
## $pca
+## A dimensional reduction object with key PC 
+##  Number of dimensions: 20 
+##  Projected dimensional reduction calculated: FALSE 
+##  Jackstraw run: FALSE 
+## 
+## $tsne
+## A dimensional reduction object with key tSNE_ 
+##  Number of dimensions: 2 
+##  Projected dimensional reduction calculated: FALSE 
+##  Jackstraw run: FALSE
+

scater包,

+
sce <- runPCA(sce_scater)
+# 这里并没有进行任何基因的挑选,就直接进行了PCA,与 seurat包不一样。
+reducedDimNames(sce) 
+
## [1] "PCA"
+
sce <- runPCA(sce, ncomponents=20)
+# Perplexity of 10 just chosen here arbitrarily. 
+set.seed(1000)
+# 这里的这个 perplexity 参数很重要
+sce <- runTSNE(sce, perplexity=30)
+sce <- runDiffusionMap(sce)
+reducedDimNames(sce)
+
## [1] "PCA"          "TSNE"         "DiffusionMap"
+
sce_scater=sce
+

monocle包,降维函数就是 reduceDimension , 它包装这一个去除干扰因素的功能, 可供选择的降维算法包括: "DDRTree", "ICA", "tSNE", "SimplePPT", "L1-graph", "SGL-tree"

+
# 放在 residualModelFormulaStr 参数里面的是需要去除的
+sce_monocle <- reduceDimension(sce_monocle, 
+                               max_components = 2, num_dim = 2,
+                        reduction_method = 'tSNE', 
+                        verbose = T)
+
+
+

step7: 可视化降维结果

+

在 seurat 包, 两个降维算法被单独包装为两个函数,所以可视化也是两个函数。

+
sce_seurat@dr 
+
## $pca
+## A dimensional reduction object with key PC 
+##  Number of dimensions: 20 
+##  Projected dimensional reduction calculated: FALSE 
+##  Jackstraw run: FALSE 
+## 
+## $tsne
+## A dimensional reduction object with key tSNE_ 
+##  Number of dimensions: 2 
+##  Projected dimensional reduction calculated: FALSE 
+##  Jackstraw run: FALSE
+
PCAPlot(sce_seurat, dim.1 = 1, dim.2 = 2,
+        group.by = 'Biological_Condition')
+

+
TSNEPlot(sce_seurat,group.by = 'Biological_Condition')
+

+
VizPCA( sce_seurat, pcs.use = 1:2)
+

+
PCElbowPlot(object = sce_seurat)
+

+
sce_seurat <- ProjectPCA(sce_seurat, do.print = FALSE)
+PCHeatmap(object = sce_seurat, 
+          pc.use = 1, 
+          cells.use = ncol(sce_seurat@data), 
+          do.balanced = TRUE, 
+          label.columns = FALSE)
+

+

在scater包, 同样的是多个降维函数和多个可视化函数

+
plotTSNE(sce_scater,  
+         colour_by = "Biological_Condition" )
+

+
plotPCA(sce_scater, 
+        colour_by = "Biological_Condition" )
+

+
plotPCA(sce_scater, ncomponents = 4,  
+        colour_by = "Biological_Condition" )
+

+
plotDiffusionMap(sce_scater,  
+                 colour_by = "Biological_Condition" )
+

+

在monocle包,有趣的是降维后必须先分群才能进行可视化。

+
sce_monocle <- clusterCells(sce_monocle, num_clusters = 4)
+
## Distance cutoff calculated to 0.5035647
+
plot_cell_clusters(sce_monocle, 1, 2, color = "Biological_Condition")
+

+
+
+

step8: 多种聚类算法

+

聚类后就可以根据阈值进行分群

+

seurat 包, 重点: 需要搞懂这里的 resolution 参数,而且降维算法可以选PCA或者ICA , 分群算法也可以选择。

+
sce_seurat <- FindClusters(object = sce_seurat, 
+                    reduction.type = "pca", 
+                    dims.use = 1:10, force.recalc = T,
+                    resolution = 0.9, print.output = 0,
+                    save.SNN = TRUE)
+PrintFindClustersParams(sce_seurat)
+
## Parameters used in latest FindClusters calculation run on: 2019-03-21 17:35:14
+## =============================================================================
+## Resolution: 0.9
+## -----------------------------------------------------------------------------
+## Modularity Function    Algorithm         n.start         n.iter
+##      1                   1                 100             10
+## -----------------------------------------------------------------------------
+## Reduction used          k.param          prune.SNN
+##      pca                 30                0.0667
+## -----------------------------------------------------------------------------
+## Dims used in calculation
+## =============================================================================
+## 1 2 3 4 5 6 7 8 9 10
+
table(sce_seurat@meta.data$res.0.9)
+
## 
+##  0  1  2 
+## 60 36 34
+

scater包, 并没有包装聚类函数,而且辅助其它R包,或者R基础函数:

+
    +
  • SC3
  • +
  • pcaReduce
  • +
  • tSNE + kmeans
  • +
  • SNN-Cliq
  • +
  • SINCERA
  • +
+

最常用的是无缝连接 SC3 包:

+
library(SC3) # BiocManager::install('SC3')
+sce <- sc3_estimate_k(sce_scater)
+metadata(sce)$sc3$k_estimation
+
## [1] 24
+
rowData(sce)$feature_symbol <- rownames(rowData(sce))
+

一步运行sc3的所有分析, 相当耗费时间

+

这里kn表示的预估聚类数, 考虑到数据集是已知的,我们强行设置为4组, 具体数据要具体考虑。

+
# 耗费时间
+kn <- 4 ## 这里可以选择 3:5 看多种分类结果。
+sc3_cluster <- "sc3_4_clusters"
+Sys.time()
+
## [1] "2019-03-21 17:35:14 CST"
+
sce <- sc3(sce, ks = kn, biology = TRUE)
+
Sys.time()
+
## [1] "2019-03-21 17:36:27 CST"
+

monocle包,如下:

+
sce_monocle <- clusterCells(sce_monocle, num_clusters = 4)
+
## Distance cutoff calculated to 0.5035647
+
plot_cell_clusters(sce_monocle)
+

+
plot_cell_clusters(sce_monocle, 1, 2, color = "Biological_Condition")
+

+

可供选择的聚类算法包括:densityPeak, louvian and DDRTree

+
+
+

step9: 聚类后找每个细胞亚群的标志基因

+

seurat包,

+
sce.markers <- FindAllMarkers(object = sce_seurat, only.pos = TRUE, min.pct = 0.25, 
+                              thresh.use = 0.25)
+# DT::datatable(sce.markers)
+library(dplyr)
+sce.markers %>% group_by(cluster) %>% top_n(2, avg_logFC)
+
## # A tibble: 6 x 7
+## # Groups:   cluster [3]
+##      p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene     
+##      <dbl>     <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>    
+## 1 1.01e- 5      1.84 0.467 0.114  1.48e- 1 0       ERBB4    
+## 2 7.88e- 3      1.71 0.3   0.114  1.00e+ 0 0       PDZRN3   
+## 3 2.30e-22      2.43 0.806 0      3.37e-18 1       DLK1     
+## 4 1.94e-18      2.47 0.917 0.202  2.84e-14 1       BCAT1    
+## 5 2.91e-14      2.23 0.882 0.292  4.26e-10 2       MEF2C    
+## 6 6.21e- 7      1.86 0.588 0.229  9.10e- 3 2       LINC00643
+
top10 <- sce.markers %>% group_by(cluster) %>% top_n(10, avg_logFC)
+

可以单独可视化这些标志基因。

+
# setting slim.col.label to TRUE will print just the cluster IDS instead of# every cell name
+DoHeatmap(object = sce_seurat, 
+          genes.use = top10$gene, 
+          slim.col.label = TRUE, 
+          remove.key = TRUE)
+

+

scater包,可视化展示部分, kn就是聚类数,就能看到标志基因了。

+

热图: 比较先验分类和SC3的聚类的一致性

+
sc3_plot_consensus(sce, k = kn, show_pdata = c("Biological_Condition",sc3_cluster))
+

+

展示表达量信息

+
sc3_plot_expression(sce, k = kn, show_pdata =  c("Biological_Condition",sc3_cluster))
+

+

展示可能的标记基因

+
sc3_plot_markers(sce, k = kn, show_pdata =  c("Biological_Condition",sc3_cluster))
+

+

在PCA上展示SC3的聚类结果

+
plotPCA(sce, colour_by =  sc3_cluster )
+

+
# sc3_interactive(sce)
+

monocle包,应该是没有找标志基因的函数,但是有推断差异基因的函数,而且它多一个功能,就是进行发育轨迹的推断。 推断发育轨迹才是monocle的拿手好戏,也是它荣升为3大R包的核心竞争力。

+

第一步: 挑选合适的基因. 有多个方法,例如提供已知的基因集,这里选取统计学显著的差异基因列表

+
Sys.time()
+
## [1] "2019-03-21 17:37:06 CST"
+
cds=sce_monocle
+diff_test_res <- differentialGeneTest(cds,
+                                      fullModelFormulaStr = "~Biological_Condition")
+Sys.time()
+
## [1] "2019-03-21 17:39:49 CST"
+
# 可以看到运行耗时
+
+# Select genes that are significant at an FDR < 10%
+sig_genes <- subset(diff_test_res, qval < 0.1)
+head(sig_genes[,c("gene_short_name", "pval", "qval")] )
+
##       gene_short_name         pval         qval
+## A1BG             A1BG 4.112065e-04 1.460722e-03
+## A2M               A2M 4.251744e-08 4.266086e-07
+## AACS             AACS 2.881832e-03 8.275761e-03
+## AADAT           AADAT 1.069794e-02 2.621123e-02
+## AAGAB           AAGAB 1.156771e-07 1.021331e-06
+## AAMP             AAMP 7.626789e-05 3.243869e-04
+
ordering_genes <- row.names (subset(diff_test_res, qval < 0.01))
+cds <- setOrderingFilter(cds, ordering_genes)
+plot_ordering_genes(cds) 
+

+

第二步: 降维。降维的目的是为了更好的展示数据。函数里提供了很多种方法, 不同方法的最后展示的图都不太一样, 其中“DDRTree”是Monocle2使用的默认方法

+
cds <- reduceDimension(cds, max_components = 2,
+                            method = 'DDRTree')
+

第三步: 对细胞进行排序

+
cds <- orderCells(cds)
+

最后两个可视化函数,对结果进行可视化

+
plot_cell_trajectory(cds, color_by = "Biological_Condition")  
+

+

可以很明显看到细胞的发育轨迹,正好对应 pluripotent stem cells 分化而成的 neural progenitor cells (“NPC”) ,还有 “GW16” and “GW21” ,“GW21+3” 这时间进展的孕期细胞。

+

plot_genes_in_pseudotime可以展现marker基因

+

最开始挑选合适基因,除了我们演示的找统计学显著的差异表达基因这个方法外,还可以于已知的标记基因,主要是基于生物学背景知识。

+

如果是已知基因列表,就需要自己读取外包文件,导入R里面来分析。

+
+
+

step10: 继续分类

+

只需要挑选前面步骤分类好的细胞,去表达矩阵里面进行筛选细胞,重新走一遍上面的流程即可。

+
+
+

一些总结:

+

首先是:seurat总结

+

counts矩阵进来后被包装为对象,方便操作。

+

然后一定要经过 NormalizeDataScaleData 的操作

+

函数 FindVariableGenes 可以挑选适合进行下游分析的基因集。

+

函数 RunPCARunTSNE 进行降维

+

函数 FindClusters 直接就分群了,非常方便 函数 FindAllMarkers 可以对分群后各个亚群找标志基因。

+

函数 FeaturePlot 可以展示不同基因在所有细胞的表达量 函数 VlnPlot 可以展示不同基因在不同分群的表达量差异情况 函数 DoHeatmap 可以选定基因集后绘制热图

+
+
+
+

使用M3Drop包

+
+

首先构建M3Drop需要的对象

+
library(M3Drop) 
+Normalized_data <- M3DropCleanData(counts, 
+                                   labels = sample_ann$Biological_Condition , 
+                                   is.counts=TRUE, min_detected_genes=2000)
+dim(Normalized_data$data)
+
## [1] 13958   124
+
length(Normalized_data$labels)
+
## [1] 124
+
class(Normalized_data)
+
## [1] "list"
+
str(Normalized_data)
+
## List of 2
+##  $ data  : num [1:13958, 1:124] 0 0 0.486 0 0.486 ...
+##   ..- attr(*, "dimnames")=List of 2
+##   .. ..$ : chr [1:13958] "A1BG" "A2M" "A2ML1" "AAAS" ...
+##   .. ..$ : chr [1:124] "SRR1275356" "SRR1274090" "SRR1275251" "SRR1275287" ...
+##  $ labels: chr [1:124] "GW16" "NPC" "GW16" "GW21+3" ...
+

这个包设计比较简单,并没有构建S4对象,只是一个简单的list而已。

+
+
+

统计学算法 Michaelis-Menten

+

需要深入读该文章,了解其算法,这里略过,总之它对单细胞转录组的表达矩阵进行了一系列的统计检验。

+
fits <- M3DropDropoutModels(Normalized_data$data)
+

+
# Sum absolute residuals
+data.frame(MM=fits$MMFit$SAr, Logistic=fits$LogiFit$SAr,
+           DoubleExpo=fits$ExpoFit$SAr) 
+
##     MM Logistic DoubleExpo
+## 1 1607     1625       4075
+
# Sum squared residuals
+data.frame(MM=fits$MMFit$SSr, Logistic=fits$LogiFit$SSr,
+           DoubleExpo=fits$ExpoFit$SSr)
+
##    MM Logistic DoubleExpo
+## 1 378      332       1989
+
+
+

找差异基因

+
DE_genes <- M3DropDifferentialExpression(Normalized_data$data, 
+                                         mt_method="fdr", mt_threshold=0.01)
+

+
dim(DE_genes)
+
## [1] 263   3
+
head(DE_genes)
+
##          Gene      p.value      q.value
+## ABCA1   ABCA1 1.381813e-04 0.0080700163
+## ABCE1   ABCE1 1.956343e-05 0.0017964893
+## ABLIM1 ABLIM1 7.353186e-06 0.0009082812
+## ACAT2   ACAT2 1.886555e-05 0.0017438766
+## ACVR1   ACVR1 1.259054e-05 0.0012970374
+## AMER2   AMER2 5.698985e-06 0.0007231494
+

这里是针对上面的统计结果来的

+
+
+

针对差异基因画热图

+
par(mar=c(1,1,1,1)) 
+heat_out <- M3DropExpressionHeatmap(DE_genes$Gene, Normalized_data$data, 
+                                    cell_labels = Normalized_data$labels)
+

+

可视化了解一下找到的差异基因在不同的细胞类型的表达分布情况。

+
+
+

聚类

+

这里可以重新聚类后,针对自己找到的类别来分别找marker基因,不需要使用测试数据自带的表型信息。

+
cell_populations <- M3DropGetHeatmapCellClusters(heat_out, k=4)
+library("ROCR") 
+marker_genes <- M3DropGetMarkers(Normalized_data$data, cell_populations)
+table(cell_populations,Normalized_data$labels)
+
##                 
+## cell_populations GW16 GW21 GW21+3 NPC
+##                1    0    0      0  30
+##                2   23    6      8   0
+##                3   21   10      0   0
+##                4    2    0     24   0
+
+
+

每个类别的marker genes

+
head(marker_genes[marker_genes$Group==4,],20) 
+
##                AUC Group         pval
+## MEF2C    0.9701727     4 1.155536e-15
+## KIAA1598 0.9638932     4 3.177906e-14
+## CALM1    0.9595761     4 6.736008e-13
+## DPYSL3   0.9160126     4 7.070075e-11
+## STMN2    0.9156201     4 5.197202e-11
+## ADCY1    0.9150314     4 2.830176e-12
+## DLG2     0.9085557     4 8.154496e-13
+## MLLT11   0.9054160     4 2.279087e-10
+## MEG3     0.9034537     4 6.217989e-11
+## ZNF286B  0.9032575     4 2.403252e-10
+## ETNK1    0.8975667     4 4.993506e-10
+## CLASP2   0.8869702     4 9.814153e-10
+## RTN1     0.8861852     4 1.097743e-09
+## PHACTR3  0.8857928     4 2.491380e-12
+## BHLHE22  0.8826531     4 5.787225e-13
+## KIFAP3   0.8802983     4 3.942177e-10
+## MIR100HG 0.8795133     4 2.732201e-09
+## GRIN2B   0.8779435     4 1.667996e-15
+## GRIA2    0.8763736     4 4.161204e-10
+## KIF3C    0.8748038     4 4.317676e-10
+
marker_genes[rownames(marker_genes)=="FOS",] 
+
##          AUC Group        pval
+## FOS 0.832712     2 2.99323e-09
+

也可以针对这些 marker genes去画热图,当然,得根据AUC和P值来挑选符合要求的差异基因去绘图。

+
par(mar=c(1,1,1,1)) 
+choosed_marker_genes=as.character(unlist(lapply(
+  split(marker_genes,marker_genes$Group), 
+  function(x) (rownames(head(x,20)))
+                                                )))
+heat_out <- M3DropExpressionHeatmap(choosed_marker_genes, 
+                                    Normalized_data$data,
+                                    cell_labels =  cell_populations)
+

+

如果遇到Error in plot.new() : figure margins too large报错,则单独将heat_out这行命令复制出来运行

+
+
+
+

对感兴趣基因集进行注释

+

通常是GO/KEGG等数据库,通常是超几何分布,GSEA,GSVA等算法。

+

拿到基因集后走我GitHub代码即可:https://github.com/jmzeng1314/GEO 简单的例子如下:

+
library(ggplot2)
+library(clusterProfiler)
+library(org.Hs.eg.db)
+# 下面的 gene_up 是一个 entrez ID的向量,约 500左右的 自定义的基因集
+## 下面的 gene_all 也是一个 entrez ID的向量,约10000左右的背景基因,就我们的scRNA检测到的全部基因。
+  ###   over-representation test
+  kk.up <- enrichKEGG(gene         = gene_up,
+                      organism     = 'hsa',
+                      universe     = gene_all,
+                      pvalueCutoff = 0.9,
+                      qvalueCutoff =0.9)
+  head(kk.up)[,1:6]
+  dotplot(kk.up );ggsave('kk.up.dotplot.png')
+
+
+

其它单细胞R包

+

包括:

+
    +
  • scran
  • +
  • SINCERA
  • +
  • SC3
  • +
+

不一一讲解,具体有需求,就仔细研读说明书,其实最后都是R语言熟练与否。

+
+
+

显示运行环境

+
sessionInfo()
+
## R version 3.5.2 (2018-12-20)
+## Platform: x86_64-apple-darwin15.6.0 (64-bit)
+## Running under: macOS Mojave 10.14.2
+## 
+## Matrix products: default
+## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
+## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
+## 
+## locale:
+## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
+## 
+## attached base packages:
+##  [1] splines   parallel  stats4    stats     graphics  grDevices utils    
+##  [8] datasets  methods   base     
+## 
+## other attached packages:
+##  [1] ROCR_1.0-7                  gplots_3.0.1               
+##  [3] bindrcpp_0.2.2              dplyr_0.7.6                
+##  [5] ggpubr_0.1.8                magrittr_1.5               
+##  [7] M3Drop_1.8.1                numDeriv_2016.8-1          
+##  [9] SC3_1.10.1                  scRNAseq_1.8.0             
+## [11] monocle_2.10.1              DDRTree_0.1.5              
+## [13] irlba_2.3.2                 VGAM_1.0-6                 
+## [15] scater_1.10.0               SingleCellExperiment_1.4.0 
+## [17] SummarizedExperiment_1.12.0 DelayedArray_0.8.0         
+## [19] BiocParallel_1.14.2         matrixStats_0.54.0         
+## [21] Biobase_2.40.0              GenomicRanges_1.34.0       
+## [23] GenomeInfoDb_1.18.1         IRanges_2.16.0             
+## [25] S4Vectors_0.20.1            BiocGenerics_0.28.0        
+## [27] Seurat_2.3.4                Matrix_1.2-15              
+## [29] cowplot_0.9.3               ggplot2_3.0.0              
+## 
+## loaded via a namespace (and not attached):
+##   [1] ggthemes_4.0.1           prabclus_2.2-6          
+##   [3] R.methodsS3_1.7.1        pkgmaker_0.27           
+##   [5] tidyr_0.8.1              acepack_1.4.1           
+##   [7] bit64_0.9-7              knitr_1.20              
+##   [9] R.utils_2.6.0            data.table_1.11.4       
+##  [11] rpart_4.1-13             RCurl_1.95-4.11         
+##  [13] doParallel_1.0.14        metap_1.0               
+##  [15] snow_0.4-2               RANN_2.6.1              
+##  [17] combinat_0.0-8           proxy_0.4-22            
+##  [19] bit_1.1-14               httpuv_1.4.5            
+##  [21] assertthat_0.2.0         viridis_0.5.1           
+##  [23] hms_0.4.2                evaluate_0.11           
+##  [25] promises_1.0.1           fansi_0.3.0             
+##  [27] DEoptimR_1.0-8           caTools_1.17.1.1        
+##  [29] readxl_1.1.0             igraph_1.2.2            
+##  [31] htmlwidgets_1.2          sparsesvd_0.1-4         
+##  [33] purrr_0.2.5              crosstalk_1.0.0         
+##  [35] backports_1.1.2          trimcluster_0.1-2.1     
+##  [37] gbRd_0.4-11              TTR_0.23-4              
+##  [39] abind_1.4-5              RcppEigen_0.3.3.4.0     
+##  [41] withr_2.1.2              robustbase_0.93-3       
+##  [43] checkmate_1.8.5          vcd_1.4-4               
+##  [45] xts_0.11-2               mclust_5.4.1            
+##  [47] cluster_2.0.7-1          ape_5.2                 
+##  [49] segmented_0.5-3.0        lazyeval_0.2.1          
+##  [51] laeken_0.5.0             crayon_1.3.4            
+##  [53] hdf5r_1.0.1              pkgconfig_2.0.2         
+##  [55] slam_0.1-44              labeling_0.3            
+##  [57] nlme_3.1-137             vipor_0.4.5             
+##  [59] nnet_7.3-12              bindr_0.1.1             
+##  [61] rlang_0.2.2              diptest_0.75-7          
+##  [63] registry_0.5             doSNOW_1.0.16           
+##  [65] cellranger_1.1.0         rprojroot_1.3-2         
+##  [67] lmtest_0.9-36            rngtools_1.3.1          
+##  [69] carData_3.0-1            Rhdf5lib_1.4.2          
+##  [71] boot_1.3-20              zoo_1.8-3               
+##  [73] base64enc_0.1-3          beeswarm_0.2.3          
+##  [75] ggridges_0.5.0           pheatmap_1.0.10         
+##  [77] png_0.1-7                viridisLite_0.3.0       
+##  [79] bitops_1.0-6             R.oo_1.22.0             
+##  [81] KernSmooth_2.23-15       DelayedMatrixStats_1.4.0
+##  [83] doRNG_1.7.1              lars_1.2                
+##  [85] stringr_1.3.1            scales_1.0.0            
+##  [87] plyr_1.8.4               ica_1.0-2               
+##  [89] bibtex_0.4.2             gdata_2.18.0            
+##  [91] zlibbioc_1.26.0          compiler_3.5.2          
+##  [93] HSMMSingleCell_1.2.0     lsei_1.2-0              
+##  [95] bbmle_1.0.20             RColorBrewer_1.1-2      
+##  [97] rrcov_1.4-4              fitdistrplus_1.0-11     
+##  [99] cli_1.0.0                dtw_1.20-1              
+## [101] XVector_0.22.0           pbapply_1.3-4           
+## [103] htmlTable_1.12           Formula_1.2-3           
+## [105] MASS_7.3-51.1            mgcv_1.8-26             
+## [107] tidyselect_0.2.4         stringi_1.2.4           
+## [109] forcats_0.3.0            densityClust_0.3        
+## [111] yaml_2.2.0               latticeExtra_0.6-28     
+## [113] ggrepel_0.8.0            grid_3.5.2              
+## [115] tools_3.5.2              rio_0.5.10              
+## [117] rstudioapi_0.7           foreach_1.4.4           
+## [119] foreign_0.8-71           gridExtra_2.3           
+## [121] smoother_1.1             scatterplot3d_0.3-41    
+## [123] Rtsne_0.15               digest_0.6.18           
+## [125] BiocManager_1.30.3       FNN_1.1.2.2             
+## [127] shiny_1.1.0              qlcMatrix_0.9.7         
+## [129] fpc_2.1-11.1             Rcpp_1.0.0              
+## [131] car_3.0-2                SDMTools_1.1-221        
+## [133] later_0.7.4              WriteXLS_4.1.0          
+## [135] httr_1.3.1               npsurv_0.4-0            
+## [137] kernlab_0.9-27           Rdpack_0.10-1           
+## [139] colorspace_1.3-2         ranger_0.11.1           
+## [141] reticulate_1.10          statmod_1.4.30          
+## [143] sp_1.3-1                 flexmix_2.3-14          
+## [145] xtable_1.8-3             jsonlite_1.5            
+## [147] modeltools_0.2-22        destiny_2.12.0          
+## [149] R6_2.2.2                 Hmisc_4.1-1             
+## [151] pillar_1.3.0             htmltools_0.3.6         
+## [153] mime_0.5                 glue_1.3.0              
+## [155] VIM_4.8.0                DT_0.4                  
+## [157] class_7.3-14             codetools_0.2-15        
+## [159] utf8_1.1.4               tsne_0.1-3              
+## [161] pcaPP_1.9-73             mvtnorm_1.0-8           
+## [163] lattice_0.20-38          tibble_1.4.2            
+## [165] mixtools_1.1.0           curl_3.2                
+## [167] ggbeeswarm_0.6.0         gtools_3.8.1            
+## [169] zip_1.0.0                openxlsx_4.1.0          
+## [171] survival_2.43-3          limma_3.36.3            
+## [173] rmarkdown_1.10           docopt_0.6.1            
+## [175] fastICA_1.2-1            munsell_0.5.0           
+## [177] e1071_1.7-0              rhdf5_2.26.1            
+## [179] GenomeInfoDbData_1.1.0   iterators_1.0.10        
+## [181] HDF5Array_1.10.1         haven_1.1.2             
+## [183] reshape2_1.4.3           gtable_0.2.0
+
+ + + + +
+ + + + + + + +