diff --git a/compare-4-packages/README.md b/compare-4-packages/README.md
new file mode 100644
index 0000000..83d625b
--- /dev/null
+++ b/compare-4-packages/README.md
@@ -0,0 +1,2 @@
+### 比较4个R包
+
diff --git a/compare-4-packages/compare.Rmd b/compare-4-packages/compare.Rmd
new file mode 100644
index 0000000..84e6889
--- /dev/null
+++ b/compare-4-packages/compare.Rmd
@@ -0,0 +1,883 @@
+---
+title: "compare 4 R packages for scRNAseq"
+author: "jmzeng1314@163.com"
+date: "`r format(Sys.time(), '%d %B, %Y')`"
+output:
+ html_document:
+ toc: yes
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+knitr::opts_chunk$set(warning = F)
+knitr::opts_chunk$set(message = F)
+```
+
+## 安装并且加载必要的包
+
+需要自行下载安装一些必要的R包,包括我们的测试数据集来源的R包`scRNAseq` ,以及4个单细胞转录组数据处理包!
+
+因为大量学员在中国大陆,所以需要特别的R包安装方法,就是切换镜像后再下载R包。参考:http://www.bio-info-trainee.com/3727.html
+
+```{r}
+options()$repos ## 查看使用install.packages安装时的默认镜像
+options()$BioC_mirror ##查看使用bioconductor的默认镜像
+options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/") ##指定镜像,这个是中国科技大学镜像
+options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/")) ##指定install.packages安装镜像,这个是清华镜像
+options()$repos
+options()$BioC_mirror
+if (!requireNamespace("BiocManager", quietly = TRUE))
+ install.packages("BiocManager")
+if(!require('Seurat')){
+ BiocManager::install('Seurat',ask = F,update = F)
+}
+if(!require('scater')){
+ BiocManager::install(c( 'scater'),ask = F,update = F)
+}
+if(!require('monocle')){
+ BiocManager::install(c( 'monocle'),ask = F,update = F)
+}
+if(!require('scRNAseq')){
+ BiocManager::install(c( 'scRNAseq'),ask = F,update = F)
+}
+if(!require('SC3')){
+ BiocManager::install(c( 'SC3'),ask = F,update = F)
+}
+if(!require('M3Drop')){
+ BiocManager::install(c( 'M3Drop'),ask = F,update = F)
+}
+if(!require('ggpubr')){
+ BiocManager::install(c( 'ggpubr'),ask = F,update = F)
+}
+```
+
+安装成功后就可以加载R包:
+
+```{r, message=FALSE}
+rm(list = ls())
+options(warn=-1)
+suppressMessages(library(scater))
+suppressMessages(library(Seurat))
+suppressMessages(library(monocle))
+suppressMessages(library(scRNAseq))
+suppressMessages(library(SC3))
+suppressMessages(library(M3Drop))
+```
+
+## 简单了解数据集
+
+这个包内置的是 Pollen et al. 2014 数据集,人类单细胞细胞,分成**4类**,分别是 pluripotent stem cells 分化而成的 neural progenitor cells (“NPC”) ,还有 “GW16” and “GW21” ,“GW21+3” 这种孕期细胞,理解这些需要一定的生物学背景知识,如果不感兴趣,可以略过。
+
+这个数据集很出名,截止2019年1月已经有近400的引用了,后面的人开发`R包算法`都会在其上面做测试,本例子只使用了数据集的**4种细胞类型**而已,因为 scRNAseq 这个R包就提供了这些,完整的数据是 23730 features,
+301 samples, 地址为
需要自行下载安装一些必要的R包,包括我们的测试数据集来源的R包scRNAseq
,以及4个单细胞转录组数据处理包!
因为大量学员在中国大陆,所以需要特别的R包安装方法,就是切换镜像后再下载R包。参考:http://www.bio-info-trainee.com/3727.html
+options()$repos ## 查看使用install.packages安装时的默认镜像
+## CRAN
+## "@CRAN@"
+options()$BioC_mirror ##查看使用bioconductor的默认镜像
+## NULL
+options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/") ##指定镜像,这个是中国科技大学镜像
+options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/")) ##指定install.packages安装镜像,这个是清华镜像
+options()$repos
+## CRAN
+## "https://mirrors.tuna.tsinghua.edu.cn/CRAN/"
+options()$BioC_mirror
+## [1] "https://mirrors.ustc.edu.cn/bioc/"
+if (!requireNamespace("BiocManager", quietly = TRUE))
+ install.packages("BiocManager")
+if(!require('Seurat')){
+ BiocManager::install('Seurat',ask = F,update = F)
+}
+if(!require('scater')){
+ BiocManager::install(c( 'scater'),ask = F,update = F)
+}
+if(!require('monocle')){
+ BiocManager::install(c( 'monocle'),ask = F,update = F)
+}
+if(!require('scRNAseq')){
+ BiocManager::install(c( 'scRNAseq'),ask = F,update = F)
+}
+if(!require('SC3')){
+ BiocManager::install(c( 'SC3'),ask = F,update = F)
+}
+if(!require('M3Drop')){
+ BiocManager::install(c( 'M3Drop'),ask = F,update = F)
+}
+if(!require('ggpubr')){
+ BiocManager::install(c( 'ggpubr'),ask = F,update = F)
+}
+安装成功后就可以加载R包:
+rm(list = ls())
+options(warn=-1)
+suppressMessages(library(scater))
+suppressMessages(library(Seurat))
+suppressMessages(library(monocle))
+suppressMessages(library(scRNAseq))
+suppressMessages(library(SC3))
+suppressMessages(library(M3Drop))
+这个包内置的是 Pollen et al. 2014 数据集,人类单细胞细胞,分成4类,分别是 pluripotent stem cells 分化而成的 neural progenitor cells (“NPC”) ,还有 “GW16” and “GW21” ,“GW21+3” 这种孕期细胞,理解这些需要一定的生物学背景知识,如果不感兴趣,可以略过。
+这个数据集很出名,截止2019年1月已经有近400的引用了,后面的人开发R包算法
都会在其上面做测试,本例子只使用了数据集的4种细胞类型而已,因为 scRNAseq 这个R包就提供了这些,完整的数据是 23730 features, 301 samples, 地址为https://hemberg-lab.github.io/scRNA.seq.datasets/human/tissues/ , 这个网站非常值得推荐,简直是一个宝藏。
这里面的表达矩阵是由 RSEM (Li and Dewey 2011) 软件根据 hg38 RefSeq transcriptome 得到的,总是130个文库,每个细胞测了两次,测序深度不一样(非常的不一样)
+library(scRNAseq)
+## ----- Load Example Data -----
+data(fluidigm)
+ct <- floor(assays(fluidigm)$rsem_counts)
+ct[1:4,1:4]
+## SRR1275356 SRR1274090 SRR1275251 SRR1275287
+## A1BG 0 0 0 0
+## A1BG-AS1 0 0 0 0
+## A1CF 0 0 0 0
+## A2M 0 0 0 33
+sample_ann <- as.data.frame(colData(fluidigm))
+DT::datatable(sample_ann,
+
+ rownames= FALSE,extensions = c('Scroller'),
+ options = list(
+ pageLength = 10,
+ lengthMenu = list(c(10, 50, 100,-1), c('10', '50','100', 'All')),
+ columnDefs = list(list(className = 'dt-center', targets = 0 :8)),
+ scrollX = TRUE,
+ fixedHeader = TRUE,
+ fixedColumns = TRUE ,
+ deferRender = TRUE
+),
+filter = 'top',
+escape = FALSE
+)
+
+
+前面说到,这个数据集是130个文库,每个细胞测了两次,测序深度不一样,这130个细胞,分成4类,分别是: pluripotent stem cells 分化而成的 neural progenitor cells (“NPC”) ,还有 “GW16” and “GW21” ,“GW21+3” 这种孕期细胞。
+批量,粗略的看一看各个细胞的一些统计学指标的分布情况
+library(ggpubr)
+box <- lapply(colnames(sample_ann[,1:19]),function(i) {
+ dat <- sample_ann[,i,drop=F]
+ dat$sample=rownames(dat)
+ dat$group='all cells'
+ ## 画boxplot
+ p <- ggboxplot(dat, x = "group", y = i,
+ add = "jitter" )
+ p
+})
+plot_grid(plotlist=box, ncol=5 )
+
+# ggsave(file="stat_all_cells.pdf")
+很明显,他们可以有根据来进行分组,这里不再演示。 不过通常的文章并不会考虑如此多的细节,这里重点是批量,代码技巧非常值得你们学校。
+因为进行了简单探索,对表型数据就有了把握,接下来可以进行一定程度的过滤,因为细节太多,这里重点是批量,代码技巧非常值得你们学校。
+pa <- colnames(sample_ann[,c(1:9,11:16,18,19)])
+tf <- lapply(pa,function(i) {
+ # i=pa[1]
+ dat <- sample_ann[,i]
+ dat <- abs(log10(dat))
+ fivenum(dat)
+ (up <- mean(dat)+2*sd(dat))
+ (down <- mean(dat)- 2*sd(dat) )
+ valid <- ifelse(dat > down & dat < up, 1,0 )
+})
+
+tf <- do.call(cbind,tf)
+choosed_cells <- apply(tf,1,function(x) all(x==1))
+table(sample_ann$Biological_Condition)
+##
+## GW16 GW21 GW21+3 NPC
+## 52 16 32 30
+sample_ann=sample_ann[choosed_cells,]
+table(sample_ann$Biological_Condition)
+##
+## GW16 GW21 GW21+3 NPC
+## 36 11 23 29
+ct <- ct[,choosed_cells]
+ct[1:4,1:4]
+## SRR1274090 SRR1275287 SRR1275364 SRR1275269
+## A1BG 0 0 0 0
+## A1BG-AS1 0 0 0 0
+## A1CF 0 0 0 0
+## A2M 0 33 0 51
+counts <- ct
+fivenum(apply(counts,1,function(x) sum(x>0) ))
+## A1CF OR8G1 LINC01003 MRPS36 YWHAZ
+## 0 0 4 26 99
+boxplot(apply(counts,1,function(x) sum(x>0) ))
+
+fivenum(apply(counts,2,function(x) sum(x>0) ))
+## SRR1275365 SRR1275345 SRR1275248 SRR1275273 SRR1274125
+## 1566.0 3043.0 4002.0 5706.5 8024.0
+hist(apply(counts,2,function(x) sum(x>0) ))
+
+choosed_genes=apply(counts,1,function(x) sum(x>0) )>0
+table(choosed_genes)
+## choosed_genes
+## FALSE TRUE
+## 9496 16759
+counts <- counts[choosed_genes,]
+注意,这里的 seurat是2.x版本,同理,monocle也是2版本。
+用法 | +Seurat | +scater | +monocle | +
---|---|---|---|
创建R包要求的对象 | +CreateSeuratObject | +SingleCellExperiment | +newCellDataSet | +
QC and selecting cell | +创建矩阵的同时,可以选择过滤参数min.cell,min.gene等。还有FilterCells函数可以去除不合格的细胞。 | +calculateQCMetrics()函数,其中的feature_controls参数可以指定过滤指标,然后有一系列的可视化函数 | +用基础R函数进行初步过滤,还可以用detectGenes()函数加上subset()过滤 | +
表达量的标准化或者归一化 | +NormalizeData | +calculateCPM()等系列函数 | +estimateSizeFactors()还有estimateDispersions | +
寻找重要的基因 | +FindVariableGenes | +没有看到专门的函数,可以借用R基础函数 | +differentialGeneTest()函数 | +
去除干扰因素 | +ScaleData | +借用limma包的 removeBatchEffect 函数 | +去除干扰因素的功能被包装在降维函数中 | +
降维 | +RunPCA或者RunTSNE | +runPCA或者runTSNE,runDiffusionMap | +reduceDimension函数,可以选择多种参数 | +
降维后可视化 | +VizPCA和PCElbowPlot;PCAPlot或者TSNEPlot | +plotPCA和plotTSNE等等 | +plot_cell_trajectory()或plot_genes_in_pseudotime | +
分类群cluster | +FindClusters | +并没有包装聚类函数,而且辅助其它R包,或者R基础函数 | +clusterCells | +
聚类后找每个细胞亚群的标志基因 | +FindMarkers和FindAllMarkers函数 | +借助SC3包 | +newCellTypeHierarchy classifyCells | +
calculateMarkerSpecificity | ++ |
首先为 scater 包创建对象
+pheno_data <- as.data.frame(colData(fluidigm))
+ct <- floor(assays(fluidigm)$rsem_counts)
+## 这里需要把Pollen的表达矩阵做成我们的 scater 要求的对象
+# data("sc_example_counts")
+# data("sc_example_cell_info")
+# 你也可以尝试该R包自带的数据集。
+# 参考 https://bioconductor.org/packages/release/bioc/vignettes/scater/inst/doc/vignette-intro.R
+sce_scater <- SingleCellExperiment(
+ assays = list(counts = ct),
+ colData = pheno_data
+ )
+sce_scater
+## class: SingleCellExperiment
+## dim: 26255 130
+## metadata(0):
+## assays(1): counts
+## rownames(26255): A1BG A1BG-AS1 ... ZZEF1 ZZZ3
+## rowData names(0):
+## colnames(130): SRR1275356 SRR1274090 ... SRR1275366 SRR1275261
+## colData names(28): NREADS NALIGNED ... Cluster1 Cluster2
+## reducedDimNames(0):
+## spikeNames(0):
+# 后面所有的分析都是基于 sce_scater 这个变量
+# 是一个 SingleCellExperiment 对象,被很多单细胞R包采用。
+然后为 seurat 包创建对象
+meta <- as.data.frame(colData(fluidigm))
+ct <- floor(assays(fluidigm)$rsem_counts)
+counts <- ct
+sce_seurat <- CreateSeuratObject(raw.data = counts,
+ meta.data =meta,
+ min.cells = 3,
+ min.genes = 200,
+ project = "Pollen")
+sce_seurat
+## An object of class seurat in project Pollen
+## 14656 genes across 130 samples.
+## 后续所有的分析都基于这个 sce_seurat 变量,是一个对象
+最后为 monocle 包创建对象
+ct <- floor(assays(fluidigm)$rsem_counts)
+gene_ann <- data.frame(
+ gene_short_name = row.names(ct),
+ row.names = row.names(ct)
+)
+sample_ann=as.data.frame(colData(fluidigm))
+pd <- new("AnnotatedDataFrame",
+ data=sample_ann)
+fd <- new("AnnotatedDataFrame",
+ data=gene_ann)
+sce_monocle <- newCellDataSet(
+ ct,
+ phenoData = pd,
+ featureData =fd,
+ expressionFamily = negbinomial.size(),
+ lowerDetectionLimit=1)
+sce_monocle
+## CellDataSet (storageMode: environment)
+## assayData: 26255 features, 130 samples
+## element names: exprs
+## protocolData: none
+## phenoData
+## sampleNames: SRR1275356 SRR1274090 ... SRR1275261 (130 total)
+## varLabels: NREADS NALIGNED ... Size_Factor (29 total)
+## varMetadata: labelDescription
+## featureData
+## featureNames: A1BG A1BG-AS1 ... ZZZ3 (26255 total)
+## fvarLabels: gene_short_name
+## fvarMetadata: labelDescription
+## experimentData: use 'experimentData(object)'
+## Annotation:
+再次回顾一下这3个对象。
+sce_seurat
+## An object of class seurat in project Pollen
+## 14656 genes across 130 samples.
+sce_scater
+## class: SingleCellExperiment
+## dim: 26255 130
+## metadata(0):
+## assays(1): counts
+## rownames(26255): A1BG A1BG-AS1 ... ZZEF1 ZZZ3
+## rowData names(0):
+## colnames(130): SRR1275356 SRR1274090 ... SRR1275366 SRR1275261
+## colData names(28): NREADS NALIGNED ... Cluster1 Cluster2
+## reducedDimNames(0):
+## spikeNames(0):
+sce_monocle
+## CellDataSet (storageMode: environment)
+## assayData: 26255 features, 130 samples
+## element names: exprs
+## protocolData: none
+## phenoData
+## sampleNames: SRR1275356 SRR1274090 ... SRR1275261 (130 total)
+## varLabels: NREADS NALIGNED ... Size_Factor (29 total)
+## varMetadata: labelDescription
+## featureData
+## featureNames: A1BG A1BG-AS1 ... ZZZ3 (26255 total)
+## fvarLabels: gene_short_name
+## fvarMetadata: labelDescription
+## experimentData: use 'experimentData(object)'
+## Annotation:
+在 seurat
包,
mito.genes <- grep(pattern = "^MT-",
+ x = rownames(sce_seurat@data),
+ value = TRUE)
+# 恰好这个例子的表达矩阵里面没有线粒体基因
+percent.mito <- Matrix::colSums(sce_seurat@raw.data[mito.genes, ]) / Matrix::colSums(sce_seurat@raw.data)
+## 也可以加入很多其它属性,比如 ERCC 等。
+
+# AddMetaData adds columns to object@meta.data, and is a great place to stash QC stats
+sce_seurat <- AddMetaData(object = sce_seurat,
+ metadata = percent.mito,
+ col.name = "percent.mito")
+VlnPlot(object = sce_seurat,
+ features.plot = c("nGene", "nUMI", "percent.mito"),
+ group.by = 'Biological_Condition', nCol = 3)
+
+GenePlot(object = sce_seurat, gene1 = "nUMI", gene2 = "nGene")
+
+CellPlot(sce_seurat,
+ sce_seurat@cell.names[3],
+ sce_seurat@cell.names[4],
+ do.ident = FALSE)
+
+# FilterCells函数
+# sce_seurat
+# sce_seurat <- FilterCells(object = sce_seurat,
+# subset.names = c("nGene", "percent.mito"),
+# low.thresholds = c(200, -Inf),
+# high.thresholds = c(2500, 0.05))
+# sce_seurat
+在scater包,
+这里仅仅是演示 scater 包最简单的质量控制代码,详细代码见:https://bioconductor.org/packages/release/bioc/vignettes/scater/inst/doc/vignette-qc.html
+genes <- rownames(rowData(sce_scater))
+genes[grepl('^MT-',genes)]
+## character(0)
+genes[grepl('^ERCC-',genes)]
+## character(0)
+exprs(sce_scater) <- log2(
+ calculateCPM(sce_scater ) + 1)
+keep_feature <- rowSums(exprs(sce_scater) > 0) > 0
+table(keep_feature)
+## keep_feature
+## FALSE TRUE
+## 9159 17096
+sce_scater <- sce_scater[keep_feature,]
+sce_scater
+## class: SingleCellExperiment
+## dim: 17096 130
+## metadata(0):
+## assays(2): counts logcounts
+## rownames(17096): A1BG A1BG-AS1 ... ZZEF1 ZZZ3
+## rowData names(0):
+## colnames(130): SRR1275356 SRR1274090 ... SRR1275366 SRR1275261
+## colData names(28): NREADS NALIGNED ... Cluster1 Cluster2
+## reducedDimNames(0):
+## spikeNames(0):
+sce_scater <- calculateQCMetrics(sce_scater)
+plotHighestExprs(sce_scater, exprs_values = "counts")
+
+plotExprsFreqVsMean(sce_scater)
+
+在monocle
包,
sce_monocle
+## CellDataSet (storageMode: environment)
+## assayData: 26255 features, 130 samples
+## element names: exprs
+## protocolData: none
+## phenoData
+## sampleNames: SRR1275356 SRR1274090 ... SRR1275261 (130 total)
+## varLabels: NREADS NALIGNED ... Size_Factor (29 total)
+## varMetadata: labelDescription
+## featureData
+## featureNames: A1BG A1BG-AS1 ... ZZZ3 (26255 total)
+## fvarLabels: gene_short_name
+## fvarMetadata: labelDescription
+## experimentData: use 'experimentData(object)'
+## Annotation:
+## 起初是
+sce_monocle <- detectGenes(sce_monocle, min_expr = 0.1)
+print(head(fData(sce_monocle)))
+## gene_short_name num_cells_expressed
+## A1BG A1BG 10
+## A1BG-AS1 A1BG-AS1 2
+## A1CF A1CF 1
+## A2M A2M 21
+## A2M-AS1 A2M-AS1 3
+## A2ML1 A2ML1 9
+expressed_genes <- row.names(subset(fData(sce_monocle),
+ num_cells_expressed >= 5))
+length(expressed_genes)
+## [1] 13385
+sce_monocle <- sce_monocle[expressed_genes,]
+sce_monocle
+## CellDataSet (storageMode: environment)
+## assayData: 13385 features, 130 samples
+## element names: exprs
+## protocolData: none
+## phenoData
+## sampleNames: SRR1275356 SRR1274090 ... SRR1275261 (130 total)
+## varLabels: NREADS NALIGNED ... num_genes_expressed (30 total)
+## varMetadata: labelDescription
+## featureData
+## featureNames: A1BG A2M ... ZZZ3 (13385 total)
+## fvarLabels: gene_short_name num_cells_expressed
+## fvarMetadata: labelDescription
+## experimentData: use 'experimentData(object)'
+## Annotation:
+# 过滤基因后是
+tmp=pData(sce_monocle)
+fivenum(tmp[,1])
+## [1] 91616 232899 892209 8130850 14477100
+fivenum(tmp[,30])
+## [1] 1418.0 2961.0 3841.5 5381.0 8221.0
+# 这里并不需要过滤细胞,如果想过滤,就自己摸索阈值,然后挑选细胞即可。
+# 这里留下来了所有的细胞。
+valid_cells <- row.names(pData(sce_monocle) )
+sce_monocle <- sce_monocle[,valid_cells]
+sce_monocle
+## CellDataSet (storageMode: environment)
+## assayData: 13385 features, 130 samples
+## element names: exprs
+## protocolData: none
+## phenoData
+## sampleNames: SRR1275356 SRR1274090 ... SRR1275261 (130 total)
+## varLabels: NREADS NALIGNED ... num_genes_expressed (30 total)
+## varMetadata: labelDescription
+## featureData
+## featureNames: A1BG A2M ... ZZZ3 (13385 total)
+## fvarLabels: gene_short_name num_cells_expressed
+## fvarMetadata: labelDescription
+## experimentData: use 'experimentData(object)'
+## Annotation:
+在 seurat包需要先假定平均细胞测序文库大小,这里是 10000
+sce_seurat <- NormalizeData(object = sce_seurat,
+ normalization.method = "LogNormalize",
+ scale.factor = 10000,
+ display.progress = F)
+在scater包:
+assays(sce_scater)
+## List of length 2
+## names(2): counts logcounts
+counts(sce_scater) <- assays(sce_scater)$counts
+norm_exprs(sce_scater) <- log2(calculateCPM(sce_scater, use_size_factors = FALSE) + 1)
+
+stand_exprs(sce_scater) <- log2(calculateCPM(sce_scater, use_size_factors = FALSE) + 1)
+
+tpm(sce_scater) <- calculateTPM(sce_scater, effective_length = 5e4)
+
+cpm(sce_scater) <- calculateCPM(sce_scater, use_size_factors = FALSE)
+
+assays(sce_scater)
+## List of length 6
+## names(6): counts logcounts norm_exprs stand_exprs tpm cpm
+
+在monocle
包,
colnames(phenoData(sce_monocle)@data)
+## [1] "NREADS" "NALIGNED"
+## [3] "RALIGN" "TOTAL_DUP"
+## [5] "PRIMER" "INSERT_SZ"
+## [7] "INSERT_SZ_STD" "COMPLEXITY"
+## [9] "NDUPR" "PCT_RIBOSOMAL_BASES"
+## [11] "PCT_CODING_BASES" "PCT_UTR_BASES"
+## [13] "PCT_INTRONIC_BASES" "PCT_INTERGENIC_BASES"
+## [15] "PCT_MRNA_BASES" "MEDIAN_CV_COVERAGE"
+## [17] "MEDIAN_5PRIME_BIAS" "MEDIAN_3PRIME_BIAS"
+## [19] "MEDIAN_5PRIME_TO_3PRIME_BIAS" "sample_id.x"
+## [21] "Lane_ID" "LibraryName"
+## [23] "avgLength" "spots"
+## [25] "Biological_Condition" "Coverage_Type"
+## [27] "Cluster1" "Cluster2"
+## [29] "Size_Factor" "num_genes_expressed"
+sce_monocle <- estimateSizeFactors(sce_monocle)
+sce_monocle <- estimateDispersions(sce_monocle)
+colnames(phenoData(sce_monocle)@data)
+## [1] "NREADS" "NALIGNED"
+## [3] "RALIGN" "TOTAL_DUP"
+## [5] "PRIMER" "INSERT_SZ"
+## [7] "INSERT_SZ_STD" "COMPLEXITY"
+## [9] "NDUPR" "PCT_RIBOSOMAL_BASES"
+## [11] "PCT_CODING_BASES" "PCT_UTR_BASES"
+## [13] "PCT_INTRONIC_BASES" "PCT_INTERGENIC_BASES"
+## [15] "PCT_MRNA_BASES" "MEDIAN_CV_COVERAGE"
+## [17] "MEDIAN_5PRIME_BIAS" "MEDIAN_3PRIME_BIAS"
+## [19] "MEDIAN_5PRIME_TO_3PRIME_BIAS" "sample_id.x"
+## [21] "Lane_ID" "LibraryName"
+## [23] "avgLength" "spots"
+## [25] "Biological_Condition" "Coverage_Type"
+## [27] "Cluster1" "Cluster2"
+## [29] "Size_Factor" "num_genes_expressed"
+在 seurat 包,去除一些文库大小,线粒体基因含量,ERCC含量等因素的功能被包装在 ScaleData 函数里面,前提是需要被去除的因素提供 AddMetaData 函数添加到了对象。
+sce_seurat <- ScaleData(object = sce_seurat,
+ vars.to.regress = c("nUMI"),
+ display.progress = F)
+## 所有放在 vars.to.regress 参数的变量均可以被去除
+在scater包, 主要是可视化那些干扰因素:
+sce_scater <- runPCA(sce_scater)
+# colnames(colData(sce_scater))
+plotPCA(
+ sce_scater,
+ colour_by = "Biological_Condition",
+ size_by = "NALIGNED"
+)
+
+# 还有 plotQC 函数。
+如果需要去除干扰因素,可以借用limma包的 removeBatchEffect 函数
+library(limma)
+batch <- rep(1:2, each=20)
+corrected <- removeBatchEffect(logcounts(example_sce), block=batch)
+assay(example_sce, "corrected_logcounts") <- corrected
+在monocle包,去除干扰因素的功能被包装在降维函数中,示例如下:
+# 放在 residualModelFormulaStr 里面的是需要去除的
+cds <- reduceDimension(cds, max_components = 2, num_dim = 2,
+ reduction_method = 'tSNE',
+ residualModelFormulaStr = "~Biological_Condition + num_genes_expressed",
+ verbose = T)
+cds <- clusterCells(cds, num_clusters = 2)
+plot_cell_clusters(cds, 1, 2, color = "Biological_Condition")
+
+## 上面去除了Biological_Condition,所以接下来聚类它们就被打散了。
+寻找波动比较明显的基因,后续用这些基因而非全部基因进行分析,主要为了降低计算量。
+在 seurat
包,必须要先 normalization
,然后才能进行FindVariableGenes
计算,代码如下:
sce_seurat <- FindVariableGenes(object = sce_seurat,
+ mean.function = ExpMean,
+ dispersion.function = LogVMR,
+ x.low.cutoff = 0.0125,
+ x.high.cutoff = 3,
+ y.cutoff = 0.5)
+
+# 通过调整参数可以得到不同数量的 var.genes
+length(sce_seurat@var.genes)
+## [1] 4944
+在scater
包, 没有看到专门的函数,可以借用R基础函数。
在 monocle
包中,同样也不是所有的基因都有作用,所以先进行挑选,合适的基因才在后续分析中用来降维聚类。
disp_table <- dispersionTable(sce_monocle)
+# 也可以先挑选差异基因
+unsup_clustering_genes <- subset(disp_table, mean_expression >= 0.1)
+dim(unsup_clustering_genes)
+## [1] 12911 4
+sce_monocle <- setOrderingFilter(sce_monocle,
+ unsup_clustering_genes$gene_id)
+plot_ordering_genes(sce_monocle)
+
+plot_pc_variance_explained(sce_monocle, return_all = F)
+
+# norm_method='log'
+后面做降维的时候的 num_dim
参数选择基于上面的PCA图
在 seurat
包, 降维之前必须要先 Run ScaleData()
, 每个降维算法都被单独包装为函数了。
sce_seurat <- RunPCA(object = sce_seurat,
+ pc.genes = sce_seurat@var.genes,
+ do.print = TRUE,
+ pcs.print = 1:5,
+ genes.print = 5)
+## [1] "PC1"
+## [1] "MLLT11" "KIDINS220" "SLA" "CALM1" "FAM110B"
+## [1] ""
+## [1] "HMGB2" "LIX1" "LDHA" "TRIM59" "ENO1"
+## [1] ""
+## [1] ""
+## [1] "PC2"
+## [1] "CENPF" "SHISA2" "CDK1" "HIST1H4C" "BARD1"
+## [1] ""
+## [1] "ECSCR" "MYCT1" "ELK3" "IMPG2" "ADGRF5"
+## [1] ""
+## [1] ""
+## [1] "PC3"
+## [1] "ADGRV1" "GPX3" "FAM107A" "HOPX" "PTN"
+## [1] ""
+## [1] "BCAT1" "EFNA5" "DLK1" "TUBA1C" "CXADR"
+## [1] ""
+## [1] ""
+## [1] "PC4"
+## [1] "S100A6" "LOX" "ADAMTS16" "CA2" "DKK3"
+## [1] ""
+## [1] "FAM50A" "DDX54" "ARFIP2" "IPP" "PXMP2"
+## [1] ""
+## [1] ""
+## [1] "PC5"
+## [1] "ASB16-AS1" "CENPL" "ATP1A1-AS1" "CHD9" "KLHL31"
+## [1] ""
+## [1] "CD44" "CA2" "SYNJ2" "LRP10" "TTC38"
+## [1] ""
+## [1] ""
+sce_seurat@dr
+## $pca
+## A dimensional reduction object with key PC
+## Number of dimensions: 20
+## Projected dimensional reduction calculated: FALSE
+## Jackstraw run: FALSE
+tmp <- sce_seurat@dr$pca@gene.loadings
+sce_seurat <- RunTSNE(object = sce_seurat,
+ dims.use = 1:10,
+ do.fast = TRUE,
+ perplexity=10)
+sce_seurat@dr
+## $pca
+## A dimensional reduction object with key PC
+## Number of dimensions: 20
+## Projected dimensional reduction calculated: FALSE
+## Jackstraw run: FALSE
+##
+## $tsne
+## A dimensional reduction object with key tSNE_
+## Number of dimensions: 2
+## Projected dimensional reduction calculated: FALSE
+## Jackstraw run: FALSE
+在scater
包,
sce <- runPCA(sce_scater)
+# 这里并没有进行任何基因的挑选,就直接进行了PCA,与 seurat包不一样。
+reducedDimNames(sce)
+## [1] "PCA"
+sce <- runPCA(sce, ncomponents=20)
+# Perplexity of 10 just chosen here arbitrarily.
+set.seed(1000)
+# 这里的这个 perplexity 参数很重要
+sce <- runTSNE(sce, perplexity=30)
+sce <- runDiffusionMap(sce)
+reducedDimNames(sce)
+## [1] "PCA" "TSNE" "DiffusionMap"
+sce_scater=sce
+在monocle
包,降维函数就是 reduceDimension
, 它包装这一个去除干扰因素的功能, 可供选择的降维算法包括: "DDRTree", "ICA", "tSNE", "SimplePPT", "L1-graph", "SGL-tree"
# 放在 residualModelFormulaStr 参数里面的是需要去除的
+sce_monocle <- reduceDimension(sce_monocle,
+ max_components = 2, num_dim = 2,
+ reduction_method = 'tSNE',
+ verbose = T)
+在 seurat 包, 两个降维算法被单独包装为两个函数,所以可视化也是两个函数。
+sce_seurat@dr
+## $pca
+## A dimensional reduction object with key PC
+## Number of dimensions: 20
+## Projected dimensional reduction calculated: FALSE
+## Jackstraw run: FALSE
+##
+## $tsne
+## A dimensional reduction object with key tSNE_
+## Number of dimensions: 2
+## Projected dimensional reduction calculated: FALSE
+## Jackstraw run: FALSE
+PCAPlot(sce_seurat, dim.1 = 1, dim.2 = 2,
+ group.by = 'Biological_Condition')
+
+TSNEPlot(sce_seurat,group.by = 'Biological_Condition')
+
+VizPCA( sce_seurat, pcs.use = 1:2)
+
+PCElbowPlot(object = sce_seurat)
+
+sce_seurat <- ProjectPCA(sce_seurat, do.print = FALSE)
+PCHeatmap(object = sce_seurat,
+ pc.use = 1,
+ cells.use = ncol(sce_seurat@data),
+ do.balanced = TRUE,
+ label.columns = FALSE)
+
+在scater包, 同样的是多个降维函数和多个可视化函数
+plotTSNE(sce_scater,
+ colour_by = "Biological_Condition" )
+
+plotPCA(sce_scater,
+ colour_by = "Biological_Condition" )
+
+plotPCA(sce_scater, ncomponents = 4,
+ colour_by = "Biological_Condition" )
+
+plotDiffusionMap(sce_scater,
+ colour_by = "Biological_Condition" )
+
+在monocle包,有趣的是降维后必须先分群才能进行可视化。
+sce_monocle <- clusterCells(sce_monocle, num_clusters = 4)
+## Distance cutoff calculated to 0.5035647
+plot_cell_clusters(sce_monocle, 1, 2, color = "Biological_Condition")
+
+聚类后就可以根据阈值进行分群
+在 seurat
包, 重点: 需要搞懂这里的 resolution 参数,而且降维算法可以选PCA或者ICA , 分群算法也可以选择。
sce_seurat <- FindClusters(object = sce_seurat,
+ reduction.type = "pca",
+ dims.use = 1:10, force.recalc = T,
+ resolution = 0.9, print.output = 0,
+ save.SNN = TRUE)
+PrintFindClustersParams(sce_seurat)
+## Parameters used in latest FindClusters calculation run on: 2019-03-21 17:35:14
+## =============================================================================
+## Resolution: 0.9
+## -----------------------------------------------------------------------------
+## Modularity Function Algorithm n.start n.iter
+## 1 1 100 10
+## -----------------------------------------------------------------------------
+## Reduction used k.param prune.SNN
+## pca 30 0.0667
+## -----------------------------------------------------------------------------
+## Dims used in calculation
+## =============================================================================
+## 1 2 3 4 5 6 7 8 9 10
+table(sce_seurat@meta.data$res.0.9)
+##
+## 0 1 2
+## 60 36 34
+在scater
包, 并没有包装聚类函数,而且辅助其它R包,或者R基础函数:
最常用的是无缝连接 SC3
包:
library(SC3) # BiocManager::install('SC3')
+sce <- sc3_estimate_k(sce_scater)
+metadata(sce)$sc3$k_estimation
+## [1] 24
+rowData(sce)$feature_symbol <- rownames(rowData(sce))
+一步运行sc3的所有分析, 相当耗费时间
+这里kn
表示的预估聚类数, 考虑到数据集是已知的,我们强行设置为4组, 具体数据要具体考虑。
# 耗费时间
+kn <- 4 ## 这里可以选择 3:5 看多种分类结果。
+sc3_cluster <- "sc3_4_clusters"
+Sys.time()
+## [1] "2019-03-21 17:35:14 CST"
+sce <- sc3(sce, ks = kn, biology = TRUE)
+Sys.time()
+## [1] "2019-03-21 17:36:27 CST"
+在monocle
包,如下:
sce_monocle <- clusterCells(sce_monocle, num_clusters = 4)
+## Distance cutoff calculated to 0.5035647
+plot_cell_clusters(sce_monocle)
+
+plot_cell_clusters(sce_monocle, 1, 2, color = "Biological_Condition")
+
+可供选择的聚类算法包括:densityPeak, louvian and DDRTree
在seurat
包,
sce.markers <- FindAllMarkers(object = sce_seurat, only.pos = TRUE, min.pct = 0.25,
+ thresh.use = 0.25)
+# DT::datatable(sce.markers)
+library(dplyr)
+sce.markers %>% group_by(cluster) %>% top_n(2, avg_logFC)
+## # A tibble: 6 x 7
+## # Groups: cluster [3]
+## p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene
+## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
+## 1 1.01e- 5 1.84 0.467 0.114 1.48e- 1 0 ERBB4
+## 2 7.88e- 3 1.71 0.3 0.114 1.00e+ 0 0 PDZRN3
+## 3 2.30e-22 2.43 0.806 0 3.37e-18 1 DLK1
+## 4 1.94e-18 2.47 0.917 0.202 2.84e-14 1 BCAT1
+## 5 2.91e-14 2.23 0.882 0.292 4.26e-10 2 MEF2C
+## 6 6.21e- 7 1.86 0.588 0.229 9.10e- 3 2 LINC00643
+top10 <- sce.markers %>% group_by(cluster) %>% top_n(10, avg_logFC)
+可以单独可视化这些标志基因。
+# setting slim.col.label to TRUE will print just the cluster IDS instead of# every cell name
+DoHeatmap(object = sce_seurat,
+ genes.use = top10$gene,
+ slim.col.label = TRUE,
+ remove.key = TRUE)
+
+在scater
包,可视化展示部分, kn就是聚类数,就能看到标志基因了。
热图: 比较先验分类和SC3的聚类的一致性
+sc3_plot_consensus(sce, k = kn, show_pdata = c("Biological_Condition",sc3_cluster))
+
+展示表达量信息
+sc3_plot_expression(sce, k = kn, show_pdata = c("Biological_Condition",sc3_cluster))
+
+展示可能的标记基因
+sc3_plot_markers(sce, k = kn, show_pdata = c("Biological_Condition",sc3_cluster))
+
+在PCA上展示SC3的聚类结果
+plotPCA(sce, colour_by = sc3_cluster )
+
+# sc3_interactive(sce)
+在monocle
包,应该是没有找标志基因的函数,但是有推断差异基因的函数,而且它多一个功能,就是进行发育轨迹的推断。 推断发育轨迹才是monocle的拿手好戏,也是它荣升为3大R包的核心竞争力。
第一步: 挑选合适的基因. 有多个方法,例如提供已知的基因集,这里选取统计学显著的差异基因列表
+Sys.time()
+## [1] "2019-03-21 17:37:06 CST"
+cds=sce_monocle
+diff_test_res <- differentialGeneTest(cds,
+ fullModelFormulaStr = "~Biological_Condition")
+Sys.time()
+## [1] "2019-03-21 17:39:49 CST"
+# 可以看到运行耗时
+
+# Select genes that are significant at an FDR < 10%
+sig_genes <- subset(diff_test_res, qval < 0.1)
+head(sig_genes[,c("gene_short_name", "pval", "qval")] )
+## gene_short_name pval qval
+## A1BG A1BG 4.112065e-04 1.460722e-03
+## A2M A2M 4.251744e-08 4.266086e-07
+## AACS AACS 2.881832e-03 8.275761e-03
+## AADAT AADAT 1.069794e-02 2.621123e-02
+## AAGAB AAGAB 1.156771e-07 1.021331e-06
+## AAMP AAMP 7.626789e-05 3.243869e-04
+ordering_genes <- row.names (subset(diff_test_res, qval < 0.01))
+cds <- setOrderingFilter(cds, ordering_genes)
+plot_ordering_genes(cds)
+
+第二步: 降维。降维的目的是为了更好的展示数据。函数里提供了很多种方法, 不同方法的最后展示的图都不太一样, 其中“DDRTree”是Monocle2使用的默认方法
+cds <- reduceDimension(cds, max_components = 2,
+ method = 'DDRTree')
+第三步: 对细胞进行排序
+cds <- orderCells(cds)
+最后两个可视化函数,对结果进行可视化
+plot_cell_trajectory(cds, color_by = "Biological_Condition")
+
+可以很明显看到细胞的发育轨迹,正好对应 pluripotent stem cells 分化而成的 neural progenitor cells (“NPC”) ,还有 “GW16” and “GW21” ,“GW21+3” 这时间进展的孕期细胞。
+plot_genes_in_pseudotime
可以展现marker基因
最开始挑选合适基因,除了我们演示的找统计学显著的差异表达基因这个方法外,还可以于已知的标记基因,主要是基于生物学背景知识。
+如果是已知基因列表,就需要自己读取外包文件,导入R里面来分析。
+只需要挑选前面步骤分类好的细胞,去表达矩阵里面进行筛选细胞,重新走一遍上面的流程即可。
+首先是:seurat总结
+counts
矩阵进来后被包装为对象,方便操作。
然后一定要经过 NormalizeData
和 ScaleData
的操作
函数 FindVariableGenes
可以挑选适合进行下游分析的基因集。
函数 RunPCA
和 RunTSNE
进行降维
函数 FindClusters
直接就分群了,非常方便 函数 FindAllMarkers
可以对分群后各个亚群找标志基因。
函数 FeaturePlot
可以展示不同基因在所有细胞的表达量 函数 VlnPlot
可以展示不同基因在不同分群的表达量差异情况 函数 DoHeatmap
可以选定基因集后绘制热图
library(M3Drop)
+Normalized_data <- M3DropCleanData(counts,
+ labels = sample_ann$Biological_Condition ,
+ is.counts=TRUE, min_detected_genes=2000)
+dim(Normalized_data$data)
+## [1] 13958 124
+length(Normalized_data$labels)
+## [1] 124
+class(Normalized_data)
+## [1] "list"
+str(Normalized_data)
+## List of 2
+## $ data : num [1:13958, 1:124] 0 0 0.486 0 0.486 ...
+## ..- attr(*, "dimnames")=List of 2
+## .. ..$ : chr [1:13958] "A1BG" "A2M" "A2ML1" "AAAS" ...
+## .. ..$ : chr [1:124] "SRR1275356" "SRR1274090" "SRR1275251" "SRR1275287" ...
+## $ labels: chr [1:124] "GW16" "NPC" "GW16" "GW21+3" ...
+这个包设计比较简单,并没有构建S4对象,只是一个简单的list而已。
+需要深入读该文章,了解其算法,这里略过,总之它对单细胞转录组的表达矩阵进行了一系列的统计检验。
+fits <- M3DropDropoutModels(Normalized_data$data)
+
+# Sum absolute residuals
+data.frame(MM=fits$MMFit$SAr, Logistic=fits$LogiFit$SAr,
+ DoubleExpo=fits$ExpoFit$SAr)
+## MM Logistic DoubleExpo
+## 1 1607 1625 4075
+# Sum squared residuals
+data.frame(MM=fits$MMFit$SSr, Logistic=fits$LogiFit$SSr,
+ DoubleExpo=fits$ExpoFit$SSr)
+## MM Logistic DoubleExpo
+## 1 378 332 1989
+DE_genes <- M3DropDifferentialExpression(Normalized_data$data,
+ mt_method="fdr", mt_threshold=0.01)
+
+dim(DE_genes)
+## [1] 263 3
+head(DE_genes)
+## Gene p.value q.value
+## ABCA1 ABCA1 1.381813e-04 0.0080700163
+## ABCE1 ABCE1 1.956343e-05 0.0017964893
+## ABLIM1 ABLIM1 7.353186e-06 0.0009082812
+## ACAT2 ACAT2 1.886555e-05 0.0017438766
+## ACVR1 ACVR1 1.259054e-05 0.0012970374
+## AMER2 AMER2 5.698985e-06 0.0007231494
+这里是针对上面的统计结果来的
+par(mar=c(1,1,1,1))
+heat_out <- M3DropExpressionHeatmap(DE_genes$Gene, Normalized_data$data,
+ cell_labels = Normalized_data$labels)
+
+可视化了解一下找到的差异基因在不同的细胞类型的表达分布情况。
+这里可以重新聚类后,针对自己找到的类别来分别找marker基因,不需要使用测试数据自带的表型信息。
+cell_populations <- M3DropGetHeatmapCellClusters(heat_out, k=4)
+library("ROCR")
+marker_genes <- M3DropGetMarkers(Normalized_data$data, cell_populations)
+table(cell_populations,Normalized_data$labels)
+##
+## cell_populations GW16 GW21 GW21+3 NPC
+## 1 0 0 0 30
+## 2 23 6 8 0
+## 3 21 10 0 0
+## 4 2 0 24 0
+head(marker_genes[marker_genes$Group==4,],20)
+## AUC Group pval
+## MEF2C 0.9701727 4 1.155536e-15
+## KIAA1598 0.9638932 4 3.177906e-14
+## CALM1 0.9595761 4 6.736008e-13
+## DPYSL3 0.9160126 4 7.070075e-11
+## STMN2 0.9156201 4 5.197202e-11
+## ADCY1 0.9150314 4 2.830176e-12
+## DLG2 0.9085557 4 8.154496e-13
+## MLLT11 0.9054160 4 2.279087e-10
+## MEG3 0.9034537 4 6.217989e-11
+## ZNF286B 0.9032575 4 2.403252e-10
+## ETNK1 0.8975667 4 4.993506e-10
+## CLASP2 0.8869702 4 9.814153e-10
+## RTN1 0.8861852 4 1.097743e-09
+## PHACTR3 0.8857928 4 2.491380e-12
+## BHLHE22 0.8826531 4 5.787225e-13
+## KIFAP3 0.8802983 4 3.942177e-10
+## MIR100HG 0.8795133 4 2.732201e-09
+## GRIN2B 0.8779435 4 1.667996e-15
+## GRIA2 0.8763736 4 4.161204e-10
+## KIF3C 0.8748038 4 4.317676e-10
+marker_genes[rownames(marker_genes)=="FOS",]
+## AUC Group pval
+## FOS 0.832712 2 2.99323e-09
+也可以针对这些 marker genes去画热图,当然,得根据AUC和P值来挑选符合要求的差异基因去绘图。
+par(mar=c(1,1,1,1))
+choosed_marker_genes=as.character(unlist(lapply(
+ split(marker_genes,marker_genes$Group),
+ function(x) (rownames(head(x,20)))
+ )))
+heat_out <- M3DropExpressionHeatmap(choosed_marker_genes,
+ Normalized_data$data,
+ cell_labels = cell_populations)
+
+如果遇到Error in plot.new() : figure margins too large
报错,则单独将heat_out
这行命令复制出来运行
通常是GO/KEGG等数据库,通常是超几何分布,GSEA,GSVA等算法。
+拿到基因集后走我GitHub代码即可:https://github.com/jmzeng1314/GEO 简单的例子如下:
+library(ggplot2)
+library(clusterProfiler)
+library(org.Hs.eg.db)
+# 下面的 gene_up 是一个 entrez ID的向量,约 500左右的 自定义的基因集
+## 下面的 gene_all 也是一个 entrez ID的向量,约10000左右的背景基因,就我们的scRNA检测到的全部基因。
+ ### over-representation test
+ kk.up <- enrichKEGG(gene = gene_up,
+ organism = 'hsa',
+ universe = gene_all,
+ pvalueCutoff = 0.9,
+ qvalueCutoff =0.9)
+ head(kk.up)[,1:6]
+ dotplot(kk.up );ggsave('kk.up.dotplot.png')
+包括:
+不一一讲解,具体有需求,就仔细研读说明书,其实最后都是R语言熟练与否。
+sessionInfo()
+## R version 3.5.2 (2018-12-20)
+## Platform: x86_64-apple-darwin15.6.0 (64-bit)
+## Running under: macOS Mojave 10.14.2
+##
+## Matrix products: default
+## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
+## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
+##
+## locale:
+## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
+##
+## attached base packages:
+## [1] splines parallel stats4 stats graphics grDevices utils
+## [8] datasets methods base
+##
+## other attached packages:
+## [1] ROCR_1.0-7 gplots_3.0.1
+## [3] bindrcpp_0.2.2 dplyr_0.7.6
+## [5] ggpubr_0.1.8 magrittr_1.5
+## [7] M3Drop_1.8.1 numDeriv_2016.8-1
+## [9] SC3_1.10.1 scRNAseq_1.8.0
+## [11] monocle_2.10.1 DDRTree_0.1.5
+## [13] irlba_2.3.2 VGAM_1.0-6
+## [15] scater_1.10.0 SingleCellExperiment_1.4.0
+## [17] SummarizedExperiment_1.12.0 DelayedArray_0.8.0
+## [19] BiocParallel_1.14.2 matrixStats_0.54.0
+## [21] Biobase_2.40.0 GenomicRanges_1.34.0
+## [23] GenomeInfoDb_1.18.1 IRanges_2.16.0
+## [25] S4Vectors_0.20.1 BiocGenerics_0.28.0
+## [27] Seurat_2.3.4 Matrix_1.2-15
+## [29] cowplot_0.9.3 ggplot2_3.0.0
+##
+## loaded via a namespace (and not attached):
+## [1] ggthemes_4.0.1 prabclus_2.2-6
+## [3] R.methodsS3_1.7.1 pkgmaker_0.27
+## [5] tidyr_0.8.1 acepack_1.4.1
+## [7] bit64_0.9-7 knitr_1.20
+## [9] R.utils_2.6.0 data.table_1.11.4
+## [11] rpart_4.1-13 RCurl_1.95-4.11
+## [13] doParallel_1.0.14 metap_1.0
+## [15] snow_0.4-2 RANN_2.6.1
+## [17] combinat_0.0-8 proxy_0.4-22
+## [19] bit_1.1-14 httpuv_1.4.5
+## [21] assertthat_0.2.0 viridis_0.5.1
+## [23] hms_0.4.2 evaluate_0.11
+## [25] promises_1.0.1 fansi_0.3.0
+## [27] DEoptimR_1.0-8 caTools_1.17.1.1
+## [29] readxl_1.1.0 igraph_1.2.2
+## [31] htmlwidgets_1.2 sparsesvd_0.1-4
+## [33] purrr_0.2.5 crosstalk_1.0.0
+## [35] backports_1.1.2 trimcluster_0.1-2.1
+## [37] gbRd_0.4-11 TTR_0.23-4
+## [39] abind_1.4-5 RcppEigen_0.3.3.4.0
+## [41] withr_2.1.2 robustbase_0.93-3
+## [43] checkmate_1.8.5 vcd_1.4-4
+## [45] xts_0.11-2 mclust_5.4.1
+## [47] cluster_2.0.7-1 ape_5.2
+## [49] segmented_0.5-3.0 lazyeval_0.2.1
+## [51] laeken_0.5.0 crayon_1.3.4
+## [53] hdf5r_1.0.1 pkgconfig_2.0.2
+## [55] slam_0.1-44 labeling_0.3
+## [57] nlme_3.1-137 vipor_0.4.5
+## [59] nnet_7.3-12 bindr_0.1.1
+## [61] rlang_0.2.2 diptest_0.75-7
+## [63] registry_0.5 doSNOW_1.0.16
+## [65] cellranger_1.1.0 rprojroot_1.3-2
+## [67] lmtest_0.9-36 rngtools_1.3.1
+## [69] carData_3.0-1 Rhdf5lib_1.4.2
+## [71] boot_1.3-20 zoo_1.8-3
+## [73] base64enc_0.1-3 beeswarm_0.2.3
+## [75] ggridges_0.5.0 pheatmap_1.0.10
+## [77] png_0.1-7 viridisLite_0.3.0
+## [79] bitops_1.0-6 R.oo_1.22.0
+## [81] KernSmooth_2.23-15 DelayedMatrixStats_1.4.0
+## [83] doRNG_1.7.1 lars_1.2
+## [85] stringr_1.3.1 scales_1.0.0
+## [87] plyr_1.8.4 ica_1.0-2
+## [89] bibtex_0.4.2 gdata_2.18.0
+## [91] zlibbioc_1.26.0 compiler_3.5.2
+## [93] HSMMSingleCell_1.2.0 lsei_1.2-0
+## [95] bbmle_1.0.20 RColorBrewer_1.1-2
+## [97] rrcov_1.4-4 fitdistrplus_1.0-11
+## [99] cli_1.0.0 dtw_1.20-1
+## [101] XVector_0.22.0 pbapply_1.3-4
+## [103] htmlTable_1.12 Formula_1.2-3
+## [105] MASS_7.3-51.1 mgcv_1.8-26
+## [107] tidyselect_0.2.4 stringi_1.2.4
+## [109] forcats_0.3.0 densityClust_0.3
+## [111] yaml_2.2.0 latticeExtra_0.6-28
+## [113] ggrepel_0.8.0 grid_3.5.2
+## [115] tools_3.5.2 rio_0.5.10
+## [117] rstudioapi_0.7 foreach_1.4.4
+## [119] foreign_0.8-71 gridExtra_2.3
+## [121] smoother_1.1 scatterplot3d_0.3-41
+## [123] Rtsne_0.15 digest_0.6.18
+## [125] BiocManager_1.30.3 FNN_1.1.2.2
+## [127] shiny_1.1.0 qlcMatrix_0.9.7
+## [129] fpc_2.1-11.1 Rcpp_1.0.0
+## [131] car_3.0-2 SDMTools_1.1-221
+## [133] later_0.7.4 WriteXLS_4.1.0
+## [135] httr_1.3.1 npsurv_0.4-0
+## [137] kernlab_0.9-27 Rdpack_0.10-1
+## [139] colorspace_1.3-2 ranger_0.11.1
+## [141] reticulate_1.10 statmod_1.4.30
+## [143] sp_1.3-1 flexmix_2.3-14
+## [145] xtable_1.8-3 jsonlite_1.5
+## [147] modeltools_0.2-22 destiny_2.12.0
+## [149] R6_2.2.2 Hmisc_4.1-1
+## [151] pillar_1.3.0 htmltools_0.3.6
+## [153] mime_0.5 glue_1.3.0
+## [155] VIM_4.8.0 DT_0.4
+## [157] class_7.3-14 codetools_0.2-15
+## [159] utf8_1.1.4 tsne_0.1-3
+## [161] pcaPP_1.9-73 mvtnorm_1.0-8
+## [163] lattice_0.20-38 tibble_1.4.2
+## [165] mixtools_1.1.0 curl_3.2
+## [167] ggbeeswarm_0.6.0 gtools_3.8.1
+## [169] zip_1.0.0 openxlsx_4.1.0
+## [171] survival_2.43-3 limma_3.36.3
+## [173] rmarkdown_1.10 docopt_0.6.1
+## [175] fastICA_1.2-1 munsell_0.5.0
+## [177] e1071_1.7-0 rhdf5_2.26.1
+## [179] GenomeInfoDbData_1.1.0 iterators_1.0.10
+## [181] HDF5Array_1.10.1 haven_1.1.2
+## [183] reshape2_1.4.3 gtable_0.2.0
+