This is documents shows the quality control of dataset scRNA-seq data from human glioblastoma cells and gliomasphere cell lines.
Load packages
library('ggplot2')
library('rtracklayer')
require(gplots)
library(plyr)
library(reshape2)
library(scater)
library(knitr)
options(stringsAsFactors = FALSE)
Load annotation
gtf_gencode <- readGFF("~/Documents/HCA/reference/gencode.v27.primary_assembly.annotation.gtf", version=2L, tags = c("gene_name","gene_id", "transcript_id","gene_type"))
genes<-subset(gtf_gencode,gtf_gencode$type == "gene")
Load dataset. All outputs of pipeline,include metrics and quantification have been saved in a Rdata.
load('hisat2_outputs_bundle.RData')
Sequencing Data QC
General quality control with sequencing data
Basecall Quality Distribution
The mean quality value across each base position in the read.
m.qual<-melt(hisat2.basequality,id="Bin")
p<-ggplot(m.qual,aes(x=Bin,y=value,group=variable,color=value))+geom_point()+geom_line()
p<-p+ scale_colour_gradientn(colours = rainbow(10))
pngfile<-paste('./images/base_quality_50bp_hisat2.png',sep='')
myimages<-pngfile
ggsave(plot=p,file=pngfile,type='cairo-png',width=10,height=10)
include_graphics(myimages)
Basecall Distribution
The proportion of each base position for which each of the four normal DNA bases and a missing base has been called.
colnames(hisat2.basecall)
[1] "sraID" "READ_END" "CYCLE" "PCT_A" "PCT_C" "PCT_G" "PCT_T" "PCT_N"
b50<-names(which(table(hisat2.basecall$sraID)==50))
dd<-subset(hisat2.basecall,hisat2.basecall$sraID %in% b50)
m.bc<-melt(dd,id=c('sraID','READ_END','CYCLE'))
p<-ggplot(m.bc,aes(x=CYCLE,y=value,group=interaction(sraID,READ_END),color=variable))+geom_line()+facet_grid(variable~.)
pngfile<-paste('./images/basecall_50bp_hisat2.png',sep='')
myimages<-pngfile
ggsave(plot=p,file=pngfile,type='cairo-png',width=10,height=10)
include_graphics(myimages)
Alignment Statistics
Two Picard metrics provide main alignment QC metrics, MarkDuplication
and CollectAlignmentMetrics
.
colnames(hisat2.aln)
colnames(hisat2.dup)
Alignment Bias within Pairs
Compare alignment rate of R1
and R2
r1<-subset(hisat2.aln,hisat2.aln$CATEGORY == "FIRST_OF_PAIR")
r2<-subset(hisat2.aln,hisat2.aln$CATEGORY=="SECOND_OF_PAIR")
dd<-data.frame('sraID'=r1$sraID,'R1'=r1$PCT_PF_READS_ALIGNED,'R2'=r2$PCT_PF_READS_ALIGNED)
mlist<-match(dd$sraID,metadata$Run_s)
metadata<-metadata[mlist,] ## match metadat with metrics file
p<-ggplot(dd,aes(x=R1,y=R2,color=as.factor(metadata$AvgSpotLen_l)))+geom_point()
p<-p+geom_abline(intercept=0, slope=1,color='blue')
pngfile<-paste('./images/hisat2_R1_R2_PCT_PF_READS_ALIGNED.png',sep='')
ggsave(plot=p,file=pngfile,type='cairo-png',width=10,height=10)
include_graphics(pngfile)
Aligned, multiple aligned, unmapped
Aligned vs unmapped.
unq.rds<-as.numeric(hisat2.logs[3,-1])*2+as.numeric(hisat2.logs[8,-1])+as.numeric(hisat2.logs[5,-1])*2
mult.rds<-as.numeric(hisat2.logs[4,-1])*2+as.numeric(hisat2.logs[9,-1])
aln.rds<-unq.rds+mult.rds
unq.pct<-unq.rds/aln.rds
mult.pct<-mult.rds/aln.rds
dd<-data.frame('sraID'=colnames(hisat2.logs)[-1],'multi-mapping'=mult.pct,'uniq-mapping'=unq.pct)
mdd<-melt(dd)
Using sraID as id variables
p<-ggplot(mdd, aes(x = sraID,y = value)) + geom_area(aes(fill=variable,group=variable), position='stack')
pngfile<-paste('./images/hisat2_aln_secondary_primary_PCT_Stacking.png',sep='')
ggsave(plot=p,file=pngfile,type='cairo-png',width=10,height=10)
include_graphics(pngfile)
Uniquely vs multiple mapping reads
unq.rds<-as.numeric(hisat2.logs[3,-1])*2+as.numeric(hisat2.logs[8,-1])+as.numeric(hisat2.logs[5,-1])*2
mult.rds<-as.numeric(hisat2.logs[4,-1])*2+as.numeric(hisat2.logs[9,-1])
aln.rds<-unq.rds+mult.rds
unq.pct<-unq.rds/aln.rds
mult.pct<-mult.rds/aln.rds
dd<-data.frame('sraID'=colnames(hisat2.logs)[-1],'multi-mapping'=mult.pct,'uniq-mapping'=unq.pct)
mdd<-melt(dd)
Using sraID as id variables
p<-ggplot(mdd, aes(x = sraID,y = value)) + geom_area(aes(fill=variable,group=variable), position='stack')
pngfile<-paste('./images/hisat2_aln_secondary_primary_PCT_Stacking.png',sep='')
ggsave(plot=p,file=pngfile,type='cairo-png',width=10,height=10)
include_graphics(pngfile)
RNA QC metrics
RNA metrics
colnames(hisat2.rna)
We report PCT_RIBOSOMAL_BASES
,PCT_CODING_BASES
,PCT_UTR_BASES
,PCT_INTRONIC_BASES
and PCT_INTERGENIC_BASES
##extract metrics
rna<-hisat2.rna[,c('sraID','PCT_INTERGENIC_BASES','PCT_INTRONIC_BASES','PCT_UTR_BASES','PCT_CODING_BASES','PCT_RIBOSOMAL_BASES')]
rna.m<-melt(rna)
Using sraID as id variables
p<-ggplot(rna.m, aes(x = sraID,y = value)) + geom_area(aes(fill=variable,group = variable), position='stack')
pngfile<-paste('./images/hisat2_RNA_PCT_Stacking.png',sep='')
ggsave(plot=p,file=pngfile,type='cairo-png',width=10,height=10)
include_graphics(pngfile)
Another metrics we would like to show is the All_Reads.normalized_coverage
alone transcripts and normalized by GC-content. Picard first bin transcripts from 3` to 5’ and then reports this metrics by calculating reads counts in each bin along transcripts
cov.m<-melt(hisat2.coverage,id='Bin')
mlist<-match(cov.m$variable,metadata$Run_s)
mdata<-metadata$source_name_s[mlist]
cov.m<-data.frame(cov.m,'cell'=mdata)
p<-ggplot(cov.m,aes(x=Bin,y=value,group=interaction(variable,cell),color=cell))+geom_line()+facet_grid(cell~.)+scale_y_log10()
pngfile<-paste('./images/hisat2_RNA_coverage.png',sep='')
ggsave(plot=p,file=pngfile,type='cairo-png',width=10,height=10)
include_graphics(pngfile)
Quantification QC
We can define detected genes
by require the reads count of the gene to be at least >5
##dim(hisat2rsem.cnt)
cnt.dd<-hisat2rsem.cnt[,-1]
## total reads aln
cnt.tot<-apply(cnt.dd,2,sum)
## total gene reads count
cnt.gene<-apply(cnt.dd,2,function(x){sum(x>5)})
## mt genes
x<-subset(cnt.dd,hisat2rsem.cnt$ensID %in% mt.genes$gene_id)
##mt gene read counts
cnt.mt<-apply(x,2,sum)
## create df
dd<-data.frame('sraID'=colnames(dd),'readCounts'=cnt.tot,'mt'=cnt.mt,'detectedGenes'=cnt.gene)
Reads Counts saturation plots
p<-ggplot(dd,aes(x=readCounts,y=detectedGenes,color=detectedGenes))+geom_point()+scale_x_log10()+scale_y_log10()
pngfile<-paste('./images/hisat2_gene_count_saturation.png',sep='')
ggsave(plot=p,file=pngfile,type='cairo-png',width=10,height=10)
include_graphics(pngfile)
mt gene coverage
p<-ggplot(dd,aes(x=cnt.mt/readCounts))+geom_histogram()+xlab('% in MT genes')
pngfile<-paste('./images/hisat2_mt_gene_count.png',sep='')
ggsave(plot=p,file=pngfile,type='cairo-png',width=10,height=10)
include_graphics(pngfile)
---
title: "scRNA-Seq QC Report"
output: html_notebook
---

This is documents shows the quality control of dataset scRNA-seq [data](https://www.ncbi.nlm.nih.gov/bioproject/PRJNA248302) from human glioblastoma cells and gliomasphere cell lines.

Load packages

```{r}
library('ggplot2')
library('rtracklayer')
require(gplots) 
library(plyr)
library(reshape2)
library(scater)
library(knitr)
options(stringsAsFactors = FALSE)
```
Load annotation
```{r}
gtf_gencode <- readGFF("~/Documents/HCA/reference/gencode.v27.primary_assembly.annotation.gtf", version=2L, tags = c("gene_name","gene_id", "transcript_id","gene_type"))
genes<-subset(gtf_gencode,gtf_gencode$type == "gene")
mt.genes<-subset(genes,genes$gene_type %in% c('Mt_tRNA','Mt_rRNA'))
```
Load dataset. All outputs of pipeline,include metrics and quantification have been saved in a Rdata.
```{r}
load('hisat2_outputs_bundle.RData')
```
# Sequencing Data QC
General quality control with sequencing data

## Basecall Quality Distribution
The mean quality value across each base position in the read. 
```{r, echo =TRUE, out.width="30%"}
m.qual<-melt(hisat2.basequality,id="Bin")
p<-ggplot(m.qual,aes(x=Bin,y=value,group=variable,color=value))+geom_point()+geom_line()
p<-p+  scale_colour_gradientn(colours = rainbow(10))
pngfile<-paste('./images/base_quality_50bp_hisat2.png',sep='')
myimages<-pngfile
ggsave(plot=p,file=pngfile,type='cairo-png',width=10,height=10)
include_graphics(myimages)
```


## Basecall Distribution
The proportion of each base position for which each of the four normal DNA bases and a missing base has been called. 
```{r,echo =TRUE, out.width="30%"}
colnames(hisat2.basecall)
b50<-names(which(table(hisat2.basecall$sraID)==50))
dd<-subset(hisat2.basecall,hisat2.basecall$sraID %in% b50)
m.bc<-melt(dd,id=c('sraID','READ_END','CYCLE'))
p<-ggplot(m.bc,aes(x=CYCLE,y=value,group=interaction(sraID,READ_END),color=variable))+geom_line()+facet_grid(variable~.)
pngfile<-paste('./images/basecall_50bp_hisat2.png',sep='')
myimages<-pngfile
ggsave(plot=p,file=pngfile,type='cairo-png',width=10,height=10)
include_graphics(myimages)
```
# Alignment Statistics
Two Picard metrics provide main alignment QC metrics, `MarkDuplication` and `CollectAlignmentMetrics`.
```{r}
colnames(hisat2.aln)
colnames(hisat2.dup)
```
## Alignment Bias within Pairs
Compare alignment rate of `R1` and `R2`
```{r,echo =TRUE, out.width="30%"}
r1<-subset(hisat2.aln,hisat2.aln$CATEGORY == "FIRST_OF_PAIR")
r2<-subset(hisat2.aln,hisat2.aln$CATEGORY=="SECOND_OF_PAIR")
dd<-data.frame('sraID'=r1$sraID,'R1'=r1$PCT_PF_READS_ALIGNED,'R2'=r2$PCT_PF_READS_ALIGNED)
mlist<-match(dd$sraID,metadata$Run_s)
metadata<-metadata[mlist,] ## match metadat with metrics file
p<-ggplot(dd,aes(x=R1,y=R2,color=as.factor(metadata$AvgSpotLen_l)))+geom_point()
p<-p+geom_abline(intercept=0, slope=1,color='blue')
pngfile<-paste('./images/hisat2_R1_R2_PCT_PF_READS_ALIGNED.png',sep='')
ggsave(plot=p,file=pngfile,type='cairo-png',width=10,height=10)
include_graphics(pngfile)
```
## Aligned, multiple aligned, unmapped
Aligned vs unmapped. 
```{r}
##extract metrics
x1<-subset(hisat2.aln,hisat2.aln$CATEGORY == "PAIR")
dd<-data.frame('sraID'=x1$sraID,'UNMAPPED'=hisat2.dup$UNMAPPED_READS/x1$TOTAL_READS,'MAPPED'=(x1$TOTAL_READS-hisat2.dup$UNMAPPED_READS)/x1$TOTAL_READS)
aln.m<-melt(dd)
p<-ggplot(aln.m, aes(x = sraID,y = value)) + geom_area(aes(fill=variable,group = variable), position='stack')
pngfile<-paste('./images/hisat2_aln_mapped_unmapped_PCT_Stacking.png',sep='')
ggsave(plot=p,file=pngfile,type='cairo-png',width=10,height=10)
include_graphics(pngfile)
```
Uniquely vs multiple mapping reads
```{r}
unq.rds<-as.numeric(hisat2.logs[3,-1])*2+as.numeric(hisat2.logs[8,-1])+as.numeric(hisat2.logs[5,-1])*2
mult.rds<-as.numeric(hisat2.logs[4,-1])*2+as.numeric(hisat2.logs[9,-1])
aln.rds<-unq.rds+mult.rds
unq.pct<-unq.rds/aln.rds
mult.pct<-mult.rds/aln.rds
dd<-data.frame('sraID'=colnames(hisat2.logs)[-1],'multi-mapping'=mult.pct,'uniq-mapping'=unq.pct)
mdd<-melt(dd)
p<-ggplot(mdd, aes(x = sraID,y = value)) + geom_area(aes(fill=variable,group=variable), position='stack')
pngfile<-paste('./images/hisat2_aln_secondary_primary_PCT_Stacking.png',sep='')
ggsave(plot=p,file=pngfile,type='cairo-png',width=10,height=10)
include_graphics(pngfile)
```
# RNA QC metrics
RNA metrics
```{r}
colnames(hisat2.rna)
```
We report `PCT_RIBOSOMAL_BASES`,`PCT_CODING_BASES`,`PCT_UTR_BASES`,`PCT_INTRONIC_BASES` and `PCT_INTERGENIC_BASES`

```{r,echo =TRUE, out.width="30%"}
##extract metrics
rna<-hisat2.rna[,c('sraID','PCT_INTERGENIC_BASES','PCT_INTRONIC_BASES','PCT_UTR_BASES','PCT_CODING_BASES','PCT_RIBOSOMAL_BASES')]
rna.m<-melt(rna)
p<-ggplot(rna.m, aes(x = sraID,y = value)) + geom_area(aes(fill=variable,group = variable), position='stack')
pngfile<-paste('./images/hisat2_RNA_PCT_Stacking.png',sep='')
ggsave(plot=p,file=pngfile,type='cairo-png',width=10,height=10)
include_graphics(pngfile)
```
Another metrics we would like to show is the `All_Reads.normalized_coverage` alone transcripts and normalized by GC-content. Picard first bin transcripts from 3` to 5' and then reports this metrics by calculating reads counts in each bin along transcripts
```{r,echo =TRUE, out.width="30%"}
cov.m<-melt(hisat2.coverage,id='Bin')
mlist<-match(cov.m$variable,metadata$Run_s)
mdata<-metadata$source_name_s[mlist]
cov.m<-data.frame(cov.m,'cell'=mdata)
p<-ggplot(cov.m,aes(x=Bin,y=value,group=interaction(variable,cell),color=cell))+geom_line()+facet_grid(cell~.)+scale_y_log10()
pngfile<-paste('./images/hisat2_RNA_coverage.png',sep='')
ggsave(plot=p,file=pngfile,type='cairo-png',width=10,height=10)
include_graphics(pngfile)

```

# Quantification QC
We can define `detected genes` by require the reads count of the gene to be at least `>5`

```{r}
##dim(hisat2rsem.cnt)
cnt.dd<-hisat2rsem.cnt[,-1]
## total reads aln
cnt.tot<-apply(cnt.dd,2,sum)
## total gene reads count
cnt.gene<-apply(cnt.dd,2,function(x){sum(x>5)})
## mt genes
x<-subset(cnt.dd,hisat2rsem.cnt$ensID %in% mt.genes$gene_id)
##mt gene read counts
cnt.mt<-apply(x,2,sum)
## create df
dd<-data.frame('sraID'=colnames(dd),'readCounts'=cnt.tot,'mt'=cnt.mt,'detectedGenes'=cnt.gene)

```
## Reads Counts saturation plots
```{r}
p<-ggplot(dd,aes(x=readCounts,y=detectedGenes,color=detectedGenes))+geom_point()+scale_x_log10()+scale_y_log10()
pngfile<-paste('./images/hisat2_gene_count_saturation.png',sep='')
ggsave(plot=p,file=pngfile,type='cairo-png',width=10,height=10)
include_graphics(pngfile)
```
## mt gene coverage

```{r}
p<-ggplot(dd,aes(x=cnt.mt/readCounts))+geom_histogram()+xlab('% in MT genes')
pngfile<-paste('./images/hisat2_mt_gene_count.png',sep='')
ggsave(plot=p,file=pngfile,type='cairo-png',width=10,height=10)
include_graphics(pngfile)
```
