-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMicroarray_GE_Analysis.R
369 lines (285 loc) · 14 KB
/
Microarray_GE_Analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
setwd("~/FGT/ICA")
# Affymetrix Microarray Analysis Basic (Skeleton) Workflow
##Load the required libraries & load the files for the workflow
library(limma)
library(affy)
library(annotate)
library(mouse4302.db)# load chip-specific annotation
library(simpleaffy)
library(ggrepel)
library(scatterplot3d)
library(SummarizedExperiment)
library(org.Hs.eg.db)
## Load the main data- commented code is just for information
##system("tar -xvf GSE23556_RAW.tar")
#####system("cp /shared_files/targets.txt .")
###### 1. LOADING CELL FILES #######################################################################################
# Load the target file into a table object
adf <-read.table("targets.txt",header=TRUE,as.is=TRUE)
# Load the expression values of all CEL files in the R working directory
raw_data <- ReadAffy()
# rename columns
sampleNames(raw_data) <- adf$Name
# View a summary of the example data
raw_data
###### 2. QUALITY CONTROL #######################################################################################
### 2.1 Access Control Data (Added code)
control_idx <-grep("AFFX", rownames(raw_data))
control_idx
## 2.2 Build Quality Control Plots
colours <- c(rep("yellow",3),rep("red",3))
#QC Plot (Added code)
x.mas <- call.exprs(raw_data,"mas5")
qcs <-qc(raw_data,x.mas)
png("qc_plot.png")
plot(qcs, xlab="Fold Change", ylab="Array")
dev.off()
#Quality control histogram
png("quality_control_hist.png")
hist(raw_data, main="Quality control of raw data")
dev.off()
# Boxplot with different colour for the two sample group
png("quality_control_box.png")
boxplot(raw_data, col = adf$Colour, las=2, main="Quality control of raw data", ylab="Log Intensity", xlab="Samples")
dev.off()
#Added code
#Build presence/ abscence call table
calls <-mas5calls(raw_data)
calls <- exprs(calls)
absent <- colSums(calls == 'A')/ nrow(calls)
present <-colSums(calls=='P')/ nrow(calls)
quality_control <- data.frame(absent, present)
write.table(quality_control,"quality_control.txt")
###### 3. DATA NORMALISATION #######################################################################################
## Normalise the data using RMA
normalised_data <- rma(raw_data)
normalised_data
# To obtain a matrix of the expression values, use exprs()
exp_values <- exprs(normalised_data)
## Control Check - Plot Normalised Data
# Boxplot to observe the results of normalisation
# Notice differences with the boxplot from the raw data
png("data_normalisation.png")
boxplot(exp_values, col=colours,las=2, main="Quality Control After Normalisation", ylab="Log Intensity", xlab="Samples")
dev.off()
# MA plot of all samples
png("MA_normalised.png")
mva.pairs(exp_values, cex=0.9, main = "MA Plot for Normalised Data")
dev.off()
# The same plot for the non-normalised raw data
png("MA_non_normalised.png")
mva.pairs(pm(raw_data), cex=0.9, main = "MA Plot for Non-Normalised Data")
dev.off()
#process the expression data
#save the data for use in the shiny app
variances <- apply(exp_values, 1, var)
#Rank 20 most expressed genes by highest variance
expression <- exprsvals[order(variances, decreasing=TRUE)[1:20],]
gene_names <- tmp$Symbol[match(rownames(expression), tmp$ID)]
rownames(expression)[! is.na(gene_names)] <- paste(gene_names[
! is.na(gene_names)], rownames(expression)[! is.na(gene_names)
], sep = ' / ')
save(expression,file="expression.Rdata")
###### 4. HIERARCHICAL CLUSTERING OF NORMALISED DATA####################################################################
# Performs hierarchical clustering with average linkage based on
# Pearsons Correlation Coefficient
hc<-hclust(as.dist(1-cor(exp_values, method="pearson")), method="average")
png("Hierarchical_plot.png")
plot(hc)
dev.off()
###### 5. PRINCIPAL COMPONENTS ANALYSIS ####################################################################
## Perform PCA of normalised data
pca <- prcomp(t(exp_values), scale=T)
# Plot the PCA results
png("PCA_plot.png")
legends <- c("SRF_control","SRF_KO")
percentVar <- round(100 * attr(pca, "percentVar"))
s3d<-scatterplot3d(pca$x[,1:3], pch=19, color=colours, main="Principal Components Analysis", xlab(paste0("PC1: ", percentVar[1], "% variance")))
legend("bottom", legend = adf$Group[1,4],
col = colours, pch = 16,
inset = -0.1, xpd = TRUE, horiz = TRUE)
s3d.coords <- s3d$xyz.convert(pca$x[,1:3])
text(s3d.coords$x, s3d.coords$y, labels = colnames(exp_values),pos = 3,offset = 0.5)
scores = as.data.frame(pca$x)
summary(pca)
dev.off()
###### 6. FOLD FILTERING ####################################################################
## Perform fold filtering - Obtaining Expression Values#################################################################
#using expression matrix obtained in step 3 a matrix of expression values
#RMA outputs log2 data while MAS5 outputs linear data
#To convert from log
exprsvals10 <-2^exp_values
#check conversion
exp_values[1:10,]
#converted
exprsvals10[1:10,]
#More fold filtering
#check order of sample names
mysamples <- sampleNames(normalised_data)
#display the list
mysamples
#it is useful to obtain a vector of ProbeIDs here
probesets <- probeNames(raw_data)
#display the first 10 ProbeSets
probesets[1:10]
#Build final fold table
#Calculate the means
#Note mean of the log is not the same as the log of the mean!!
Srf_control.mean <- apply(exprsvals10[,adf$Name[1:3]],1,mean)
Srf_KO.mean <- apply(exprsvals10[,adf$Name[4:6]],1,mean)
#calculate some fold changes
Fold_Change <-Srf_control.mean /Srf_KO.mean
#build a summary table to hold all the data
fold_data= cbind(Srf_control.mean,Srf_KO.mean,Fold_Change)
#check the column names
colnames(fold_data)
#write the table of means as an output
write.table(fold_data,file="fold_data.txt", quote=F,
sep="\t",col.names=NA)
###### 7. STATISTICAL ANALYSIS OF DIFFERENTIALLY EXPRESSED GENES####################################################################
#Check original sample order
sampleNames(normalised_data)
#Rename the samples
sampleNames(normalised_data) <-c("Srf_WT1","Srf_WT2","Srf_WT3","Srf_KO1","Srf_KO2","Srf_KO3")
#Check the samples have renamed
sampleNames(normalised_data)
##Building annotation for differential gene identification
#establish annotation for MOE430v2
#which annotation do we need
#modified from #http://gettinggeneticsdone.blogspot.co.uk/2012/01/annotating-limma-#results-with-gene.html
normalised_data@annotation
#packages in the annotation package
ls("package:mouse4302.db")
#build an annotation table
ID <- featureNames(normalised_data)
Symbol <- getSYMBOL(ID, "mouse4302.db")
Name <- as.character(lookUp(ID, "mouse4302.db", "GENENAME"))
tmp <- data.frame(ID=ID, Symbol=Symbol, Name=Name, stringsAsFactors=F)
tmp[tmp=="NA"] <- NA #fix padding with NA characters
#assign as feature data of the current normalised_data
fData(normalised_data) <- tmp
## Statistical analysis using Limma
#Build the design matrix which indicates which RNA sample has been applied to each array
design <- model.matrix(~-1+factor(c(1,1,1,2,2,2)))
colnames(design) <- c("Srf_WT","Srf_KO")
#Check it makes sense
sampleNames(normalised_data)
#output the design matrix
design
#Contrast matrix instructs Limma which comparisons to make between the RNA samples
contrastmatrix <- makeContrasts(Srf_WT-Srf_KO,levels=design)
contrastmatrix
#Based on design matrix: linear model is fitted for each gene on the array
#To model systematic part of data, so it can be distinguished from random variation
#and make the contrasts
fit <- lmFit(normalised_data, design)
#Based on contrast matrix
#Obtain coefficients and standard errors for contrasts of the coefficients of the original model
fit2 <- contrasts.fit(fit, contrastmatrix)
#this last part essentially moderates the t-statistic using
#the borrowed variance approach described in class
#To asses differential expression and compute moderated t-statistics, moderated Fstatistic, and log-odds of differential expression
fit2 <- eBayes(fit2)
#make a venn diagram
#Multiple Testing Genewise Across Contrasts
#Does not adjust for multiple testing across genes
#nested F-test approach
png("venn_diagram.png")
clas <- classifyTestsF(fit2)
vennDiagram(clas)
dev.off()
#Top differentially expressed genes according to Limma (based on adjusted p-value)
topTable(fit2,coef=1,adjust="fdr")
limma_genes <-topTable(fit2,coef=1, adjust="fdr", number=nrow(normalised_data))
write.table(limma_genes,"limma_genes.txt")
#Top 10 differentially expressed genes according to Limma (based on adjusted p-value)
limma_genes10 <- topTable(fit2,coef=1, adjust="fdr", n=10)
write.table(limma_genes10,"limma_genes10.txt", sep = ",")
#Top differentially expressed genes based on Fold change
#order[-x] as want descending order
# abs(x) as need to ignore -ve and +ve sign for fold change, only interested in absolute value
fold_change_limma10 <- limma_genes[order(-abs(limma_genes$logFC)),]
fold_change_limma10 <- head(fold_change_limma10,10)
write.table(fold_change_limma10,"fold_change_top10.txt", sep = ",")
fold_change_limma10
###### 8. FUNCTIONAL ENRICHMENT ANALYSIS ####################################################################
Mm.H <- readRDS("/shared_files/MSigDB/Mm.h.all.v7.1.entrez.rds")
#Check that you have the required objects
ls()
#Show the full contents of the annotation package
ls("package:mouse4302.db")
#Show the annotation keys in this database
keytypes(mouse4302.db)
sampleNames(normalised_data)
## Process annotation for functional enrichment
#Here we select from the annotation a number of keys with the primary key being PROBEID
res <- select(mouse4302.db, keys = rownames(normalised_data), columns = c("ENTREZID", "ENSEMBL","SYMBOL"), keytype="PROBEID")
#View the top of the table
head(res)
#find the index of each row of the expression set in the #annotation object res
idx <- match(rownames(normalised_data), res$PROBEID)
#Use the index to set the phenotypic data in the ExpressionSet
fData(normalised_data) <- res[idx, ]
head(fData(normalised_data), 10)
#Find all rows that dont have an EntrezID and remove then
normalised_data_corrected <- normalised_data[is.na(fData(normalised_data)$ENTREZID)==0,]
## Functional Enrichment Analysis
#convert to indexes
H.indices <- ids2indices(Mm.H,fData(normalised_data_corrected)$ENTREZID)
#if you want to run mroast
enrichment_roast <-mroast(normalised_data_corrected,index=H.indices,design=design,contrast=contrastmatrix[,1],adjust.method = "BH")
#if you want to run camera
enrichment_camera <-camera(normalised_data_corrected,index=H.indices,design=design,contrast=contrastmatrix[,1],adjust.method = "BH")
#if you want to run romer
enrichment_romer <-romer(normalised_data_corrected,index=H.indices,design=design,contrast=contrastmatrix[,1],adjust.method = "BH")
#View the results
write.table(enrichment_camera,"enrichment_camera.txt",sep="\t")
#You can then examine the results in enrichment.txt. It is a text file. It can be downloaded to view in a spreadsheet such as Excel.
#results of functional enrichment analysis to be used for shiny app
save(enrichment_camera,file="enrichment_camera.Rdata")
save(enrichment_romer,file="enrichment_romer.Rdata")
save(enrichment_roast,file="enrichment_roast.Rdata")
## Session Information
sessionInfo()
###### 9. SHINY APP ####################################################################
library(shiny)
runApp("~/FGT/ICA/ICA_app/app.R")
###### 9. VOLCANO PLOT ####################################################################
#Code adapted from https://biocorecrg.github.io/CRG_RIntroduction/volcano-plots.html
# The significantly differentially expressed genes are the ones found in the upper-left and upper-right corners.
# Add a column to the data frame to specify if they are UP- or DOWN- regulated (log2FoldChange respectively positive or negative)
# add a column of NAs
limma_genes$diffexpressed <- "NO"
# if log2Foldchange > 0.6 and pvalue < 0.05, set as "UP"
limma_genes$diffexpressed[limma_genes$logFC > 0.6 & limma_genes$P.Value < 0.05] <- "UP"
# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"
limma_genes$diffexpressed[limma_genes$logFC < -0.6 & limma_genes$P.Value < 0.05] <- "DOWN"
# Re-plot but this time color the points with "diffexpressed"
p <- ggplot(data=limma_genes, aes(x=logFC, y=-log10(P.Value), col=diffexpressed)) + geom_point() + theme_minimal()
# Add vertical lines for log2FoldChange thresholds, and one horizontal line for the p-value threshold
p2 <- p + geom_vline(xintercept=c(-0.6, 0.6), col="red") +
geom_hline(yintercept=-log10(0.05), col="red")
## Change point color
# 1. by default, it is assigned to the categories in an alphabetical order):
p3 <- p2 + scale_color_manual(values=c("blue", "black", "red"))
# 2. to automate a bit: ceate a named vector: the values are the colors to be used, the names are the categories they will be assigned to:
mycolors <- c("blue", "red", "black")
names(mycolors) <- c("DOWN", "UP", "NO")
p3 <- p2 + scale_colour_manual(values = mycolors)
# Now write down the name of genes beside the points...
# Create a new column "delabel" to de, that will contain the name of genes differentially expressed (NA in case they are not)
limma_genes$delabel <- NA
limma_genes$delabel[limma_genes$diffexpressed != "NO"] <- limma_genes$NGenes[limma_genes$diffexpressed != "NO"]
ggplot(data=limma_genes, aes(x=logFC, y=-log10(P.Value), col=diffexpressed, label=delabel)) +
geom_point() +
theme_minimal() +
geom_text()
# Finally, we can organize the labels nicely using the "ggrepel" package and the geom_text_repel() function
# plot adding up all layers we have seen so far
ggplot(data=limma_genes, aes(x=logFC, y=-log10(P.Value), col=diffexpressed, label=delabel)) +
geom_point() +
theme_minimal() +
geom_text_repel() +
scale_color_manual(values=c("blue", "black", "red")) +
geom_vline(xintercept=c(-0.6, 0.6), col="red") +
geom_hline(yintercept=-log10(0.05), col="red")