-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy path4_Clustering.R
executable file
·222 lines (166 loc) · 7.92 KB
/
4_Clustering.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
# Utilize methods for evaluating the selection of PCs to use for clustering
# Perform clustering of cells based on significant PCs
# setwd("~/KP/singleCellProjects/multiomeProject")
library(Seurat)
library(ggplot2)
library(harmony) # 1.0
library(Rcpp)
library(openxlsx)
library(here)
library(dplyr)
library(stringr)
library(tibble)
library(markdown)
library(XML)
library(RCurl)
library(AnnotationHub) # BiocManager::install("AnnotationHub")
library(gridExtra)
library(cowplot)
library(purrr)
set.seed(1234)
# Now that we have our high quality cells integrated, we want to know the different cell types present within our population of cells.
# Determining how many PCs to include in the clustering step is
# therefore important to ensure that we are capturing the majority of the variation, or cell types, present in our dataset.
seurat_integrated <- readRDS('results/integrated_seurat.rds')
# Elbow Plot --------------------------------------------------
ElbowPlot(object = seurat_integrated,
ndims = 43)
# quantitative approach to choose PCs
# We can calculate where the principal components start to elbow by taking the larger value of:
#
# 1. The point where the principal components only contribute 5% of standard deviation and the principal components cumulatively contribute 90% of the standard deviation.
# 2. The point where the percent change in variation between the consecutive PCs is less than 0.1%.
# Determine percent of variation associated with each PC
pct <- seurat_integrated[["pca"]]@stdev / sum(seurat_integrated[["pca"]]@stdev) * 100
# Calculate cumulative percents for each PC
cumu <- cumsum(pct)
# Determine which PC exhibits cumulative percent greater than 90% and % variation associated with the PC as less than 5
co1 <- which(cumu > 90 & pct < 5)[1]
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%.
co2
# Minimum of the two calculation
pcs <- min(co1, co2)
pcs
# Create a dataframe with values
plot_df <- data.frame(pct = pct,
cumu = cumu,
rank = 1:length(pct))
# Elbow plot to visualize
ggplot(plot_df, aes(cumu, pct, label = rank, color = rank > pcs)) +
geom_text() +
geom_vline(xintercept = 90, color = "grey") +
geom_hline(yintercept = min(pct[pct > 5]), color = "grey") +
theme_bw()
# Cluster the cells ---------------------------------------------
# The resolution is an important argument that sets the "granularity" of the downstream clustering and will need to be optimized for every individual experiment.
# For datasets of 3,000 - 5,000 cells, the resolution set between 0.4-1.4 generally yields good clustering.
# Increased resolution values lead to a greater number of clusters, which is often required for larger datasets.
# Determine the K-nearest neighbor graph
# Using first 40 PCs to cluster
seurat_integrated <- FindNeighbors(object = seurat_integrated,
dims = 1:40)
# Determine the clusters for various resolutions
seurat_integrated <- FindClusters(object = seurat_integrated,
resolution = c(0.4, 0.6, 0.8, 1.0, 1.4))
# ....Explore resolutions ---------------
View()
# ....Assign identity of clusters ----------------------
# We will start with a resolution of 0.8 by assigning the identity of the clusters using the Idents() function.
Idents(object = seurat_integrated) <- "integrated_snn_res.0.8"
# Plot the UMAP
umap_0.8 <- DimPlot(seurat_integrated,
reduction = "umap",
label = TRUE,
label.size = 6) + ggtitle('Resolution = 0.8')
grid.arrange(umap_0.4, umap_0.6, umap_0.8, umap_1, umap_1.4)
# Going ahead with 0.8 resolution
ggsave(umap_0.8, filename = 'figures/UMAPplot_clustering_integrated_object.pdf', width = 10, height = 10)
# Clustering QC ---------------------------------------------
# Goals:
# Evaluate whether clustering artifacts are present
# Determine the quality of clustering with PCA and UMAP plots and understand when to re-cluster
# Assess known cell type markers to hypothesize cell type identities of clusters
# ....Segregation of clusters by sample -----------------
# Extract identity and sample information from seurat object to determine the number of cells per cluster per sample
# n_cells <- FetchData(seurat_integrated,
# vars = c("ident", "orig.ident")) %>%
# dplyr::count(ident, orig.ident) %>%
# tidyr::spread(ident, n)
#
# # View table
# View(n_cells)
# UMAP of cells in each cluster by sample
qc1 <- DimPlot(seurat_integrated,
label = TRUE,
split.by = "sample") + NoLegend()
ggsave(qc1, filename = 'figures/UMAP_QC1_postClustering_cluster_by_sample.pdf', width = 10, height = 10)
# Generally, we expect to see the majority of the cell type clusters to be present in all conditions;
# however, depending on the experiment we might expect to see some condition-specific cell types present.
# These clusters look pretty similar between conditions, which is good since we expected similar cell types to be present in both PDX and Parental tumors
# ....Segregation of clusters by cell cycle phase -----------------
# Explore whether clusters segregate by cell cycle phase
qc2 <- DimPlot(seurat_integrated,
label = TRUE,
split.by = "Phase") + NoLegend()
# !!!
ggsave(qc2, filename = 'figures/UMAP_QC2_postClustering_cluster_by_cellCyclePhase.pdf', width = 10, height = 10)
# ....Segregation of clusters by various sources of uninteresting variation -----------------
# Determine metrics to plot present in [email protected]
metrics <- c("nUMI", "nGene", "S.Score", "G2M.Score", "mitoRatio")
qc3 <- FeaturePlot(seurat_integrated,
reduction = "umap",
features = metrics,
pt.size = 0.4,
order = TRUE,
min.cutoff = 'q10',
label = TRUE)
ggsave(qc3, filename = 'figures/FeaturePlot_QC3_postClustering.pdf', width = 10, height = 10)
# ....Exploration of the PCs driving the different clusters -----------------
# Defining the information in the seurat object of interest
columns <- c(paste0("PC_", 1:16),
"ident",
"UMAP_1", "UMAP_2")
# Extracting this data from the seurat object
pc_data <- FetchData(seurat_integrated,
vars = columns)
# Adding cluster label to center of cluster on UMAP
umap_label <- FetchData(seurat_integrated,
vars = c("ident", "UMAP_1", "UMAP_2")) %>%
group_by(ident) %>%
summarise(x=mean(UMAP_1), y=mean(UMAP_2))
# Plotting a UMAP plot for each of the PCs
map(paste0("PC_", 1:16), function(pc){
ggplot(pc_data,
aes(UMAP_1, UMAP_2)) +
geom_point(aes_string(color=pc),
alpha = 0.7) +
scale_color_gradient(guide = FALSE,
low = "grey90",
high = "blue") +
geom_text(data=umap_label,
aes(label=ident, x, y)) +
ggtitle(pc)
}) %>%
plot_grid(plotlist = .)
# Examine PCA results
print(seurat_integrated[["pca"]], dims = 1:5, nfeatures = 5)
# ....Exploration of known cell type markers -----------------
aa1 <- DimPlot(object = seurat_integrated,
reduction = "umap",
label = TRUE)
# The SCTransform normalization was performed only on the 3000 most variable genes, so many of our genes of interest may not be present in this data.
# Hence need to normalize data of genes not present in 3000 most variable genes
# Select the RNA counts slot to be the default assay
DefaultAssay(seurat_integrated) <- "RNA"
# Normalize RNA data for visualization purposes
seurat_integrated <- NormalizeData(seurat_integrated, verbose = FALSE)
FeaturePlot(seurat_integrated,
reduction = "umap",
features =c('HBB', 'HBA2'),
order = TRUE,
min.cutoff = 'q10',
label = TRUE)
# unable to detect most commonly known markers