-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path14.ehh_pbs.Rmd
199 lines (146 loc) · 8.65 KB
/
14.ehh_pbs.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
---
title: "Population Branch Statistic (PBS) in EHH sweep regions"
output:
github_document
bibliography: bibliography.bib
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE, fig.retina = 2)
library(tidyverse)
library(ggpubr)
library(bedr)
```
Regions identified as sweeps via EHH statistics also tended to coincide with extreme values of the population branch statistic (PBS)
```{r, results='hide'}
#map_ehh_regions_to_scaffolds
# Read the translated equivalent of ehh_bed (translated with command below)
#python ../../../scripts/translate_coords.py ehh_sweeps.tsv ragtag_output/ragtag.scaffolds.agp > ehh_sweeps_scaff.tsv
ehh_sweeps <- read_rds("data/r_essentials/candidate_regions_genes_ehh.rds") %>%
mutate(chr=str_trim(chr))
sweep_lengths <- ehh_sweeps %>%
mutate(length = end-start)
ehh_sweeps_scaff <- read_tsv("data/hpc/ragtag/ehh_sweeps_scaff.tsv", col_names = colnames(ehh_sweeps)) %>%
mutate(end = start + sweep_lengths$length)
offsets <- read_tsv("data/hpc/ragtag/all.lengths.scaf.txt", col_names = c("chr","length"),show_col_types = FALSE) %>%
arrange(desc(length)) %>%
dplyr::mutate(offset=lag(cumsum(length),default=0)) %>%
dplyr::mutate(scaffold_num=row_number())
ehh_sweep_manhattan_data <- ehh_sweeps_scaff %>%
left_join(offsets) %>%
dplyr::mutate(abs_start = start + offset, abs_end = end + offset)
```
```{r, results='hide'}
pop_names <- c("inshore"="Inshore","northoffshore"="North Offshore","southoffshore"="South Offshore")
pbs_manhattan_data <- read_rds("data/r_essentials/pbsdata.rds") %>%
extract(population,into="population",regex = "PBS_(.*)")
pbs_thresholds <- read_rds("data/r_essentials/pbs_thresholds.rds")
pbs_ehh_overlapping <- read_tsv("data/hpc/selection2/pbs/pbs_sweeps.scaff.tsv",col_names = c("chr","pos","pos2","PBS_IN","PBS_NO","PBS_SO")) %>%
dplyr::select(-pos2) %>%
pivot_longer(cols = starts_with("PBS"), values_to = "PBS",names_to = "population") %>%
left_join(offsets,by=c("chr"="chr")) %>%
dplyr::mutate(abs_pos= pos + offset) %>%
extract(population,into="population",regex = "PBS_(.*)") %>%
left_join(pbs_thresholds,by=c("population"="loc")) %>%
filter(PBS>=pbs_thresh)
# pbs_ehh_overlapping %>%
# filter(PBS>4) %>%
# left_join(pbs_manhattan_data,by=c("chr","pos")) %>% head()
highlight_sweeps <- ehh_sweep_manhattan_data %>%
mutate(population = case_when(
pop=="inshore" ~ "IN",
pop=="northoffshore" ~ "NO",
pop=="southoffshore" ~ "SO"
))
highlight_regions <- highlight_sweeps %>%
filter(grepl("s0150.g24",x = genes)) %>% # | grepl("s0005.g341",genes)) %>%
filter(population=="IN")
fix_pops <- function(data){
data %>%
mutate(population = case_when(
population=="IN" ~ "Inshore",
population=="NO" ~ "North Offshore",
population=="SO" ~ "South Offshore"
))
}
mh_plot <- pbs_manhattan_data %>%
filter(PBS>0) %>%
fix_pops() %>%
ggplot() +
geom_point(aes(x=abs_pos/1e6,y=PBS,color=as.character(scaffold_num %% 2)),size=0.02) +
geom_point(data = pbs_ehh_overlapping %>% fix_pops(),aes(x=abs_pos/1e6,y=PBS),color="blue",size=0.1) +
geom_linerange(data=highlight_sweeps %>% fix_pops(), aes(xmin=(abs_start/1e6)-0.1,xmax=(abs_end/1e6)+0.1,y=-0.25),size=1.5,color="red",alpha=1) +
geom_linerange(data=highlight_regions %>% fix_pops(), aes(xmin=(abs_start/1e6)-0.2,xmax=(abs_end/1e6)+0.2,y=-0.25),size=1000,color="black",alpha=1) +
# geom_point(data = to_label,aes(x=abs_pos/1e6,y=PBS),color="purple",size=0.5) +
# geom_label_repel(data = to_label,aes(x=abs_pos/1e6,y=PBS, label=dated_locus_number),size=2,nudge_y=1,nudge_x = -10) +
ylab("Population Branch Statistic") + xlab("Genome Position / Mb") + ylim(-0.3,NA) +
scale_color_grey() +
scale_y_continuous(breaks=c(0,2,4)) +
theme_bw() +
theme(panel.grid = element_blank(),
legend.position = "none",
legend.text = element_text(),
axis.text.y = element_text(size=9),
axis.text.x = element_text(size=9),
axis.title.x = element_text(size=11),
axis.title.y = element_text(size=11),
strip.background = element_blank(),
strip.text = element_text(size=9)) + facet_wrap(~population,ncol = 1)
#ggsave(mh_plot,filename = "figures/pbs_ehh_manhattan.pdf",width = 16.9,height = 7.5, units = "cm")
ggsave(mh_plot,filename = "figures/pbs_ehh_manhattan.png",width = 16.9,height = 7.5, units = "cm",dpi = 300)
```
```{r}
knitr::include_graphics("figures/pbs_ehh_manhattan.png")
```
**Figure 1:** Manhattan plots showing the coincidence of extreme values of the population branch statistic (PBS) and regions under selection identified by EHH based scans. PBS estimates for each population are shown as points with the other two considered as outgroups. Points are shown in black and grey to indicate transitions between alternating pseudo-chromosomes via mapping to the A. millepora assembly from Fuller et al (Fuller et al. 2020). The purple shaded baseline shows the location of regions identified as candidates for positive selection using EHH-based scans. Blue points indicate windows where outlying PBS values are coincident with EHH scans.
# Genes associated with the intersection of significant PBS and EHH regions
We used a [python script](./scripts/pbs2bed.py) to identify regions where the population branch statistic for each population exceeded the significance threshold for a false discovery rate of 1%.
```bash
cat data/hpc/selection2/pbs/plink2_noheader.pbs | awk 'BEGIN{OFS="\t"}{print $1,$2,$3}' | ./scripts/pbs2bed.py -t 0.76 > data/hpc/selection2/pbs/pbs_in.bed
cat data/hpc/selection2/pbs/plink2_noheader.pbs | awk 'BEGIN{OFS="\t"}{print $1,$2,$4}' | ./scripts/pbs2bed.py -t 0.49 > data/hpc/selection2/pbs/pbs_no.bed
cat data/hpc/selection2/pbs/plink2_noheader.pbs | awk 'BEGIN{OFS="\t"}{print $1,$2,$5}' | ./scripts/pbs2bed.py -t 0.44 > data/hpc/selection2/pbs/pbs_so.bed
# 10% FDR
cat data/hpc/selection2/pbs/plink2_noheader.pbs | awk 'BEGIN{OFS="\t"}{print $1,$2,$3}' | ./scripts/pbs2bed.py -t 0.6 > data/hpc/selection2/pbs/pbs_in.bed
cat data/hpc/selection2/pbs/plink2_noheader.pbs | awk 'BEGIN{OFS="\t"}{print $1,$2,$4}' | ./scripts/pbs2bed.py -t 0.47 > data/hpc/selection2/pbs/pbs_no.bed
cat data/hpc/selection2/pbs/plink2_noheader.pbs | awk 'BEGIN{OFS="\t"}{print $1,$2,$5}' | ./scripts/pbs2bed.py -t 0.41 > data/hpc/selection2/pbs/pbs_so.bed
```
Bedtools (v2.30.0) was then used to find genes that intersected with these significant intervals and these in turn were then intersected with sweep regions identified via EHH statistics.
```bash
cd data/hpc/selection2/
bedtools intersect -wo -b pbs_in.bed -a ../../../genome/adig-v2-ncbi.gff | awk '$14>0 && $3=="gene"' > pbs_genes_in.gff
bedtools intersect -wao -a pbs_genes_in.gff -b ../../tracks/sweeps.gff3 | grep 'inshore' > pbs_genes_in_sweeps.tsv
bedtools intersect -wo -b pbs_no.bed -a ../../../genome/adig-v2-ncbi.gff | awk '$14>0 && $3=="gene"' > pbs_genes_no.gff
bedtools intersect -wao -a pbs_genes_no.gff -b ../../tracks/sweeps.gff3 | grep 'northoffshore' > pbs_genes_no_sweeps.tsv
bedtools intersect -wo -b pbs_so.bed -a ../../../genome/adig-v2-ncbi.gff | awk '$14>0 && $3=="gene"' > pbs_genes_so.gff
bedtools intersect -wao -a pbs_genes_so.gff -b ../../tracks/sweeps.gff3 | grep 'southoffshore' > pbs_genes_so_sweeps.tsv
```
Bedtools was also used to simply find ehh-based sweeps overlapping with significant pbs regions
```bash
bedtools intersect -u -b pbs_in.bed -a ../../tracks/sweeps.gff3 | grep 'inshore' > sweeps_in_pbs.gff
bedtools intersect -u -b pbs_no.bed -a ../../tracks/sweeps.gff3 | grep 'northoffshore' > sweeps_no_pbs.gff
bedtools intersect -u -b pbs_so.bed -a ../../tracks/sweeps.gff3 | grep 'southoffshore' > sweeps_so_pbs.gff
```
```{r}
# If not present run 09.annotate
uniprot_gene_annot <- read_tsv("data/hpc/annotation/uniprot_gene_annot.tsv", show_col_types = FALSE) %>%
mutate(go=go_ipr) %>%
dplyr::select(-go_ipr)
```
```{r}
read_sg <- function(path){
read_tsv(path,col_names = NULL,show_col_types = FALSE) %>%
unite("region_id",X14,X17,X18,sep="_") %>%
tidyr::extract(X9,into="gene_id",regex="ID=([^B]+)") %>%
dplyr::select(region_id,gene_id, max_pbs=X12,sum_pbs=X13,max_ehh=X19)
}
sweep_genes <- rbind(
read_sg("data/hpc/selection2/pbs/pbs_genes_in_sweeps.tsv") %>% add_column(pop="inshore"),
read_sg("data/hpc/selection2/pbs/pbs_genes_no_sweeps.tsv") %>% add_column(pop="northoffshore"),
read_sg("data/hpc/selection2/pbs/pbs_genes_so_sweeps.tsv") %>% add_column(pop="southoffshore")
) %>%
group_by(region_id,gene_id,pop) %>%
summarise(max_pbs = max(max_pbs),sum_pbs=sum(sum_pbs),max_ehh=max(max_ehh))
```
```{r}
candidate_genes_table <- sweep_genes %>%
left_join(uniprot_gene_annot,by=c("gene_id"="geneid"))
```