-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
6eeb71d
commit d91a02a
Showing
7 changed files
with
2,664 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Created on Thu Nov 19 15:54:15 2020 | ||
@author: steven | ||
""" | ||
|
||
from Bio import SeqIO | ||
|
||
import csv | ||
|
||
#loading in the file with three columns 1 bacterial bin contig name 2 base site 3 coverage output from mpileup | ||
path_mpileup="Data_For_Scripts/three_columns_mpileup_DAScheckbins" | ||
tsv_file = open(path_mpileup) | ||
|
||
read_tsv = csv.reader(tsv_file, delimiter="\t") | ||
|
||
#turns tsv into a list of lists, each list is a row | ||
mpileup = [] | ||
for row in read_tsv: | ||
|
||
mpileup.append(row) | ||
|
||
contig_list = [] | ||
|
||
|
||
#makes a list of all contigs in tsv | ||
for n in mpileup: | ||
if n[0] not in contig_list: | ||
contig_list.append(n[0]) | ||
|
||
print('made contig list') | ||
|
||
#iterate through all the contigs and see if there is a prophage region and add them to the dictionary | ||
|
||
prophage_regions = [] | ||
|
||
#contig by contig | ||
for contig in contig_list: | ||
|
||
|
||
#this takes all lines that belong to this specifc contig and makes a contig specific pileup list | ||
mpileup_contig = [] | ||
|
||
#collect all base-pairs or rows for a contig | ||
for i in mpileup: | ||
|
||
if i[0] == contig: | ||
|
||
mpileup_contig.append(i) | ||
|
||
#For regions where there were no hits mpileup skips these lines, so I added a zero coverage value to fill in the 'gaps' | ||
for k in range(1,len(mpileup_contig)): | ||
|
||
if (int(mpileup_contig[k-1][1]) + 1) != (int(mpileup_contig[k][1])): | ||
|
||
mpileup_contig.insert(int(k), [contig, (int(mpileup_contig[k-1][1]) + 1), 0]) | ||
|
||
#Now we calculate coverage for a sliding window I can define | ||
sliding_window = 1000 | ||
|
||
#Set a min coverage to alert me | ||
min_coverage = 10 | ||
contig_coverage = 0 | ||
|
||
#Go over contig measure number of reads that map to each base | ||
for d in range(0,(len(mpileup_contig) - sliding_window)): | ||
contig_reads = 0 | ||
for q in range(d,(d + sliding_window)): | ||
contig_reads += int(mpileup_contig[q][2]) | ||
|
||
#calculate coverage | ||
contig_coverage = contig_reads/sliding_window | ||
|
||
if contig_coverage >= min_coverage: | ||
# print(contig) | ||
# print(str(d) + ":" + str(d + sliding_window)) | ||
# print(contig_coverage) | ||
# print(d) | ||
prophage_regions.append([contig, d, (d+sliding_window)]) | ||
|
||
for b in range(0,len(prophage_regions)): | ||
outF = open("prophage_regions_step1.txt", "a") | ||
outF.writelines(str(prophage_regions[b][0]) + ' ' + str(prophage_regions[b][1]) + ' ' + str(prophage_regions[b][2]) + '\n') | ||
outF.close() | ||
|
||
|
||
print('found prophage regions') | ||
|
||
for n in range(0, len(prophage_regions)): | ||
|
||
|
||
while (len(prophage_regions) > (n + 1)) and (prophage_regions[n][0] == prophage_regions[n+1][0]) and (int(prophage_regions[n][2]) > int(prophage_regions[n+1][1])): | ||
prophage_regions[n][2] = prophage_regions[n+1][2] | ||
prophage_regions.pop(n+1) | ||
|
||
|
||
for op in range(0,len(prophage_regions)): | ||
outF = open("prophage_regions_step1.txt", "a") | ||
outF.writelines(str(prophage_regions[op][0]) + ' ' + str(prophage_regions[op][1]) + ' ' + str(prophage_regions[op][2]) + '\n') | ||
outF.close() | ||
|
||
print('Regions') | ||
|
||
#Here I load in the concatenated fasta file of all the bacterial bins involved in the study | ||
bacterial_bin = {} | ||
for seq_record1 in SeqIO.parse("Data_For_Scripts/Final_Collection_DAS_check_MAGs.fa"", "fasta"): | ||
bacterial_bin[seq_record1.id] = seq_record1.seq | ||
|
||
for b1 in range(0, len(prophage_regions)): | ||
if prophage_regions[b1][0] in bacterial_bin: | ||
|
||
outF = open("bowtie_prophages_DASins.fa", "a") | ||
outF.writelines(">" + prophage_regions[b1][0] + '\n') | ||
outF.writelines(bacterial_bin[prophage_regions[b1][0]][(int(prophage_regions[b1][1])):(int(prophage_regions[b1][2]))] + '\n') | ||
outF.close() | ||
|
||
print('End') | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
#Open packages | ||
library(dplyr) | ||
library(readr) | ||
library(ggplot2) | ||
library(tidyr) | ||
library(stringr) | ||
library("ggsci") | ||
|
||
|
||
|
||
#Relative abundances from CheckM | ||
SRR828645_profile <- read.delim("SRR828645_profile", header = TRUE, col.names = c('BinId', 'BinSize', 'MappedReads', 'Perc_Mapped_Reads', 'Percen_binned_population', 'Perc_community')) | ||
SRR828660_profile <- read.delim("SRR828660_profile", header = TRUE, col.names = c('BinId', 'BinSize', 'MappedReads', 'Perc_Mapped_Reads', 'Percen_binned_population', 'Perc_community')) | ||
SRR828661_profile <- read.delim("SRR828661_profile", header = TRUE, col.names = c('BinId', 'BinSize', 'MappedReads', 'Perc_Mapped_Reads', 'Percen_binned_population', 'Perc_community')) | ||
|
||
#Elsewhere I turned the GTDB_tk into a phylogentically sorted list | ||
phylo_GTDB_tk <- read.csv("phylogeny_of_bins.csv", header = TRUE) | ||
phylo_GTDB_tk$GTDB_taxa <- as.factor(phylo_GTDB_tk$GTDB_taxa) | ||
|
||
#Create a normalized for genome size percent of mapped reads | ||
|
||
SRR828645_profile$norm_reads <- (SRR828645_profile$MappedReads/(SRR828645_profile$BinSize*100000000)) | ||
SRR828645_norm_reads <- sum(SRR828645_profile$norm_reads) | ||
SRR828645_profile$norm_perc_reads <- (SRR828645_profile$norm_reads/SRR828645_norm_reads)*100 | ||
SRR828645_mini <- cbind(SRR828645_profile$BinId, SRR828645_profile$norm_reads, (replicate(25, "week1"))) | ||
colnames(SRR828645_mini) <- c('BinID', 'NormPercReads', 'Week') | ||
SRR828645_mini <- as.data.frame(SRR828645_mini) | ||
|
||
SRR828661_profile$norm_reads <- (SRR828661_profile$MappedReads/(SRR828661_profile$BinSize*100000000)) | ||
SRR828661_norm_reads <- sum(SRR828661_profile$norm_reads) | ||
SRR828661_profile$norm_perc_reads <- (SRR828661_profile$norm_reads/SRR828661_norm_reads)*100 | ||
SRR828661_mini <- cbind(SRR828661_profile$BinId, SRR828661_profile$norm_reads, (replicate(25, "week2"))) | ||
colnames(SRR828661_mini) <- c('BinID', 'NormPercReads', 'Week') | ||
SRR828661_mini <- as.data.frame(SRR828661_mini) | ||
|
||
SRR828660_profile$norm_reads <- (SRR828660_profile$MappedReads/(SRR828660_profile$BinSize*100000000)) | ||
SRR828660_norm_reads <- sum(SRR828660_profile$norm_reads) | ||
SRR828660_profile$norm_perc_reads <- (SRR828660_profile$norm_reads/SRR828660_norm_reads)*100 | ||
SRR828660_mini <- cbind(SRR828660_profile$BinId, SRR828660_profile$norm_reads, (replicate(25, "week3"))) | ||
colnames(SRR828660_mini) <- c('BinID', 'NormPercReads', 'Week') | ||
SRR828660_mini <- as.data.frame(SRR828660_mini) | ||
|
||
all_weeks <- rbind(SRR828645_mini,SRR828660_mini,SRR828661_mini) | ||
all_weeks <- transform(all_weeks, NormPercReads = as.numeric(NormPercReads)) | ||
all_weeks$BinID <- phylo_GTDB_tk[(match(all_weeks$BinID,phylo_GTDB_tk$Bin)),9] | ||
all_weeks$BinID <- factor(all_weeks$BinID, levels = phylo_GTDB_tk$GTDB_taxa) | ||
|
||
stacked_norm_perc <- ggplot(all_weeks, aes(fill=BinID, y=NormPercReads, x=Week,)) + | ||
geom_bar(position="fill", stat="identity") + xlab(label = element_blank()) + ylab(label = "Relative Abundance") + scale_fill_igv() + theme_bw() +theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +theme(panel.border = element_blank()) | ||
|
||
stacked_norm_perc <- stacked_norm_perc + labs(fill = 'Bacteria') | ||
stacked_norm_perc | ||
#save as .svg | ||
ggsave(file="stacked_norm_perc.svg", plot=stacked_norm_perc, width=10, height=8) | ||
|
||
#Alejandro wanted me to try an alternative order, specifically in order of the most abundant bacteria. | ||
|
||
#I interpreted this as the most abundant bacteria over the three days, the noramlized by each day or percentage of reads. | ||
|
||
abundance_order <- rbind(SRR828645_mini,SRR828660_mini,SRR828661_mini) | ||
abundance_order$NormPercReads <- as.numeric(abundance_order$NormPercReads) | ||
binorder <- abundance_order %>% group_by(BinID) %>% summarise(total_abund = sum(NormPercReads)) %>% arrange(desc(total_abund)) %>% select(BinID) | ||
abundance_order <- transform(abundance_order, NormPercReads = as.numeric(NormPercReads)) | ||
abundance_order$BinID <- factor(abundance_order$BinID, levels = binorder$BinID) | ||
abundance_order$Bacteria <- phylo_GTDB_tk[(match(abundance_order$BinID,phylo_GTDB_tk$Bin)),9] | ||
abundance_order$Bacteria <- as.character(abundance_order$Bacteria) | ||
abundance_order$Bacteria <- factor(abundance_order$Bacteria, levels = phylo_GTDB_tk[(match(binorder$BinID,phylo_GTDB_tk$Bin)),9]) | ||
|
||
stacked_norm_perc2 <- ggplot(abundance_order, aes(fill=Bacteria, y=NormPercReads, x=Week)) + | ||
geom_bar(position="fill", stat="identity") + xlab(label = element_blank()) + ylab(label = "Relative Abundance") + scale_fill_igv() + theme_bw() +theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(panel.border = element_blank()) + | ||
+ theme(text = element_text(size = 10)) | ||
stacked_norm_perc2 + theme(text = element_text(size = 10)) + theme(axis.title = element_text(size = 20)) + theme(legend.text = element_text(size = 20)) | ||
stacked_norm_perc2 <- stacked_norm_perc2 + theme(text = element_text(size = 10)) + theme(axis.title = element_text(size = 20)) + theme(legend.text = element_text(size = 10)) + theme(axis.title.x = element_text(size = 15, angle=90, hjust=1, vjust=1)) | ||
stacked_norm_perc2 | ||
ggsave(file="stacked_norm_perc2.svg", plot=stacked_norm_perc2 )#, width=10, height=8) | ||
|
||
#I will try it based on abundance of week 1 | ||
|
||
abundance_order2 <- rbind(SRR828645_mini,SRR828660_mini,SRR828661_mini) | ||
abundance_order2$NormPercReads <- as.numeric(abundance_order2$NormPercReads) | ||
binorder2 <- abundance_order2 %>% group_by(BinID) %>% filter(Week == 'week1') %>% arrange(desc(NormPercReads)) | ||
abundance_order2 <- transform(abundance_order2, NormPercReads = as.numeric(NormPercReads)) | ||
abundance_order2$BinID <- factor(abundance_order2$BinID, levels = binorder2$BinID) | ||
abundance_order2$Bacteria <- phylo_GTDB_tk[(match(abundance_order2$BinID,phylo_GTDB_tk$Bin)),9] | ||
abundance_order2$Bacteria <- as.character(abundance_order2$Bacteria) | ||
abundance_order2$Bacteria <- factor(abundance_order2$Bacteria, levels = phylo_GTDB_tk[(match(binorder2$BinID,phylo_GTDB_tk$Bin)),9]) | ||
stacked_norm_perc3 <- ggplot(abundance_order2, aes(fill=Bacteria, y=NormPercReads, x=Week,)) + | ||
geom_bar(position="fill", stat="identity") + xlab(label = element_blank()) + ylab(label = "Relative Abundance") + scale_fill_igv() + theme_bw() +theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +theme(panel.border = element_blank()) | ||
|
||
stacked_norm_perc3 | ||
|
||
#The order for organizing bacteria for figures is | ||
bacteria_order <- as.data.frame(binorder$BinID) | ||
bacteria_order$`binorder$BinID` <- phylo_GTDB_tk[(match(bacteria_order$`binorder$BinID`,phylo_GTDB_tk$Bin)),9] | ||
write_csv(bacteria_order, file = 'bacteria_order.csv') | ||
|
||
#Alternative colour schemes I could use | ||
library(pals) | ||
stacked_norm_perc + scale_fill_manual(values = rev(cols25(n=25))) | ||
stacked_norm_perc + scale_fill_manual(values = cols25(n=26)) | ||
stacked_norm_perc + scale_fill_manual(values = (glasbey(n=25))) | ||
stacked_norm_perc + scale_fill_manual(values = rev(glasbey(n=25))) | ||
stacked_norm_perc + scale_fill_manual(values = rev(unname(polychrome(n=25)))) | ||
stacked_norm_perc + scale_fill_manual(values = rev(unname(alphabet(n=25)))) | ||
stacked_norm_perc + scale_fill_manual(values = (unname(alphabet(n=25)))) | ||
|
Oops, something went wrong.