From d91a02a74495a56c321c05628aee3cfdb26baa66 Mon Sep 17 00:00:00 2001
From: sgsutcliffe <47899883+sgsutcliffe@users.noreply.github.com>
Date: Thu, 22 Dec 2022 15:17:57 -0500
Subject: [PATCH] Scripts used in publication

---
 BowtieProphageFinding.py     |  120 ++++
 MAGS_relative_abundance.R    |  106 ++++
 Modified-PropagAtE_script.py |  968 +++++++++++++++++++++++++++++++
 VLP_analysis.R               | 1042 ++++++++++++++++++++++++++++++++++
 checkV_coverage_script.R     |   80 +++
 prophage_coverage_script.R   |  277 +++++++++
 prophage_merger_v2.py        |   71 +++
 7 files changed, 2664 insertions(+)
 create mode 100644 BowtieProphageFinding.py
 create mode 100644 MAGS_relative_abundance.R
 create mode 100644 Modified-PropagAtE_script.py
 create mode 100644 VLP_analysis.R
 create mode 100644 checkV_coverage_script.R
 create mode 100644 prophage_coverage_script.R
 create mode 100644 prophage_merger_v2.py

diff --git a/BowtieProphageFinding.py b/BowtieProphageFinding.py
new file mode 100644
index 0000000..88ee6b2
--- /dev/null
+++ b/BowtieProphageFinding.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Nov 19 15:54:15 2020
+
+@author: steven
+"""
+
+from Bio import SeqIO
+
+import csv
+
+#Load the mpileup output with three columns: (1) bacterial bin contig name, (2) base position, (3) coverage
+path_mpileup="Data_For_Scripts/three_columns_mpileup_DAScheckbins"
+tsv_file = open(path_mpileup)
+
+read_tsv = csv.reader(tsv_file, delimiter="\t")
+
+#turn the tsv into a list of lists; each inner list is a row
+mpileup = []
+for row in read_tsv:
+
+    mpileup.append(row)
+
+tsv_file.close()
+
+contig_list = []
+
+
+#make a list of all contigs in the tsv
+for n in mpileup:
+    if n[0] not in contig_list:
+        contig_list.append(n[0])
+
+print('made contig list')
+
+#iterate through all the contigs, check for prophage regions, and add them to the list
+
+prophage_regions = []
+
+#contig by contig
+for contig in contig_list:
+
+
+    #take all rows that belong to this specific contig and make a contig-specific pileup list
+    mpileup_contig = []
+
+    #collect all base-pairs or rows for a contig
+    for i in mpileup:
+
+        if i[0] == contig:
+
+            mpileup_contig.append(i)
+
+    #For regions where there were no hits mpileup skips these lines, so add a zero coverage value to fill in the 'gaps'
+    #(a while loop is used here because the list grows as rows are inserted)
+    k = 1
+    while k < len(mpileup_contig):
+
+        if (int(mpileup_contig[k-1][1]) + 1) != (int(mpileup_contig[k][1])):
+
+            mpileup_contig.insert(k, [contig, (int(mpileup_contig[k-1][1]) + 1), 0])
+        k += 1
+
+    #Now calculate coverage over a sliding window of defined size
+    sliding_window = 1000
+
+    #Set a minimum coverage threshold for flagging a window
+    min_coverage = 10
+    contig_coverage = 0
+
+    #Slide along the contig and sum the per-base coverage within each window
+    for d in range(0,(len(mpileup_contig) - sliding_window)):
+        contig_reads = 0
+        for q in range(d,(d + sliding_window)):
+            contig_reads += int(mpileup_contig[q][2])
+
+        #calculate coverage
+        contig_coverage = contig_reads/sliding_window
+
+        if contig_coverage >= min_coverage:
+            # print(contig)
+            # print(str(d) + ":" + str(d + sliding_window))
+            # print(contig_coverage)
+            # print(d)
+            prophage_regions.append([contig, d, (d+sliding_window)])
+
+#write the raw candidate windows to file
+with open("prophage_regions_step1.txt", "a") as outF:
+    for b in range(0,len(prophage_regions)):
+        outF.write(str(prophage_regions[b][0]) + ' ' + str(prophage_regions[b][1]) + ' ' + str(prophage_regions[b][2]) + '\n')
+
+
+print('found prophage regions')
+#Merge overlapping windows on the same contig into single continuous regions
+for n in range(0, len(prophage_regions)):
+
+
+    while (len(prophage_regions) > (n + 1)) and (prophage_regions[n][0] == prophage_regions[n+1][0]) and (int(prophage_regions[n][2]) > int(prophage_regions[n+1][1])):
+        prophage_regions[n][2] = prophage_regions[n+1][2]
+        prophage_regions.pop(n+1)
+
+
+#append the merged regions to the same file, after the raw windows
+with open("prophage_regions_step1.txt", "a") as outF:
+    for op in range(0,len(prophage_regions)):
+        outF.write(str(prophage_regions[op][0]) + ' ' + str(prophage_regions[op][1]) + ' ' + str(prophage_regions[op][2]) + '\n')
+
+print('Regions')
+
+#Here I load in the concatenated fasta file of all the bacterial bins involved in the study
+bacterial_bin = {}
+for seq_record1 in SeqIO.parse("Data_For_Scripts/Final_Collection_DAS_check_MAGs.fa", "fasta"):
+    bacterial_bin[seq_record1.id] = seq_record1.seq
+
+#write the sequence of each merged region to a multi-FASTA file
+for b1 in range(0, len(prophage_regions)):
+    if prophage_regions[b1][0] in bacterial_bin:
+
+        outF = open("bowtie_prophages_DASins.fa", "a")
+        outF.write(">" + prophage_regions[b1][0] + '\n')
+        outF.write(str(bacterial_bin[prophage_regions[b1][0]][(int(prophage_regions[b1][1])):(int(prophage_regions[b1][2]))]) + '\n')
+        outF.close()
+
+print('End')
+
diff --git a/MAGS_relative_abundance.R b/MAGS_relative_abundance.R
new file mode 100644
index 0000000..86c4c86
--- /dev/null
+++ b/MAGS_relative_abundance.R
@@ -0,0 +1,106 @@
+#Open packages
+library(dplyr)
+library(readr)
+library(ggplot2)
+library(tidyr)
+library(stringr)
+library("ggsci")
+
+
+
+#Relative abundances from CheckM
+SRR828645_profile <- read.delim("SRR828645_profile", header = TRUE, col.names = c('BinId', 'BinSize', 'MappedReads', 'Perc_Mapped_Reads', 'Percen_binned_population', 'Perc_community'))
+SRR828660_profile <- read.delim("SRR828660_profile", header = TRUE, col.names = c('BinId', 'BinSize', 'MappedReads', 'Perc_Mapped_Reads', 'Percen_binned_population', 'Perc_community'))
+SRR828661_profile <- read.delim("SRR828661_profile", header = TRUE, col.names = c('BinId', 'BinSize', 'MappedReads', 'Perc_Mapped_Reads', 'Percen_binned_population', 'Perc_community'))
+
+#Elsewhere I turned the GTDB-Tk output into a phylogenetically sorted list
+phylo_GTDB_tk <- read.csv("phylogeny_of_bins.csv", header = TRUE)
+phylo_GTDB_tk$GTDB_taxa <- as.factor(phylo_GTDB_tk$GTDB_taxa)
+
+#Create a percentage of mapped reads normalized for genome size
+
+SRR828645_profile$norm_reads <- (SRR828645_profile$MappedReads/(SRR828645_profile$BinSize*100000000))
+SRR828645_norm_reads <- sum(SRR828645_profile$norm_reads)
+SRR828645_profile$norm_perc_reads <- (SRR828645_profile$norm_reads/SRR828645_norm_reads)*100
+SRR828645_mini <- cbind(SRR828645_profile$BinId, SRR828645_profile$norm_reads, (replicate(25, "week1")))
+colnames(SRR828645_mini) <- c('BinID', 'NormPercReads', 'Week')
+SRR828645_mini <- as.data.frame(SRR828645_mini)
+
+SRR828661_profile$norm_reads <- (SRR828661_profile$MappedReads/(SRR828661_profile$BinSize*100000000))
+SRR828661_norm_reads <- sum(SRR828661_profile$norm_reads)
+SRR828661_profile$norm_perc_reads <- (SRR828661_profile$norm_reads/SRR828661_norm_reads)*100
+SRR828661_mini <- cbind(SRR828661_profile$BinId, SRR828661_profile$norm_reads, (replicate(25, "week2")))
+colnames(SRR828661_mini) <- c('BinID', 'NormPercReads', 'Week')
+SRR828661_mini <- as.data.frame(SRR828661_mini)
+
+SRR828660_profile$norm_reads <- (SRR828660_profile$MappedReads/(SRR828660_profile$BinSize*100000000))
+SRR828660_norm_reads <- sum(SRR828660_profile$norm_reads)
+SRR828660_profile$norm_perc_reads <- (SRR828660_profile$norm_reads/SRR828660_norm_reads)*100
+SRR828660_mini <- cbind(SRR828660_profile$BinId, SRR828660_profile$norm_reads, (replicate(25, "week3")))
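+#Note on the three normalization blocks above: each scales mapped reads by
+#bin size times a fixed constant (1e8) and carries the per-bin value into a
+#small per-week table. A sketch of an equivalent helper (hypothetical name,
+#shown for clarity only and not called elsewhere in this script):
+normalize_profile <- function(profile, week_label) {
+  norm_reads <- profile$MappedReads / (profile$BinSize * 100000000)
+  data.frame(BinID = profile$BinId,
+             NormPercReads = norm_reads,
+             Week = week_label)
+}
+#e.g. normalize_profile(SRR828660_profile, "week3"); data.frame() recycles the
+#scalar week label, avoiding the hard-coded replicate(25, ...). As in the
+#blocks above, the 'NormPercReads' column holds norm_reads rather than
+#norm_perc_reads; the stacked plots below use geom_bar(position = "fill"),
+#which rescales within each week, so the plotted proportions are the same
+#either way.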
+colnames(SRR828660_mini) <- c('BinID', 'NormPercReads', 'Week')
+SRR828660_mini <- as.data.frame(SRR828660_mini)
+
+all_weeks <- rbind(SRR828645_mini,SRR828660_mini,SRR828661_mini)
+all_weeks <- transform(all_weeks, NormPercReads = as.numeric(NormPercReads))
+all_weeks$BinID <- phylo_GTDB_tk[(match(all_weeks$BinID,phylo_GTDB_tk$Bin)),9]
+all_weeks$BinID <- factor(all_weeks$BinID, levels = phylo_GTDB_tk$GTDB_taxa)
+
+stacked_norm_perc <- ggplot(all_weeks, aes(fill=BinID, y=NormPercReads, x=Week)) +
+  geom_bar(position="fill", stat="identity") + xlab(label = element_blank()) + ylab(label = "Relative Abundance") + scale_fill_igv() + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(panel.border = element_blank())
+
+stacked_norm_perc <- stacked_norm_perc + labs(fill = 'Bacteria')
+stacked_norm_perc
+#save as .svg
+ggsave(file="stacked_norm_perc.svg", plot=stacked_norm_perc, width=10, height=8)
+
+#Alejandro wanted me to try an alternative order, specifically in order of the most abundant bacteria.
+
+#I interpreted this as the most abundant bacteria over the three days, then normalized by each day as a percentage of reads.
+
+abundance_order <- rbind(SRR828645_mini,SRR828660_mini,SRR828661_mini)
+abundance_order$NormPercReads <- as.numeric(abundance_order$NormPercReads)
+binorder <- abundance_order %>% group_by(BinID) %>% summarise(total_abund = sum(NormPercReads)) %>% arrange(desc(total_abund)) %>% select(BinID)
+abundance_order <- transform(abundance_order, NormPercReads = as.numeric(NormPercReads))
+abundance_order$BinID <- factor(abundance_order$BinID, levels = binorder$BinID)
+abundance_order$Bacteria <- phylo_GTDB_tk[(match(abundance_order$BinID,phylo_GTDB_tk$Bin)),9]
+abundance_order$Bacteria <- as.character(abundance_order$Bacteria)
+abundance_order$Bacteria <- factor(abundance_order$Bacteria, levels = phylo_GTDB_tk[(match(binorder$BinID,phylo_GTDB_tk$Bin)),9])
+
+stacked_norm_perc2 <- ggplot(abundance_order, aes(fill=Bacteria, y=NormPercReads, x=Week)) +
+  geom_bar(position="fill", stat="identity") + xlab(label = element_blank()) + ylab(label = "Relative Abundance") + scale_fill_igv() + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(panel.border = element_blank()) +
+  theme(text = element_text(size = 10))
+stacked_norm_perc2 + theme(text = element_text(size = 10)) + theme(axis.title = element_text(size = 20)) + theme(legend.text = element_text(size = 20))
+stacked_norm_perc2 <- stacked_norm_perc2 + theme(text = element_text(size = 10)) + theme(axis.title = element_text(size = 20)) + theme(legend.text = element_text(size = 10)) + theme(axis.title.x = element_text(size = 15, angle=90, hjust=1, vjust=1))
+stacked_norm_perc2
+ggsave(file="stacked_norm_perc2.svg", plot=stacked_norm_perc2) #, width=10, height=8
+
+#I will try it based on abundance of week 1
+
+abundance_order2 <- rbind(SRR828645_mini,SRR828660_mini,SRR828661_mini)
+abundance_order2$NormPercReads <- as.numeric(abundance_order2$NormPercReads)
+binorder2 <- abundance_order2 %>% group_by(BinID) %>% filter(Week == 'week1') %>% arrange(desc(NormPercReads))
+abundance_order2 <- transform(abundance_order2, NormPercReads = as.numeric(NormPercReads))
+abundance_order2$BinID <- factor(abundance_order2$BinID, levels = binorder2$BinID)
+abundance_order2$Bacteria <- phylo_GTDB_tk[(match(abundance_order2$BinID,phylo_GTDB_tk$Bin)),9]
+abundance_order2$Bacteria <- as.character(abundance_order2$Bacteria)
+abundance_order2$Bacteria <- 
factor(abundance_order2$Bacteria, levels = phylo_GTDB_tk[(match(binorder2$BinID,phylo_GTDB_tk$Bin)),9]) +stacked_norm_perc3 <- ggplot(abundance_order2, aes(fill=Bacteria, y=NormPercReads, x=Week,)) + + geom_bar(position="fill", stat="identity") + xlab(label = element_blank()) + ylab(label = "Relative Abundance") + scale_fill_igv() + theme_bw() +theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +theme(panel.border = element_blank()) + +stacked_norm_perc3 + +#The order for organizing bacteria for figures is +bacteria_order <- as.data.frame(binorder$BinID) +bacteria_order$`binorder$BinID` <- phylo_GTDB_tk[(match(bacteria_order$`binorder$BinID`,phylo_GTDB_tk$Bin)),9] +write_csv(bacteria_order, file = 'bacteria_order.csv') + +#Alternative colour schemes I could use +library(pals) +stacked_norm_perc + scale_fill_manual(values = rev(cols25(n=25))) +stacked_norm_perc + scale_fill_manual(values = cols25(n=26)) +stacked_norm_perc + scale_fill_manual(values = (glasbey(n=25))) +stacked_norm_perc + scale_fill_manual(values = rev(glasbey(n=25))) +stacked_norm_perc + scale_fill_manual(values = rev(unname(polychrome(n=25)))) +stacked_norm_perc + scale_fill_manual(values = rev(unname(alphabet(n=25)))) +stacked_norm_perc + scale_fill_manual(values = (unname(alphabet(n=25)))) + diff --git a/Modified-PropagAtE_script.py b/Modified-PropagAtE_script.py new file mode 100644 index 0000000..072dcf8 --- /dev/null +++ b/Modified-PropagAtE_script.py @@ -0,0 +1,968 @@ +#! /usr/bin/env python3 +# Author: Kristopher Kieft, UW-Madison +# kieft@wisc.edu + + +# PropagAtE: Prophage Activity Estimator +# Version: v1.0.0 +# Release date: January 21 2021 +# dev: 22 + +# Modified by Steven Sutcliffe +# Allows prophages with no flanking regions to be included using the bacterial MAG mean coverage + +# Imports +try: + import warnings + warnings.filterwarnings("ignore") + import argparse + import subprocess + import sys + import random + import time + import datetime + from datetime import date + from scipy import stats + import statistics + from scipy.stats import mannwhitneyu + from scipy.stats import combine_pvalues + import math + import uuid + import logging + import os + from Bio.SeqIO.FastaIO import SimpleFastaParser +except Exception as e: + sys.stderr.write("\nError: please verify dependancy imports are installed and up to date:\n\n") + sys.stderr.write(str(e) + "\n\n") + exit(1) + +from sys import exit +# Set up variables +start = time.time() +start_time = str(datetime.datetime.now().time()).rsplit(".",1)[0] +u = str(uuid.uuid1()).split("-")[0] + +# propagate = argparse.ArgumentParser(description='Example 1: python3 PropagAtE_run.py -f scaffolds.fasta -r forward.fastq reverse.fastq -o output.tsv -v prophage_coordinates.tsv -t threads -i ID . . . 
Example 2: python3 PropagAtE_run.py -s alignment.sam -o output.tsv -v prophage_coordinates.tsv') +# propagate.add_argument('--version', action='version', version='PropagAtE v1.0.0') + +# # Input required +# propagate.add_argument('-s', type=str, nargs=1, default = [''], help='input SAM (.sam) sequence alignment file (skip -sb/-b/-r/-f).') +# propagate.add_argument('-b', type=str, nargs=1, default = [''], help='input BAM (.bam) sequence alignment file (skip -sb/-s/-r/-f).') +# propagate.add_argument('-sb', type=str, nargs=1, default = [''], help='input sorted BAM (.bam) sequence alignment file (skip -b/-s/-r/-f).') +# propagate.add_argument('-r', type=str, nargs=2, default = ['',''], help='input forward and reverse read files (.fastq) separated by a space (skip -sb/-b/-s, requires -f)') +# propagate.add_argument('-v', type=str, nargs=1, required = True, help='VIBRANT "integrated_prophage_coordinates" file or custom prophage coordinates file.') + +# # Optional +# propagate.add_argument('-o', type=str, nargs=1, default = ['PropagAtE_result.' + str(u) + '.tsv'], help='name of output tab-separated file for results.') +# propagate.add_argument('-f', type=str, nargs=1, default = [''], help='if input is -r provide genomes/scaffolds (.fna, .fastq, .fa, .fsa).') +# propagate.add_argument('-t', type=str, nargs=1, default = ['1'], help='threads used for Bowtie2 mapping (if input is -r and -f).') +# propagate.add_argument('-i', type=str, nargs=1, default = [''], help='unique ID to append to Bowtie2 SAM output for reference to read sets used (not requied, useful when analyzing multiple read sets).') +# propagate.add_argument('-p', type=str, nargs=1, default = ['0.05'], help='p-value cutoff for Mann-Whitney statistical test (default=0.05).') +# propagate.add_argument('-g', type=str, nargs=1, default = ['1'], choices=["0", "1", "2", "3"], help='number of gap extensions allowed in read alignment (per read) [default=1, options=0-3].') +# propagate.add_argument('-m', type=str, nargs=1, default = ['3'], choices=["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"], help='number of mismatches allowed in read alignment (per read) [default=3, options=0-10].') +# propagate.add_argument('-n', type=str, nargs=1, default = ['5'], help='number of coverage sub-samples to take for Mann-Whitney statistical analysis [default=5, minimum=5].') +# propagate.add_argument('-a', type=str, nargs=1, default = ['4'], choices=["0", "1", "2", "3", "4"], help='remove outlier coverage values "a" standard deviations from the average [default=4].') +# propagate.add_argument('-e', type=str, nargs=1, default = ['0.75'], help="minimum effect size for significance by Cohen's d test [default=0.75, minimum=0.6].") +# propagate.add_argument('-c', type=str, nargs=1, default = ['1.65'], help="minimum prophage:host coverage ratio for significance [default=1.65, minimum=1.5].") +# propagate.add_argument('-y', type=str, nargs=1, default = [''], help='term to distinguish host from prophage name in genome/scaffold definition lines [default="_fragment_"].') +# propagate.add_argument('-x', type=str, nargs=1, default = [''], help="path to Samtools executable if not in PATH.") +# propagate.add_argument('-clean', action='store_true', help='use this setting to remove (if applicable) any generated SAM, unsorted BAM and Bowtie2 index files. 
+#   Will retain any user input files, raw reads and sorted BAM files (default=off).')
+# propagate.add_argument('-all', action='store_true', help='use this setting to keep all aligned reads (ignore -g and -m, default=off).')
+# propagate.add_argument('-z', type=str, nargs=1, default = [''], help='character(s) used to replace spaces in genome/scaffold names. Use this setting if spaces were replaced in the alignment file (SAM/BAM) but not in the prophage coordinates file.')
+
+# Parse arguments (inputs are hard-coded below in this modified copy)
+args = False
+samfile = ''
+bamfile = ''
+sortbamfile = "SRR828661_das_check_bin_abundance.gap-1_mm-3.sorted.bam"
+fasta = ''
+forward = ''
+reverse = ''
+threads = ''
+vibe = "propagate_coordinates"
+id = ''
+pval = 0.05
+gaps = 1
+mismatches = 3
+subs = 5
+outliers = 4
+effect = 0.75
+ratio_cutoff = 1.65
+termsplit = ''
+###Making a split for the bin as well:
+binsplit = '_k119_'
+outfile = "test.tsv"
+outpath = "Test_Bowtie"  # concatenated directly as a filename prefix below
+clean = False
+args_all = False
+
+try:
+    temp_out = str(outfile).rsplit("/",1)[1]
+except Exception:
+    temp_out = str(outfile)
+outfile = outpath + str(temp_out)
+sampath = '/usr/local/bin/samtools'
+spaces = ''
+replace = False
+if termsplit == '':
+    termsplit = "_fragment_"
+if spaces == '':
+    spaces = '~~'
+else:
+    replace = True
+if sampath != '':
+    if "samtools" == sampath[-8:]:
+        sampath = sampath[:-8]
+    if sampath[-1] != '/':
+        sampath += '/'
+
+# log file
+logfilename = str(outfile).rsplit(".",1)[0] + ".log"
+subprocess.run("rm " + logfilename + " 2> /dev/null", shell=True)
+logging.basicConfig(filename=logfilename, level=logging.INFO, format='%(message)s')
+
+# verify inputs and versions
+if termsplit[0] == " ":
+    sys.stderr.write("\nThe -y input term cannot start with a space. If you tried '-y -gene' then try '-y-gene' instead. Exiting.\n")
+    logging.info("\nThe -y input term cannot start with a space. If you tried '-y -gene' then try '-y-gene' instead. Exiting.\n")
+    exit(1)
+if outfile.rsplit(".",1)[1] == "log":
+    sys.stderr.write("\nThe output file cannot end in .log, suggested is .tsv. Exiting.\n")
+    logging.info("\nThe output file cannot end in .log, suggested is .tsv. Exiting.\n")
+    exit(1)
+if (samfile != '' and bamfile != '') or (samfile != '' and forward != '') or (bamfile != '' and forward != '') or (samfile != '' and sortbamfile != '') or (sortbamfile != '' and bamfile != '') or (forward != '' and sortbamfile != ''):
+    sys.stderr.write("\nOnly one input file (-s, -sb, -b, -r) is allowed. Exiting.\n")
+    logging.info("\nOnly one input file (-s, -sb, -b, -r) is allowed. Exiting.\n")
+    exit(1)
+try:
+    if (forward.rsplit('.',1)[1] != 'fastq' or reverse.rsplit('.',1)[1] != 'fastq') and ('.'.join(forward.rsplit('.',2)[1:]) != 'fastq.gz' or '.'.join(reverse.rsplit('.',2)[1:]) != 'fastq.gz'):
+        sys.stderr.write("\nError: Provided reads files must both have the extension .fastq or .fastq.gz. Exiting.\n")
+        logging.info("\nError: Provided reads files must both have the extension .fastq or .fastq.gz. Exiting.\n")
+        exit(1)
+except Exception:
+    pass
+try:
+    if forward != '' and reverse != '' and fasta.rsplit('.',1)[1] != 'fa' and fasta.rsplit('.',1)[1] != 'fna' and fasta.rsplit('.',1)[1] != 'fasta' and fasta.rsplit('.',1)[1] != 'fsa':
+        sys.stderr.write("\nError: Provided fasta file must have the extension .fasta, .fna, .fa or .fsa. Exiting.\n")
+        logging.info("\nError: Provided fasta file must have the extension .fasta, .fna, .fa or .fsa. 
Exiting.\n") + exit(1) +except Exception: + pass +if forward != '' and reverse != '' and fasta == '': + sys.stderr.write("\nError: Reads (-r) were provided but a sequence file (-f) was not. Exiting.\n") + logging.info("\nError: Reads (-r) were provided but a sequence file (-f) was not. Exiting.\n") + exit(1) +try: + if samfile.rsplit('.',1)[1] != 'sam': + sys.stderr.write("\nError: Provided sam file must have the extension .sam. Exiting.\n") + logging.info("\nError: Provided sam file must have the extension .sam. Exiting.\n") + exit(1) +except Exception: + pass +try: + if bamfile.rsplit('.',1)[1] != 'bam': + sys.stderr.write("\nError: Provided bam file must have the extension .bam. Exiting.\n") + logging.info("\nError: Provided bam file must have the extension .bam. Exiting.\n") + exit(1) +except Exception: + pass +try: + if sortbamfile.rsplit('.',1)[1] != 'bam': + sys.stderr.write("\nError: Provided bam (sorted) file must have the extension .bam. Exiting.\n") + logging.info("\nError: Provided bam (sorted) file must have the extension .bam. Exiting.\n") + exit(1) +except Exception: + pass + +if not os.path.exists(vibe): + sys.stderr.write("\nError: could not identify prophage coordinates file (-v). Exiting.\n") + logging.info("\nError: could not identify prophage coordinates file (-v). Exiting.\n") + exit(1) +if effect < 0.6: + sys.stderr.write("\nError: Cohen's d effect size should not be set below 0.6. Exiting.\n") + logging.info("\nError: Cohen's d effect size should not be set below 0.6. Exiting.\n") + exit(1) +if ratio_cutoff < 1.5: + sys.stderr.write("\nError: ratio cutoff should not be set below 1.5. Exiting.\n") + logging.info("\nError: ratio cutoff should not be set below 1.5. Exiting.\n") + exit(1) +if subs < 5: + sys.stderr.write("\nError: subsample number should not be set below 5. Exiting.\n") + logging.info("\nError: subsample number should not be set below 5. Exiting.\n") + exit(1) +try: + subprocess.check_output("which " + sampath + "samtools", shell=True) +except Exception: + sys.stderr.write("\nError: samtools does not appear to be installed, is not in the system's PATH or is not in the indicated path (-x). Exiting.\n") + logging.info("\nError: samtools does not appear to be installed, is not in the system's PATH or is not in the indicated path (-x). Exiting.\n") + exit(1) +try: + subprocess.check_output(sampath + "samtools depth -a placeholder.bam 2> " + outpath + u + "_samtools_checkfile.txt", shell=True) +except Exception as e: + with open(outpath + u + "_samtools_checkfile.txt",'r') as checkfile: + for line in checkfile: + if "depth: invalid option -- 'a'" in line: + sys.stderr.write("\nError: samtools version is incorrect. Please update and verify samtools '-a' flag is available.") + sys.stderr.write("\n Alternatively use the '-x' flag to provide the location of the correct samtools executable.") + sys.stderr.write("\n Example update using conda: 'conda install -c bioconda samtools'. Exiting.\n") + logging.info("\nError: samtools version is incorrect. 
Please update and verify samtools '-a' flag is available.") + logging.info("\n Alternatively use the '-x' flag to provide the location of the correct samtools executable.") + subprocess.run("rm " + outpath + u + "_samtools_checkfile.txt 2> /dev/null", shell=True) + exit(1) +subprocess.run("rm " + outpath + u + "_samtools_checkfile.txt 2> /dev/null", shell=True) + +if forward == '' and fasta != '': + logging.info("Note: Reads not provided, ignoring input fasta file.\n") +if forward == '' and threads != '1': + logging.info("Note: Reads not provided, using 1 thread.\n") +if forward == '' and id != '': + logging.info("Note: Reads not provided, ignoring identifier ID input.\n") + +# Statistics +def MWdownsampling(phage, background): + n = 1 + pvalue_list = [] + while n <= subs: + cov_p_100 = random.sample(phage, 100) + cov_h_100 = random.sample(background, 100) + if sum(cov_p_100) != 0 and sum(cov_h_100) != 0: + stat, pvalue = mannwhitneyu(cov_p_100, cov_h_100) + pvalue_list.append(pvalue) + n += 1 + result = combine_pvalues(pvalue_list) + return result + +def cohenD(phage_mean, phage_sd, host_mean, host_sd): + pool = math.sqrt((phage_sd**2+host_sd**2)/2) + d = (host_mean-phage_mean)/pool + return abs(d) + + +##### ----------------------------------------------------------------------------------------------------------------------- ##### +logging.info("Command: %s" % ' '.join(sys.argv)) +logging.info("Outfolder: %s" % outpath) +logging.info("") +logging.info("Date: %s" % str(date.today())) +logging.info("Time: %s" % str(datetime.datetime.now().time()).rsplit(".",1)[0]) +logging.info("Program: PropagAtE v1.0.0") +logging.info("Run ID: %s\n" % u) + +logging.info("Time (min) | Log ") +logging.info("--------------------------------------------------------------------") + +reads_input = False +sam_input = False +bam_input = False +sortbam_input = False + +# If input is reads/fasta run Bowtie2 +if forward != '' and fasta != '': + reads_input = True + logging.info("%s Reads input identified, using %s threads to run Bowtie2." % (str(round((time.time() - float(start))/60,1)),threads)) + + try: + subprocess.check_output("which bowtie2", shell=True) + except Exception: + sys.stderr.write("\nError: Bowtie2 does not appear to be installed or is not in the system's PATH. Exiting.\n") + logging.info("\nError: Bowtie2 does not appear to be installed or is not in the system's PATH. Exiting.\n") + exit(1) + try: + temp = fasta.rsplit("/",1)[1] + base = temp.rsplit(".",1)[0] + except Exception: + base = fasta.rsplit(".",1)[0] + + logging.info("%s Checking format of FASTA file, replacing spaces if necessary." % str(round((time.time() - float(start))/60,1))) + with open(fasta, 'r') as fasta_in, open(outpath + base + ".no-spaces.fasta", 'w') as fasta_out: + for name,seq in SimpleFastaParser(fasta_in): + if " " in name: + replace = True + name = name.replace(" ", str(spaces)) + fasta_out.write(">" + str(name) + "\n" + str(seq) + "\n") + if replace == True: + logging.info('%s Spaces identified in sequence names. Replaced with "~~".' % str(round((time.time() - float(start))/60,1))) + fasta = outpath + base + ".no-spaces.fasta" + + if not os.path.exists(outpath + str(base)+".bowtie2.index.1.bt2"): + logging.info("%s Building index ..." % str(round((time.time() - float(start))/60,1))) + build = True + subprocess.run("bowtie2-build " + str(fasta) + " " + outpath + str(base)+".bowtie2.index > /dev/null 2> /dev/null", shell=True) + else: + build = False + logging.info("%s Existing index identified, skipping build ..." 
% str(round((time.time() - float(start))/60,1))) + logging.info("%s Mapping reads ..." % str(round((time.time() - float(start))/60,1))) + if id != '': + subprocess.run("bowtie2 -x " + outpath + str(base)+".bowtie2.index -1 " + str(forward) + " -2 " + str(reverse) + " -S " + outpath + str(base)+"." + str(id) + ".sam -q -p " + str(threads) + " --no-unal > /dev/null 2> /dev/null", shell=True) + else: + subprocess.run("bowtie2 -x " + outpath + str(base)+".bowtie2.index -1 " + str(forward) + " -2 " + str(reverse) + " -S " + outpath + str(base)+".sam -q -p " + str(threads) + " --no-unal > /dev/null 2> /dev/null", shell=True) + if clean == True and build == True: + subprocess.run("rm " + outpath + str(base)+".bowtie2.index.* 2> /dev/null", shell=True) + logging.info("%s Bowtie2 done." % str(round((time.time() - float(start))/60,1))) + if str(id) != "": + samfile = outpath + str(base)+"." + str(id) + ".sam" + else: + samfile = outpath + str(base)+".sam" + subprocess.run("rm " + outpath + base + ".no-spaces.fasta 2> /dev/null", shell=True) + +# Read in prophage coordinate data +prophage_dict = {} +prophage_dict_frags = {} +vibrant_genomes_full = [] +logging.info("%s Generating a list of all prophage regions ..." % str(round((time.time() - float(start))/60,1))) +logging.info("%s Distinguishing host from prophage names by the term '%s' ..." % (str(round((time.time() - float(start))/60,1)),str(termsplit))) +with open(vibe, 'r') as vibrant: + phage_list = vibrant.read().replace("\n","\t").split("\t") + if phage_list[0] == "scaffold" and phage_list[1] == "fragment" and phage_list[5] == "nucleotide start" and phage_list[6] == "nucleotide stop": + n = 9 + while n < len(phage_list): + name = str(phage_list[n]).split(termsplit)[0] + if replace == True: + name = name.replace(" ", str(spaces)) + if name[0] == "'" or name[0] == '"': + name = name[1:] + if replace == True: + frag = str(phage_list[n]).replace(" ", str(spaces)) + else: + frag = str(phage_list[n]) + if frag[0] == "'" or frag[0] == '"': + frag = frag[1:-1] + + if n == 9: # in case of error + save_name = name + save_frag = frag + if name != '' and frag != '' and name == frag: # names were unable to be split + sys.stderr.write("\nError: prophage names could not be distinguished from host names. Check accuracy of -y flag input. Below are the names that caused the issue. Exiting.\n") + sys.stderr.write("Host name detected: %s\n" % name) + sys.stderr.write("Prophage name detected: %s\n" % frag) + sys.stderr.write("-y flag input detected: %s\n" % termsplit) + logging.info("\nError: prophage names could not be distinguished from host names. Check accuracy of -y flag input. Below are the names that caused the issue. 
Exiting.\n") + logging.info("Host name detected: %s" % name) + logging.info("Prophage name detected: %s" % frag) + logging.info("-y flag input detected: %s" % termsplit) + exit(1) + + prophage_dict.update({str(name) + "~" + str(phage_list[n+4]):str(frag)}) + prophage_dict.update({str(name) + "~" + str(phage_list[n+5]):str(frag)}) + prophage_dict_frags.update({str(name):str(frag)}) + vibrant_genomes_full.append(str(name)) + + n += 8 + + elif phage_list[0] == "scaffold" and phage_list[1] == "fragment" and phage_list[2] == "start" and phage_list[3] == "stop": + n = 5 + while n < len(phage_list): + name = str(phage_list[n]).split(termsplit)[0] + if replace == True: + name = name.replace(" ", str(spaces)) + if name[0] == "'" or name[0] == '"': + name = name[1:] + if replace == True: + frag = str(phage_list[n]).replace(" ", str(spaces)) + else: + frag = str(phage_list[n]) + if frag[0] == "'" or frag[0] == '"': + frag = frag[1:-1] + + if n == 5: # in case of error + save_name = name + save_frag = frag + if name != '' and frag != '' and name == frag: # names were unable to be split + sys.stderr.write("\nError: prophage names could not be distinguished from host names. Check accuracy of -y flag input. Below are the names that caused the issue. Exiting.\n") + sys.stderr.write("Host name detected: %s\n" % name) + sys.stderr.write("Prophage name detected: %s\n" % frag) + sys.stderr.write("-y flag input detected: %s\n" % termsplit) + logging.info("\nError: prophage names could not be distinguished from host names. Check accuracy of -y flag input. Below are the names that caused the issue. Exiting.\n") + logging.info("Host name detected: %s" % name) + logging.info("Prophage name detected: %s" % frag) + logging.info("-y flag input detected: %s" % termsplit) + exit(1) + + prophage_dict.update({str(name) + "~" + str(phage_list[n+1]):str(frag)}) + prophage_dict.update({str(name) + "~" + str(phage_list[n+2]):str(frag)}) + prophage_dict_frags.update({str(name):str(frag)}) + vibrant_genomes_full.append(str(name)) + + n += 4 + + else: + sys.stderr.write("\nError: -v coordinates file appears to be incorrect. Either input a VIBRANT 'integrated_prophage_coordinates' file or a custom tab separated coordinates file with the header 'scaffold/fragment/start/stop'. Exiting.\n") + logging.info("\nError: -v coordinates file appears to be incorrect. Either input a VIBRANT 'integrated_prophage_coordinates' file or a custom tab separated coordinates file with the header 'scaffold/fragment/start/stop'. Exiting.\n") + exit(1) + + number_hosts = list(set(vibrant_genomes_full)) + logging.info("%s Number of prophage regions identified: %s" % (str(round((time.time() - float(start))/60,1)),len(vibrant_genomes_full))) + logging.info("%s Number of unique host regions identified: %s" % (str(round((time.time() - float(start))/60,1)),len(number_hosts))) + vibrant_genomes_full = list(set(vibrant_genomes_full)) + +###### Make a bin list not scaffold list + +bin_list = [] + +for i in vibrant_genomes_full: + name = str(i).split(binsplit)[0] + bin_list.append(str(name)) + +bin_list = list(set(bin_list)) + +# process sam/bam files +logging.info("%s Processing alignment (.sam/.bam) file." 
% str(round((time.time() - float(start))/60,1))) + +# Gap/mismatch processing for sam file, convert sam to bam +processed = False +if samfile != '': + sam_input = True + try: + temp = samfile.rsplit("/",1)[1] + base = temp.rsplit(".",1)[0] + except Exception: + base = samfile.rsplit(".",1)[0] + + if args_all == False: + processed = True + logging.info("%s Alignment .sam input identified, processing gaps/mismatches ..." % str(round((time.time() - float(start))/60,1))) + tossed = 0 + kept = 0 + with open(samfile, 'r') as sf, open(outpath + str(base)+'.gap-' + str(gaps) + '_mm-' + str(mismatches) + '.sam', 'w') as psf: + for line in sf: + if "XG:i:" in line and "XM:i:" in line: + xg = line.split("\t")[15] + xg = xg.split(":")[2] + xm = line.split("\t")[13] + xm = xm.split(":")[2] + if int(xg) <= gaps and int(xm) <= mismatches: + psf.write(line) + kept += 1 + else: + tossed += 1 + else: + psf.write(line) + if reads_input == True and clean == True: + subprocess.run("rm " + str(samfile), shell=True) # remove non-processed sam + samfile = outpath + str(base)+'.gap-' + str(gaps) + '_mm-' + str(mismatches) + '.sam' + try: + temp = samfile.rsplit("/",1)[1] + base = temp.rsplit(".",1)[0] + except Exception: + base = samfile.rsplit(".",1)[0] + logging.info("%s %s of %s total aligned reads were tossed for having too many gaps/mismatches ..." % (str(round((time.time() - float(start))/60,1)),str(tossed),str(tossed+kept))) + else: + logging.info("%s Alignment .sam input identified, skipping gaps/mismatches processing ..." % str(round((time.time() - float(start))/60,1))) + + logging.info("%s Converting .sam to .bam ..." % str(round((time.time() - float(start))/60,1))) + subprocess.run(sampath + "samtools view -S -b " + str(samfile) + " > " + outpath + str(base) + ".bam 2> /dev/null", shell=True) + logging.info("%s Conversion done." % str(round((time.time() - float(start))/60,1))) + bamfile = outpath + str(base) + ".bam" + +samfile_cleaned = False +if reads_input == True and clean == True: + samfile_cleaned = True + subprocess.run("rm " + str(samfile), shell=True) # remove processed or non-processed sam + +# Gap/mismatch processing for bam file +if bamfile != '': + bam_input = True + if sam_input == False and args_all == False: + try: + temp = bamfile.rsplit("/",1)[1] + base = temp.rsplit(".",1)[0] + except Exception: + base = bamfile.rsplit(".",1)[0] + logging.info("%s Alignment .bam input identified, coverting to .sam for gap/mismatch filtering ..." % str(round((time.time() - float(start))/60,1))) + subprocess.run(sampath + "samtools view -h -o " + outpath + str(base) + ".sam " + str(bamfile) + " 2> /dev/null", shell=True) + + samfile = outpath + str(base) + ".sam" + processed = True + logging.info("%s Alignment .sam input identified, processing gaps/mismatches ..." 
% str(round((time.time() - float(start))/60,1))) + tossed = 0 + kept = 0 + with open(samfile, 'r') as sf, open(outpath + str(base)+'.gap-' + str(gaps) + '_mm-' + str(mismatches) + '.sam', 'w') as psf: + for line in sf: + if "XG:i:" in line and "XM:i:" in line: + xg = line.split("\t")[15] + xg = xg.split(":")[2] + xm = line.split("\t")[13] + xm = xm.split(":")[2] + if int(xg) <= gaps and int(xm) <= mismatches: + psf.write(line) + kept += 1 + else: + tossed += 1 + else: + psf.write(line) + if clean == True: + subprocess.run("rm " + str(samfile), shell=True) # remove non-processed sam + samfile = outpath + str(base)+'.gap-' + str(gaps) + '_mm-' + str(mismatches) + '.sam' + try: + temp = samfile.rsplit("/",1)[1] + base = temp.rsplit(".",1)[0] + except Exception: + base = samfile.rsplit(".",1)[0] + logging.info("%s %s of %s total aligned reads were tossed for having too many gaps/mismatches ..." % (str(round((time.time() - float(start))/60,1)),str(tossed),str(tossed+kept))) + logging.info("%s Converting processed .sam to .bam ..." % str(round((time.time() - float(start))/60,1))) + subprocess.run(sampath + "samtools view -S -b " + str(samfile) + " > " + outpath + str(base) + ".bam 2> /dev/null", shell=True) + logging.info("%s Conversion done." % str(round((time.time() - float(start))/60,1))) + bamfile = outpath + str(base) + ".bam" + + if clean == True: + samfile_cleaned = True + subprocess.run("rm " + str(samfile), shell=True) # remove processed or non-processed sam + + if sam_input == False and args_all == True: + logging.info("%s Alignment .bam input identified, sorting .bam file ..." % str(round((time.time() - float(start))/60,1))) + try: + temp = bamfile.rsplit("/",1)[1] + base = temp.rsplit(".",1)[0] + except Exception: + base = bamfile.rsplit(".",1)[0] + subprocess.run(sampath + "samtools sort -o " + outpath + str(base) + ".sorted.bam " + str(bamfile) + " 2> /dev/null", shell=True) + logging.info("%s Sorting done." % str(round((time.time() - float(start))/60,1))) + sortbam = outpath + str(base) + ".sorted.bam" + +bamfile_cleaned = False +if sam_input == True and clean == True: + bamfile_cleaned = True + subprocess.run("rm " + str(bamfile), shell=True) + +# Gap/mismatch processing for sorted bam file +if sortbamfile != '': + if args_all == False and processed == False: + try: + temp = sortbamfile.rsplit("/",1)[1] + base = temp.rsplit(".",1)[0] + except Exception: + base = sortbamfile.rsplit(".",1)[0] + + logging.info("%s Sorted .bam input identified, coverting to .sam for gap/mismatch filtering ..." % str(round((time.time() - float(start))/60,1))) + subprocess.run(sampath + "samtools view -h -o " + outpath + str(base) + ".sam " + str(sortbamfile) + " 2> /dev/null", shell=True) + samfile = outpath + str(base) + ".sam" + processed = True + logging.info("%s Processing gaps/mismatches ..." 
% str(round((time.time() - float(start))/60,1))) + tossed = 0 + kept = 0 + with open(samfile, 'r') as sf, open(outpath + str(base)+'.gap-' + str(gaps) + '_mm-' + str(mismatches) + '.sam', 'w') as psf: + for line in sf: + if "XG:i:" in line and "XM:i:" in line: + xg = line.split("\t")[15] + xg = xg.split(":")[2] + xm = line.split("\t")[13] + xm = xm.split(":")[2] + if int(xg) <= gaps and int(xm) <= mismatches: + psf.write(line) + kept += 1 + else: + tossed += 1 + else: + psf.write(line) + + if clean == True: + subprocess.run("rm " + str(samfile), shell=True) # remove non-processed sam + samfile = outpath + str(base)+'.gap-' + str(gaps) + '_mm-' + str(mismatches) + '.sam' + try: + temp = samfile.rsplit("/",1)[1] + base = temp.rsplit(".",1)[0] + except Exception: + base = samfile.rsplit(".",1)[0] + logging.info("%s %s of %s total aligned reads were tossed for having too many gaps/mismatches ..." % (str(round((time.time() - float(start))/60,1)),str(tossed),str(tossed+kept))) + + logging.info("%s Converting processed .sam to .bam ..." % str(round((time.time() - float(start))/60,1))) + subprocess.run(sampath + "samtools view -S -b " + str(samfile) + " > " + outpath + str(base) + ".bam 2> /dev/null", shell=True) + logging.info("%s Conversion done." % str(round((time.time() - float(start))/60,1))) + bamfile = outpath + str(base) + ".bam" + + if clean == True: + samfile_cleaned = True + subprocess.run("rm " + str(samfile), shell=True) # remove processed or non-processed sam + + logging.info("%s Sorting .bam file ..." % str(round((time.time() - float(start))/60,1))) + try: + temp = bamfile.rsplit("/",1)[1] + base = temp.rsplit(".",1)[0] + except Exception: + base = bamfile.rsplit(".",1)[0] + subprocess.run(sampath + "samtools sort -o " + outpath + str(base) + ".sorted.bam " + str(bamfile) + " 2> /dev/null", shell=True) + logging.info("%s Sorting done." % str(round((time.time() - float(start))/60,1))) + sortbam = outpath + str(base) + ".sorted.bam" + if clean == True: + samfile_cleaned = True + subprocess.run("rm " + str(bamfile), shell=True) # remove processed or non-processed sam + + sortbam = sortbamfile + if args_all == True: + try: + temp = sortbam.rsplit("/",1)[1] + base = temp.rsplit(".",1)[0] + except Exception: + base = sortbam.rsplit(".",1)[0] + logging.info("%s Sorted alignment .bam input identified, skipping sorting and gap/mismatch processing." % str(round((time.time() - float(start))/60,1))) + +# Extract coverage for each nucleotide +temp = outpath + str(base) + ".temp-list.txt" +subprocess.run("echo '" + str(sortbam) + "' > " + str(temp), shell=True) +cov_file = str(sortbam).rsplit(".",1)[0] + ".coverage-by-nt.tsv" +logging.info("%s Indexing sorted .bam file ..." % str(round((time.time() - float(start))/60,1))) +subprocess.run(sampath + "samtools index " + str(sortbam), shell=True) # index bam file +logging.info("%s Extracting coverage information from sorted .bam file ..." % str(round((time.time() - float(start))/60,1))) + +# take the whole scaffold +for genome in vibrant_genomes_full: + + subprocess.run(sampath + "samtools depth -r " + str(genome) + ":0-999999999 -a -f " + str(temp) + " >> " + cov_file + " 2> /dev/null", shell=True) # extract coverage info + +#subprocess.run("rm " + str(temp) + " 2> /dev/null", shell=True) +#subprocess.run("rm " + str(sortbam) + ".bai 2> /dev/null", shell=True) # remove index +subprocess.run("echo 'next\t0\t0\n' >> " + str(cov_file), shell=True) +logging.info("%s Coverage extraction done." 
% str(round((time.time() - float(start))/60,1))) + +##### Make a list of all the scafolds + +scaffold_file = str(sortbam).rsplit(".",1)[0] + ".scaffold_list.tsv" +scaffold_cov_file = str(sortbam).rsplit(".",1)[0] + ".scaffold-coverage-by-nt.tsv" +subprocess.run(sampath + "samtools idxstats " + str(sortbam) + " | cut -f 1 >> " + scaffold_file + " 2> /dev/null", shell=True) +scaffold_list = [] + +a_file = open(scaffold_file, "r") + +for line in a_file: + stripped_line = line.strip() + scaffold_list.append(stripped_line) + +a_file.close() + + +##### Take coverage for all scaffolds not just those that have prophages + + +for scaffold in scaffold_list: + + subprocess.run(sampath + "samtools depth -r " + str(scaffold) + ":0-999999999 -a -f " + str(temp) + " >> " + scaffold_cov_file + " 2> /dev/null", shell=True) # extract coverage info + +subprocess.run("echo 'next\t0\t0' >> " + str(scaffold_cov_file), shell=True) + +if int(os.stat(cov_file).st_size) < 50: # Exit if file is empty (< 50 bytes) + if samfile == "": + samfile = "N/A" + if bamfile == "": + bamfile = "N/A" + subprocess.run("rm " + str(cov_file), shell=True) + logging.info("%s No coverage identified for scaffolds with integrated prophages. Analysis finished." % str(round((time.time() - float(start))/60,1))) + logging.info("") + logging.info("") + logging.info("CAUTION:") + logging.info(" 'No coverage identified' may be due to an incorrect -y flag.") + logging.info(" Please verify -y input correctly distinguishes host from prophage sequences.") + logging.info(" This can be verified by checking the -v or -f inputs.") + logging.info(" Example host name detected: %s" % save_name) + logging.info(" Example prophage name detected: %s" % save_frag) + logging.info(" -y flag input: %s" % termsplit) + logging.info("") + logging.info("") + logging.info("Log file: %s" % logfilename) + logging.info("Output results file: N/A") + if samfile_cleaned == False: + logging.info("sam file: %s" % samfile) + else: + logging.info("sam file: N/A") + if bamfile_cleaned == False: + logging.info("bam file: %s" % bamfile) + else: + logging.info("bam file: N/A") + logging.info("sorted bam file: %s" % sortbam) + logging.info("") + logging.info("") + logging.info("Number of active prophages identified: 0") + logging.info("") + logging.info(' ##') + logging.info(' ## ##') + logging.info(' ## ##') + logging.info('###### ## ## ## ####### ###### ##### ## ##') + logging.info('## ## ## ## ## ## ## ## ## ## ##') + logging.info('###### ###### ###### ## ### ###### ### ##') + logging.info('## ## ## ## ## ## ## ## ## ##') + logging.info('## ## ## ## ## ####### ###### ##### ##') + logging.info(' # ## #') + logging.info(' # # ## # #') + logging.info(' # # # #') + logging.info(' # #') + logging.info("") + exit(0) + +# calculate average read length +total = subprocess.check_output(sampath + "samtools view -c " + str(sortbam), shell=True) # if there are less than 10k reads, use total reads to length check +total = int(float(str(total.strip()).split("'")[1])) +if int(total) < 10000: + head = total +else: + head = 10000 +length = subprocess.check_output(sampath + "samtools view " + str(sortbam) + " | head -n " + str(head) + " | cut -f 10 | wc -c", shell=True) +length = int(float(str(length.strip()).split("'")[1])/int(head)) +logging.info("%s Number of valid reads used: %s." % (str(round((time.time() - float(start))/60,1)),str(total))) +logging.info("%s Estimated read length to trim by: %s." 
% (str(round((time.time() - float(start))/60,1)),str(length))) + + + + +### Calculate the cover for every scaffold +scaffold_coverage = {} +b_file = open(scaffold_cov_file, "r") + +prev_genome = '' +host = False +temp_host = [] + +for line in b_file: + line = line.strip() + + + if str(line.split("\t")[0]) != str(prev_genome): + + if host == True: # scaffold is done. Update coverage + + if len(temp_host) > 0: + scaffold_coverage.update({str(prev_genome):(temp_host[:])}) # trim the last N nucleotides from the last section added + host = False + temp_host = [] + prev_genome = '' + + + if (line.split("\t")[0] != "next"): + + prev_genome = str(line.split("\t")[0]) + temp_host.append(str(line.split("\t")[0])) + + temp_host.append(str(line.split("\t")[2]).strip("\n")) + host = True + + + + else: + + temp_host.append(str(line.split("\t")[2]).strip("\n")) + +b_file.close() + +###Add up all the coverages of scaffolds per bin +bin_coverage = {} +for i in bin_list: + bin_coverage.update({str(i):[]}) + +for key in scaffold_coverage.keys(): + host = str(key.split(binsplit)[0]) + + if host in bin_coverage.keys(): + + old_line = bin_coverage[host][:] + new_line = scaffold_coverage[key][1:] + combined_list = old_line + new_line + bin_coverage[host] = combined_list + + +##I will still run this to get a dictionary of "prophage coverage" + +#grab only relevant genomes and count genome lengths +prophage_coverage = {} +host_coverage = {} +temp_host = [] +temp_phage = [] +with open(cov_file, 'r') as covfile: + logging.info("%s Identifying coverages for host/prophage regions ..." % str(round((time.time() - float(start))/60,1))) + prev_genome = '' + n = 0 + host = False + phage = False + host_start = False + phage_start = False + host_name = False + phage_name = False + for line in covfile: + if str(line.split("\t")[0]) != str(prev_genome): + # scaffold is done. Update coverage + if host == True: + if len(temp_host) > 0: + host_coverage.update({str(prev_genome):temp_host[:-length]}) # trim the last N nucleotides from the last section added + if len(temp_phage) > 0: + prophage_coverage.update({prophage_dict_frags[str(prev_genome)]:temp_phage}) + elif phage == True: + if len(temp_phage) > 0: + prophage_coverage.update({prophage_dict_frags[str(prev_genome)]:temp_phage[:-length]}) # trim the last N nucleotides from the last section added + if len(temp_host) > 0: + host_coverage.update({str(prev_genome):temp_host}) + n = 0 + temp_phage = [] + temp_host = [] + host = False + phage = False + host_name = False + phage_name = False + prev_genome = str(line.split("\t")[0]) + + n += 1 # needs to come after previous statement of writing and length counting + + if (line.split("\t")[0] in vibrant_genomes_full or line.split("\t")[0] == "next"): # trim by first length, only include phages, "next" is for including last scaffold + if str(line.split("\t")[0]) + "~" + str(line.split("\t")[1]) not in prophage_dict.keys() and n == 1: # is the first nucleotide host? 
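+                # Note on the parsing below: prophage_dict is keyed by
+                # "scaffold~position" for each recorded prophage start/stop
+                # coordinate, so membership tests against it mark the boundaries
+                # where the state machine switches between host and prophage
+                # regions; at n == 1 they decide whether a scaffold opens in a
+                # host or a prophage region.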
+ if host_name == False: + temp_host.append(str(line.split("\t")[0])) + host_name = True + temp_host.append(str(line.split("\t")[2]).strip("\n")) + host = True + phage = False + elif str(line.split("\t")[0]) + "~" + str(line.split("\t")[1]) in prophage_dict.keys() and n == 1: + if phage_name == False: + temp_phage.append(prophage_dict[str(line.split("\t")[0]) + "~" + str(line.split("\t")[1])]) + phage_name = True + temp_phage.append(str(line.split("\t")[2]).strip("\n").strip("\n")) + phage = True + host = False + + if str(line.split("\t")[0]) + "~" + str(line.split("\t")[1]) not in prophage_dict.keys() and n > 1: + if phage == True: + if phage_name == False: + temp_phage.append(prophage_dict[str(line.split("\t")[0]) + "~" + str(line.split("\t")[1])]) + phage_name = True + temp_phage.append(str(line.split("\t")[2]).strip("\n")) + elif host == True: + if host_name == False: + temp_host.append(str(line.split("\t")[0])) + host_name = True + temp_host.append(str(line.split("\t")[2]).strip("\n")) + + if str(line.split("\t")[0]) + "~" + str(line.split("\t")[1]) in prophage_dict.keys() and n > 1: # is the nucleotide phage? + if host == True: # must mean that host is finished and phage starting + host = False + phage = True + if phage_name == False: + temp_phage.append(prophage_dict[str(line.split("\t")[0]) + "~" + str(line.split("\t")[1])]) + phage_name = True + temp_phage.append(str(line.split("\t")[2]).strip("\n")) + + elif phage == True: # phage must be done + if phage_name == False: + temp_phage.append(prophage_dict[str(line.split("\t")[0]) + "~" + str(line.split("\t")[1])]) + phage_name = True + temp_phage.append(str(line.split("\t")[2]).strip("\n")) + prophage_coverage.update({prophage_dict[str(line.split("\t")[0]) + "~" + str(line.split("\t")[1])]:temp_phage}) # write to phage coverage, reset phage + temp_phage = [] + host = True + phage = False + phage_name = False + +# Statistics and metrics + +### First I will make the statistics per bacterial bin to save some time: +bin_stats = {} +for key in bin_coverage: + + cov_h = ([int(i) for i in bin_coverage[key][1:]]) + avg_h = (statistics.mean(cov_h)) + sd_h = (statistics.stdev(cov_h)) + med_h = (statistics.median(cov_h)) + stat_collection = [cov_h, avg_h, sd_h, med_h] + bin_stats.update({str(key):stat_collection}) + #host_coverage.update({str(prev_genome):temp_host}) + + + +###I modified this section so that I can add in my host statistics + +logging.info("%s Coverage identification done." % str(round((time.time() - float(start))/60,1))) +logging.info("%s Performing statistical tests ..." 
% str(round((time.time() - float(start))/60,1)))
+total = []
+total_adj = []
+with open(outfile, 'w') as output:
+    output.write("prophage\thost\tactive\tMW_pvalue\tCohenD\tprophage-host_ratio\tmean_difference\tprophage_len\tprophage_mean_cov\tprophage_median_cov\tprophage_sd_cov\thost_len\thost_mean_cov\thost_median_cov\thost_sd_cov\n")
+    for key in prophage_coverage.keys():
+        host = str(key.split(binsplit)[0])
+        if host in bin_coverage.keys():
+            cov_p = [int(i) for i in prophage_coverage[key][1:]]
+            cov_h = bin_stats[host][0]
+
+            avg_p = statistics.mean(cov_p)
+            sd_p = statistics.stdev(cov_p)
+            avg_h = bin_stats[host][1]
+            sd_h = bin_stats[host][2]
+            # if outliers > 0:
+            #     cov_p = [x for x in cov_p if x >= (avg_p-outliers*sd_p) and x <= (avg_p+outliers*sd_p)]
+            #     cov_h = [x for x in cov_h if x >= (avg_h-outliers*sd_h) and x <= (avg_h+outliers*sd_h)]
+            #     #
+            #     avg_p = statistics.mean(cov_p)
+            #     sd_p = statistics.stdev(cov_p)
+            #     avg_h = statistics.mean(cov_h)
+            #     sd_h = statistics.stdev(cov_h)
+            med_h = bin_stats[host][3]  # index 3 of stat_collection is the median (index 2 is the standard deviation)
+            med_p = statistics.median(cov_p)
+            mean_diff = avg_p - avg_h
+            result = MWdownsampling(cov_p, cov_h)
+            if mean_diff > 0:
+                cd = cohenD(avg_p, sd_p, avg_h, sd_h)
+            else:
+                cd = 0
+            host_zero_ratio = False
+            if avg_h > 0:
+                ratio = avg_p/avg_h
+            else:
+                ratio = avg_p
+                if avg_p >= 0.5:
+                    host_zero_ratio = True
+            #
+            active = "no"
+            if (float(result[1]) <= float(pval) or str(result[1]) == 'nan') and cd >= effect and mean_diff > 0 and (ratio >= ratio_cutoff or host_zero_ratio == True):
+                active = "yes"
+                total.append("yes")
+            if len(cov_h) < 1000 or len(cov_p) < 1000: # minimum host/phage length is 1000 bp
+                active = 'NA'
+                active_adj = 'NA'
+            if replace == True:
+                output.write(str(key).replace(str(spaces), " ")+"\t"+str(host).replace(str(spaces), " ")+"\t"+str(active)+"\t"+str(result[1])+"\t"+str(cd)+"\t"+str(ratio)+"\t"+str(mean_diff)+"\t"+str(len(cov_p))+"\t"+str(avg_p)+"\t"+str(med_p)+"\t"+str(sd_p)+"\t"+str(len(cov_h))+"\t"+str(avg_h)+"\t"+str(med_h)+"\t"+str(sd_h)+"\n")
+            else:
+                output.write(str(key)+"\t"+str(host)+"\t"+str(active)+"\t"+str(result[1])+"\t"+str(cd)+"\t"+str(ratio)+"\t"+str(mean_diff)+"\t"+str(len(cov_p))+"\t"+str(avg_p)+"\t"+str(med_p)+"\t"+str(sd_p)+"\t"+str(len(cov_h))+"\t"+str(avg_h)+"\t"+str(med_h)+"\t"+str(sd_h)+"\n")
+
+
+
+        else:
+            if replace == True:
+                output.write(str(key).replace(str(spaces), " ")+"\t"+str(host).replace(str(spaces), " ")+"\tno_coverage\tna\tna\tna\tna\tna\tna\tna\tna\tna\tna\tna\tna\n")
+            else:
+                output.write(str(key)+"\t"+str(host)+"\tno_coverage\tna\tna\tna\tna\tna\tna\tna\tna\tna\tna\tna\tna\n")
+
+if samfile == "":
+    samfile = "N/A"
+if bamfile == "":
+    bamfile = "N/A"
+
+subprocess.run("rm " + str(cov_file), shell=True)
+logging.info("%s Statistical tests done. Analysis finished." 
% str(round((time.time() - float(start))/60,1))) +logging.info("") +logging.info("") +logging.info("Log file: %s" % logfilename) +logging.info("Output results file: %s" % outfile) +if samfile_cleaned == False: + logging.info("sam file: %s" % samfile) +else: + logging.info("sam file: N/A") +if bamfile_cleaned == False: + logging.info("bam file: %s" % bamfile) +else: + logging.info("bam file: N/A") +logging.info("sorted bam file: %s" % sortbam) +logging.info("") +logging.info("") +logging.info("Number of active prophages identified: %s" % len(total)) +logging.info("") +logging.info(' ##') +logging.info(' ## ##') +logging.info(' ## ##') +logging.info('###### ## ## ## ####### ###### ##### ## ##') +logging.info('## ## ## ## ## ## ## ## ## ## ##') +logging.info('###### ###### ###### ## ### ###### ### ##') +logging.info('## ## ## ## ## ## ## ## ## ##') +logging.info('## ## ## ## ## ####### ###### ##### ##') +logging.info(' # ## #') +logging.info(' # # ## # #') +logging.info(' # # # #') +logging.info(' # #') +logging.info("") + + +# +# +# diff --git a/VLP_analysis.R b/VLP_analysis.R new file mode 100644 index 0000000..18b7445 --- /dev/null +++ b/VLP_analysis.R @@ -0,0 +1,1042 @@ +#Open packages +library(dplyr) #Version ‘1.0.9’ +library(ggplot2) #Version ‘3.3.6’ +library(tidyr) #Version ‘1.2.0’ +library(stringr) #Version ‘1.4.0’ +library(permute) #Version ‘0.9.7’ +library(vegan) #Version ‘2.6.2’ +library(data.table) #Version ‘1.14.2’ +library(lattice) #Version ‘0.20.45’ +library(rstatix) #Version ‘0.7.0’ +library(grid) #Version ‘4.2.1’ +library(DESeq2) #Version 1.36.0’ +library(car) #Version ‘3.1.0’ +library(ape) #Version ‘5.6.2’ +library(reshape2) #Version ‘1.4.4’ +library(IRanges) #Version ‘2.30.0’ +library(GenomicRanges) #Version ‘1.48.0’ + +#Working from data +setwd("VLP_analysis/") +#Open all files in a data directory +temp <- list.files(pattern="*_readcount") #Make a temp variable for each of the readcount files +SRR_names <- substring(temp,1,9) #Make a vector of the SRR run by name in order files of temp +myfiles <- lapply(temp, function(x) read.delim(x, header = FALSE)) #Opens up each file +myfiles2 <- do.call(cbind, myfiles) #Binds all the files together in one dataframe +colnames(myfiles2) <- 1:48 #Give each column a number +counts_df <- myfiles2[,seq(2,48,2)] #Only odd numbered columns have read data for some reason +colnames(counts_df) <- SRR_names #The runs are in ordered based on temp file so name them +Contig <- myfiles2[,1] #use first column to get contigs name +bacterial_bin <- read.csv(file = '../GTDB-Tk_DAS_check_bin_modified.csv', header = FALSE) +demovir <- read.table(file = "demovir_taxonomy.tabular", sep = '\t', header = TRUE) +runinfo <- read.csv("../sequence_runs.csv", header = TRUE) + +count_matrix <- cbind(Contig, counts_df) + +totalNumReads <- read.delim("totalNumReads", header = FALSE, col.names = c("SRR", "Num_Reads")) +contigLength <- read.delim("ContigLength", header = FALSE, col.names = c("Contig", "Length")) + +### RPKM #### + +#reads per million +totalNumReads <- totalNumReads %>% + mutate(readsPM = Num_Reads/1000000) + +#contig per KB +contigLength <- contigLength %>% + mutate(contigLengthperKB = Length/1000) + +#Merge based on contig data +merged_count_matrix <- merge(contigLength, count_matrix, by="Contig") + +#Divide counts by contigLength per Kb +RPK <- merged_count_matrix[,4:27]/merged_count_matrix$contigLengthperKB +contignames <- merged_count_matrix$Contig +temp <- RPK +for (row in 1:nrow(totalNumReads)){ + index_name <- (totalNumReads$SRR[row]) + 
+
+#Merge based on contig data
+merged_count_matrix <- merge(contigLength, count_matrix, by="Contig")
+
+#Divide counts by contig length per kb
+RPK <- merged_count_matrix[,4:27]/merged_count_matrix$contigLengthperKB
+contignames <- merged_count_matrix$Contig
+temp <- RPK
+#Divide each sample's column by that sample's reads-per-million to finish the RPKM calculation
+for (row in 1:nrow(totalNumReads)){
+  index_name <- (totalNumReads$SRR[row])
+  index_number <- match(index_name, colnames(RPK))
+  new_column <- RPK[,index_number]/totalNumReads$readsPM[row]
+  temp[,index_number] <- new_column}
+
+RPKM <- cbind(contignames,temp)
+
+setwd('../results')
+write.table(RPKM, 'RPKM.tsv', sep = "\t", row.names=FALSE)
+
+
+#Heatmap
+
+#1 Get it in the right format (x-axis: day/seqrun y-axis: contig z: RPKM)
+RPKM <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+
+RPKM.long <- pivot_longer(data = RPKM, cols = -c(1), names_to = 'SRR', values_to = 'RPKM')
+colnames(RPKM.long)[which(names(RPKM.long) == 'contignames')] <- 'contig'
+RPKM.long <- RPKM.long %>%
+  mutate(lifestyle = case_when(str_detect(contig, "NODE") ~ 'lytic', str_detect(contig, "DAS") ~ 'prophage' ) )
+
+
+write.table(RPKM.long, 'RPKM_long.tsv', sep = "\t")
+RPKM.long <- read.table('RPKM_long.tsv', sep = "\t")
+
+##### log-transform RPKM abundances #####
+
+#Replace all zeros with NA so the log transformation is defined
+RPKM.long[RPKM.long == 0] <- NA
+
+RPKM.long$log.abundance <- log(RPKM.long$RPKM)
+
+write.table(RPKM.long, 'logRPKM_long.tsv', sep = "\t")
+RPKM.long <- read.table('logRPKM_long.tsv', sep = "\t")
+
+logRPKM.heatmap <- ggplot(data = RPKM.long, aes(x = SRR, y = contig, fill = log.abundance)) +
+  geom_tile() +
+  scale_fill_gradient2(low = "blue",
+                       mid = "white",
+                       high ="red",
+                       midpoint = 0,
+                       space = "Lab",
+                       na.value = "blue") +
+  theme(axis.text.x = element_text(angle = 90), axis.text.y = element_text(size = 0.4)) +
+  xlab(label = "Sequence Run")
+
+
+logRPKM.heatmap
+
+##### RELATIVE ABUNDANCE ####
+
+relative_abundance <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+
+
+for (column in 2:ncol(relative_abundance)){
+  total_normalized_counts_per_sequence_run <- sum(relative_abundance[,column])
+  relative_abundance[,column] <- relative_abundance[,column]/total_normalized_counts_per_sequence_run
+}
+
+#### RELATIVE ABUNDANCE ###
+write.table(relative_abundance, 'relative_abundance.tsv', sep = "\t")
+relative_abundance <- read.table('relative_abundance.tsv', sep = "\t")
+
+#### 1% RELATIVE ABUNDANCE ####
+
+#Check how many contigs actually contribute more than 1% of reads to at least one sample
+number_of_one_percent_contigs <- 0
+list_contigs_greater_than_1percent <- c()
+for (row in 1:nrow(relative_abundance)){
+
+  if (max(relative_abundance[row,-1]) > 0.01) {
+    number_of_one_percent_contigs <- number_of_one_percent_contigs + 1
+    list_contigs_greater_than_1percent <- c(list_contigs_greater_than_1percent, relative_abundance[row,1] )
+  }
+
+}
+
+#### 0.1% RELATIVE ABUNDANCE ####
+
+#Same check, but for contigs contributing more than 0.1% of reads to at least one sample
+number_of_0.1_percent_contigs <- 0
+list_contigs_greater_than_0.1percent <- c()
+
+for (row in 1:nrow(relative_abundance)){
+
+  if (max(relative_abundance[row,-1]) > 0.001) {
+    number_of_0.1_percent_contigs <- number_of_0.1_percent_contigs + 1
+    list_contigs_greater_than_0.1percent <- c(list_contigs_greater_than_0.1percent, relative_abundance[row,1] )
+  }
+
+}
+
+number_of_0.1_percent_contigs
+list_contigs_greater_than_0.1percent
+
+#0.1%
+logRPKM.long <- read.table('logRPKM_long.tsv', sep = "\t")
+logRPKM.long <- as_tibble(logRPKM.long)
+zero_one_percent_RPKM <- logRPKM.long[logRPKM.long$contig %in% list_contigs_greater_than_0.1percent,]
+write.table(zero_one_percent_RPKM, 'zero_one_percent_sam.tsv', sep = "\t")
+
+##Make the sequence runs in order using factors
+zero_one_percent_RPKM$SRR <- factor(zero_one_percent_RPKM$SRR, levels = c(runinfo$SRR))
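
# --- Editor's sketch (not part of the original patch): the RPKM normalization
# used above, reduced to a self-contained toy example. The object names
# (`toy_counts`, `lengths_bp`, `lib_sizes`) are illustrative, not from the scripts.
toy_counts <- data.frame(s1 = c(10, 200), s2 = c(5, 80))   # raw read counts per sample
lengths_bp <- c(2000, 50000)                               # contig lengths in bp
lib_sizes  <- c(1e6, 2e6)                                  # total reads per sample
rpk  <- toy_counts / (lengths_bp / 1000)                   # reads per kilobase
rpkm <- sweep(rpk, 2, lib_sizes / 1e6, "/")                # ...per million mapped reads
# Each value is now length- and library-size-corrected, matching the loop above.
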
+
+
+logRPKM.long.heatmap <- ggplot(data = logRPKM.long, aes(x = SRR, y = contig, fill = log.abundance)) +
+  geom_tile() +
+  scale_fill_gradient2(low = "blue",
+                       mid = "white",
+                       high ="red",
+                       midpoint = 0,
+                       space = "Lab",
+                       na.value = "blue") +
+  theme(axis.text.x = element_text(angle = 90), axis.text.y = element_text(size = 0.4)) +
+  xlab(label = "Sequence Run")
+
+
+#logRPKM.long.heatmap
+
+##Look at only prophages and 0.1%
+zero_one_percent_RPKM <- logRPKM.long[logRPKM.long$contig %in% list_contigs_greater_than_0.1percent,]
+zero_one_percent_RPKM$SRR <- factor(zero_one_percent_RPKM$SRR, levels = c(runinfo$SRR))
+prophage_zero_one_percent_RPKM <- zero_one_percent_RPKM %>% filter(zero_one_percent_RPKM$lifestyle == "prophage")
+
+prophage_zero_one_percent_RPKM.heatmap <- ggplot(data = prophage_zero_one_percent_RPKM, aes(x = SRR, y = contig, fill = log.abundance)) +
+  geom_tile() +
+  scale_fill_gradient2(low = "blue",
+                       mid = "white",
+                       high ="red",
+                       midpoint = 0,
+                       space = "Lab",
+                       na.value = "blue") +
+  theme(axis.text.x = element_text(angle = 90), axis.text.y = element_text(size = 0.4)) +
+  xlab(label = "Sequence Run")
+
+
+#prophage_zero_one_percent_RPKM.heatmap
+
+#Diversity
+
+#Change the SRR names to days
+x <- runinfo$ID[match(colnames(RPKM), runinfo$SRR)]
+colnames(RPKM) <- replace_na(x,'contig')
+shannon_diversity <- c()
+for (column in 2:(ncol(RPKM)-1)) {
+  temp <- diversity(RPKM[,column], MARGIN = 2, index = 'shannon')
+  shannon_diversity <- c(shannon_diversity, temp)
+}
+
+#Bray-Curtis dissimilarity
+RPKM <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+row.names(RPKM) <- RPKM[,1]
+RPKM <- RPKM[,-1]
+colnames(RPKM) <- runinfo$ID[match(colnames(RPKM), runinfo$SRR)]
+RPKM_transposed <- as.data.frame(t(RPKM))
+beta_dist <- vegdist(RPKM_transposed, method = "bray")
+beta_dist_clust <- hclust(beta_dist, method = "average")
+plot(beta_dist_clust)
+
+colnames(RPKM) <- runinfo$Time_days[match(colnames(RPKM), runinfo$SRR)]
+RPKM_transposed <- as.data.frame(t(RPKM))
+
+#Euclidean distance
+RPKM <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+row.names(RPKM) <- RPKM[,1]
+RPKM <- RPKM[,-1]
+colnames(RPKM) <- runinfo$Time_days[match(colnames(RPKM), runinfo$SRR)]
+RPKM_transposed <- as.data.frame(t(RPKM))
+euclid_dist <- as.matrix(dist(RPKM_transposed, method = "euclidean"))
+colnames(euclid_dist) <- colnames(RPKM)
+row.names(euclid_dist) <- colnames(RPKM)
+euclid_matrix <- as.data.frame(euclid_dist)
+euclid_matrix <- cbind(colnames(euclid_matrix), euclid_matrix)
+euclid_long <- pivot_longer(euclid_matrix, cols = euclid_matrix$`colnames(euclid_matrix)`)
+colnames(euclid_long) <- c('time1', 'time2', 'euc_dis')
+euclid_long$time1 <- as.numeric(euclid_long$time1)
+euclid_long$time2 <- as.numeric(euclid_long$time2)
+euclid_long$srt_time_lag <- sqrt(abs((euclid_long$time1 - euclid_long$time2)))
+plot(euclid_long$srt_time_lag, euclid_long$euc_dis)
+lm(euclid_long$euc_dis ~ euclid_long$srt_time_lag)
+abline(9921, 1557)
+##The result was 'directional change' in the virome, unlike what I would expect
+#So I will look at the top most abundant
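
# --- Editor's sketch (not part of the original patch): the time-decay analysis
# above regresses pairwise community distance on the square root of the time lag.
# A toy version with made-up distances; the `toy` data frame is illustrative.
toy <- data.frame(time1 = c(0, 0, 182), time2 = c(182, 851, 851))
toy$dist <- c(0.3, 0.6, 0.55)                      # pretend pairwise dissimilarities
toy$sqrt_lag <- sqrt(abs(toy$time1 - toy$time2))   # sqrt time lag, as in the script
fit <- lm(dist ~ sqrt_lag, data = toy)
coef(fit)  # a positive slope indicates directional change (distance grows with lag)
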
+
+RPKM <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+top100 <- c()
+for (column in 2:25){
+  x <- head(arrange(RPKM, desc(RPKM[,column])), n =175)[,1]
+  top100 <- c(top100, x)
+}
+
+most_abundant <- unique(top100)
+rpkm_most_abundant <- RPKM_transposed[, most_abundant]
+most_abdnt_euclid_dist <- as.matrix(dist(rpkm_most_abundant, method = "euclidean"))
+
+RPKM <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+row.names(RPKM) <- RPKM[,1]
+RPKM <- RPKM[,-1]
+colnames(RPKM) <- runinfo$Time_days[match(colnames(RPKM), runinfo$SRR)]
+
+colnames(most_abdnt_euclid_dist) <- colnames(RPKM)
+row.names(most_abdnt_euclid_dist) <- colnames(RPKM)
+most_abd_euclid_matrix <- as.data.frame(most_abdnt_euclid_dist)
+most_abd_euclid_matrix <- cbind(colnames(most_abd_euclid_matrix), most_abd_euclid_matrix)
+most_abd_euclid_long <- pivot_longer(most_abd_euclid_matrix, cols = most_abd_euclid_matrix$`colnames(most_abd_euclid_matrix)`)
+colnames(most_abd_euclid_long) <- c('time1', 'time2', 'euc_dis')
+most_abd_euclid_long$time1 <- as.numeric(most_abd_euclid_long$time1)
+most_abd_euclid_long$time2 <- as.numeric(most_abd_euclid_long$time2)
+most_abd_euclid_long$srt_time_lag <- sqrt(abs((most_abd_euclid_long$time1 - most_abd_euclid_long$time2)))
+plot(most_abd_euclid_long$srt_time_lag, most_abd_euclid_long$euc_dis)
+lm(most_abd_euclid_long$euc_dis ~ most_abd_euclid_long$srt_time_lag)
+
+abline(9919, 1557)
+
+
+##Next I will look at how the active prophages compare
+
+RPKM <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+RPKM <- as_tibble(RPKM)
+RPKM <- RPKM %>% mutate(lifestyle = case_when(str_detect(contignames, "NODE") ~ 'lytic', str_detect(contignames, "DAS") ~ 'temperate' ) )
+
+#I know it's a bad move, but the first 52 columns of RPKM_transposed are active prophages and the rest are the regular ones
+active_prophage_RPKM <- RPKM_transposed[,1:52]
+non_prophage_RPKM <- RPKM_transposed[,53:5890]
+
+active_prophage_euclid_dist <- as.matrix(dist(active_prophage_RPKM, method = "euclidean"))
+
+RPKM <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+row.names(RPKM) <- RPKM[,1]
+RPKM <- RPKM[,-1]
+colnames(RPKM) <- runinfo$Time_days[match(colnames(RPKM), runinfo$SRR)]
+
+colnames(active_prophage_euclid_dist) <- colnames(RPKM)
+row.names(active_prophage_euclid_dist) <- colnames(RPKM)
+active_prophage_euclid_matrix <- as.data.frame(active_prophage_euclid_dist)
+active_prophage_euclid_matrix <- cbind(colnames(active_prophage_euclid_matrix), active_prophage_euclid_matrix)
+active_prophage_euclid_long <- pivot_longer(active_prophage_euclid_matrix, cols = active_prophage_euclid_matrix$`colnames(active_prophage_euclid_matrix)`)
+colnames(active_prophage_euclid_long) <- c('time1', 'time2', 'euc_dis')
+active_prophage_euclid_long$time1 <- as.numeric(active_prophage_euclid_long$time1)
+active_prophage_euclid_long$time2 <- as.numeric(active_prophage_euclid_long$time2)
+active_prophage_euclid_long$srt_time_lag <- sqrt(abs((active_prophage_euclid_long$time1 - active_prophage_euclid_long$time2)))
+plot(active_prophage_euclid_long$srt_time_lag, active_prophage_euclid_long$euc_dis)
+lm(active_prophage_euclid_long$euc_dis ~ active_prophage_euclid_long$srt_time_lag)
+abline(139.8442,-0.7071)
+
+#Non-active prophages
+non_prophage_euclid_dist <- as.matrix(dist(non_prophage_RPKM, method = "euclidean"))
+
+RPKM <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+row.names(RPKM) <- RPKM[,1]
+RPKM <- RPKM[,-1]
+colnames(RPKM) <- runinfo$Time_days[match(colnames(RPKM), runinfo$SRR)]
+
+colnames(non_prophage_euclid_dist) <- colnames(RPKM)
+row.names(non_prophage_euclid_dist) <- colnames(RPKM)
+non_prophage_euclid_matrix <- as.data.frame(non_prophage_euclid_dist)
+non_prophage_euclid_matrix <- cbind(colnames(non_prophage_euclid_matrix), non_prophage_euclid_matrix)
+non_prophage_euclid_long <- pivot_longer(non_prophage_euclid_matrix, cols = non_prophage_euclid_matrix$`colnames(non_prophage_euclid_matrix)`)
+colnames(non_prophage_euclid_long) <- c('time1', 'time2', 'euc_dis')
+non_prophage_euclid_long$time1 <- as.numeric(non_prophage_euclid_long$time1)
+non_prophage_euclid_long$time2 <- as.numeric(non_prophage_euclid_long$time2)
+non_prophage_euclid_long$srt_time_lag <- sqrt(abs((non_prophage_euclid_long$time1 - non_prophage_euclid_long$time2)))
+plot(non_prophage_euclid_long$srt_time_lag, non_prophage_euclid_long$euc_dis)
+lm(non_prophage_euclid_long$euc_dis ~ non_prophage_euclid_long$srt_time_lag)
+abline(9919,1557)
+
+
+##Now try the same with Bray-Curtis distance
+
+#Bray-Curtis distance
+RPKM <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+row.names(RPKM) <- RPKM[,1]
+RPKM <- RPKM[,-1]
+colnames(RPKM) <- runinfo$Time_days[match(colnames(RPKM), runinfo$SRR)]
+RPKM_transposed <- as.data.frame(t(RPKM))
+beta_dist <- as.matrix(vegdist(RPKM_transposed, method = "bray"))
+colnames(beta_dist) <- colnames(RPKM)
+row.names(beta_dist) <- colnames(RPKM)
+beta_matrix <- as.data.frame(beta_dist)
+beta_matrix <- cbind(colnames(beta_matrix), beta_matrix)
+beta_long <- pivot_longer(beta_matrix, cols = beta_matrix$`colnames(beta_matrix)`)
+colnames(beta_long) <- c('time1', 'time2', 'beta_dis')
+beta_long$time1 <- as.numeric(beta_long$time1)
+beta_long$time2 <- as.numeric(beta_long$time2)
+beta_long$srt_time_lag <- sqrt(abs((beta_long$time1 - beta_long$time2)))
+plot(beta_long$srt_time_lag, beta_long$beta_dis)
+lm(beta_long$beta_dis ~ beta_long$srt_time_lag)
+abline(0.18549, 0.02483)
+
+
+
+#Look at the most abundant contigs
+
+RPKM <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+top100 <- c()
+for (column in 2:25){
+  x <- head(arrange(RPKM, desc(RPKM[,column])), n =100)[,1]
+  top100 <- cbind(top100, x)
+}
+
+colnames(top100) <- colnames(RPKM)[-1]
+
+#Look at the abundance of active prophages compared to non-active
+
+#Make a column for prophage or not
+RPKM <- as_tibble(RPKM)
+RPKM <- RPKM %>% mutate(lifestyle = case_when(str_detect(contignames, "NODE") ~ 'lytic', str_detect(contignames, "DAS") ~ 'temperate' ) )
+test <- aggregate(RPKM[,2:25], by=list(Category=RPKM$lifestyle), FUN=sum) #This sums the RPKM for all the contigs based on being temperate or lytic
+test <- t(test)
+colnames(test) <- test[1,]
+test <- test[-1,]
+test <- as.data.frame(test)
+test$lytic <- as.numeric(test$lytic)
+test$temperate <- as.numeric(test$temperate)
+test$total_RPKM <- test$lytic + test$temperate
+test$perc_lytic <- test$lytic / test$total_RPKM
+test$perc_temp <- test$temperate / test$total_RPKM
+
+#There are far more lytic contigs, so I also explored dividing by the number of contigs
+amnt_temp <- sum(RPKM$lifestyle == "temperate")
+amnt_lytic <- sum(RPKM$lifestyle == "lytic")
+
+test$lytic_averag <- test$lytic / amnt_lytic
+test$temp_averag <- test$temperate / amnt_temp
+
+test$day <- runinfo$Time_days[match(row.names(test), runinfo$SRR)]
+test$week <- runinfo$week[match(row.names(test), runinfo$SRR)]
+test_graph <- ggplot(test, aes(x=day, y=perc_temp)) + geom_point(aes(size = 30)) + theme(axis.text.x = element_text(size = 20))
+test_graph <- test_graph + facet_wrap(~ week, scales="free_x") + ylab(label = 'Relative Abundance of Active Prophages') + xlab(label = 'Day') + theme(strip.text = element_text(size =20)) + theme(axis.title = element_text(size = 30))
+
+test_graph
+ggsave(file="/Users/steven/GutMicrobiome/Publication_3_Viral_Metagenomics_Collab/Minot_Data/Third_Round_bins/Viral_Analysis/Reads_Per_Contig/figures/active_prophage_relative_abundance.svg", plot=test_graph, width=10, height=10)
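
# --- Editor's sketch (not part of the original patch): the aggregate() call above
# computes the temperate (active-prophage) share of total abundance per sample.
# The same idea in a toy dplyr form; all names here are illustrative.
library(dplyr)
toy <- data.frame(contig    = c("NODE_1", "NODE_2", "DAS_1"),
                  lifestyle = c("lytic", "lytic", "temperate"),
                  s1 = c(5, 3, 2), s2 = c(1, 1, 8))
shares <- toy %>%
  group_by(lifestyle) %>%
  summarise(across(c(s1, s2), sum)) %>%
  mutate(across(c(s1, s2), ~ .x / sum(.x)))  # per-sample fraction by lifestyle
shares  # the temperate row is the fraction of abundance from active prophages
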
+
+##### DESEQ2 #####
+
+#Format count data for DESEQ2
+#DESeqDataSetFromMatrix(countData = cts, colData = coldata, design = ~ condition)
+#countData
+cts <- count_matrix[,-1]
+rownames(cts) <- count_matrix[,1]
+head(cts,2)
+
+#colData
+orderSamples <- match(colnames(cts), runinfo$SRR)
+days <- c((runinfo$Time_days[orderSamples]))
+weeks <- c(runinfo$week[orderSamples])
+coldata <- cbind(days, weeks)
+colnames(coldata) <- c('days', 'week')
+rownames(coldata) <- runinfo$SRR[orderSamples]
+
+coldata <- as.data.frame(coldata)
+
+coldata$days <- factor(coldata$days, levels = unique(sort(coldata$days)))
+coldata$week <- factor(coldata$week, levels = unique(sort(coldata$week)))
+#Test that the order is correct, which is important for DESEQ
+all(rownames(coldata) %in% colnames(cts))
+all(rownames(coldata) == colnames(cts))
+
+#Now we can run it as stated above
+dds <- DESeqDataSetFromMatrix(countData = cts, colData = coldata, design = ~ days)
+
+#Now we can run it
+test <- DESeq(dds)
+res <- results(test)
+res
+summary(res)
+subset(res[order(row.names(res)),], padj<0.05)
+head(res[order(res$padj),])
+by_contig <- subset((subset(res[order(row.names(res)),], padj<0.05)), log2FoldChange>1)
+
+##This last step highlights the active prophages
+by_prophage <- subset(by_contig, str_detect(row.names(by_contig), "DAS"))
+prophages_induced <- data.frame(
+  row.names(by_prophage),
+  by_prophage$baseMean,
+  by_prophage$log2FoldChange,
+  by_prophage$lfcSE,
+  by_prophage$stat,
+  by_prophage$pvalue,
+  by_prophage$padj)
+colnames(prophages_induced) <- c("Phage", "baseMean", "log2FoldChange", "lfcSE",
+                                 "stat", "P-value","Adjusted p-value")
+#write.table(prophages_induced, 'DESEQ_induced_prophages.tsv', sep = "\t")
+
+vsdata <- varianceStabilizingTransformation(dds)
+day_PCA <- plotPCA(vsdata, intgroup="days")
+week_PCA <- plotPCA(vsdata, intgroup="week") + geom_point(size=0.5) +
+  theme(axis.text.x = element_text(size = 7), axis.text.y = element_text(size = 7), axis.title = element_text(size = 7)) +
+  theme(legend.key.size = unit(0.25, 'cm'), #change legend key size
+        legend.key.height = unit(0.25, 'cm'), #change legend key height
+        legend.key.width = unit(0.25, 'cm'), #change legend key width
+        legend.title = element_text(size=7), #change legend title font size
+        legend.text = element_text(size=7)) #change legend text font size
+week_PCA
+ggsave(file="day_PCA2.svg", plot=day_PCA, width=10, height=10)
+ggsave(file="week_PCA2_2.svg", plot=week_PCA, width=8, height=8, units = c("cm"), dpi = 600)
+
+#Now look at only the active prophages
+
+dds_prophages <- dds[1:52]
+vsdata_prophages <- varianceStabilizingTransformation(dds_prophages)
+
+plot_PCA_prophages <- plotPCA(vsdata_prophages, intgroup="week") +
+  geom_point(size=0.5) +
+  theme(axis.text.x = element_text(size = 7), axis.text.y = element_text(size = 7), axis.title = element_text(size = 7)) +
+  theme(legend.key.size = unit(0.25, 'cm'), #change legend key size
+        legend.key.height = unit(0.25, 'cm'), #change legend key height
+        legend.key.width = unit(0.25, 'cm'), #change legend key width
+        legend.title = element_text(size=7), #change legend title font size
+        legend.text = element_text(size=7)) #change legend text font size
+plot_PCA_prophages
+
+
+#ggsave(file="plot_PCA_all_phages.svg", plot=plot_PCA_all_phages, width=10, height=10) #plot_PCA_all_phages is never defined in this script
+ggsave(file="plot_PCA_prophages2.svg", plot=plot_PCA_prophages, width=8, height=10, units = c("cm"), dpi = 600)
+
+#Normalizing counts by the DESEQ size factors
+dds <- estimateSizeFactors(dds)
+sizeFactors(dds)
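
# --- Editor's sketch (not part of the original patch): the DESeq2 pattern used
# above, run end-to-end on a self-contained toy matrix (assumes DESeq2 is
# installed). The names `toy_cts`, `toy_col`, `toy_dds` are illustrative.
library(DESeq2)
set.seed(1)
toy_cts <- matrix(rnbinom(120, mu = 50, size = 2), nrow = 20,
                  dimnames = list(paste0("contig", 1:20), paste0("s", 1:6)))
toy_col <- data.frame(row.names = colnames(toy_cts),
                      day = factor(rep(c("d0", "d1", "d2"), each = 2)))
stopifnot(all(rownames(toy_col) == colnames(toy_cts)))  # same order check as above
toy_dds <- DESeqDataSetFromMatrix(countData = toy_cts, colData = toy_col, design = ~ day)
toy_res <- results(DESeq(toy_dds))
head(toy_res[order(toy_res$padj), ])  # contigs ranked by adjusted p-value
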
+
+norDESEQ_counts <- counts(dds, normalized=TRUE)
+norDESEQ_counts <- as.data.frame(norDESEQ_counts)
+norDESEQ_counts <- setDT(norDESEQ_counts, keep.rownames = 'contig')[]
+norDESEQ_counts <- norDESEQ_counts[-5891,]
+
+#Before also normalizing for contig length, make sure contigLength and norDESEQ_counts list the contigs in the same order
+all(contigLength$Contig %in% norDESEQ_counts$contig)
+all(contigLength$Contig == norDESEQ_counts$contig)
+
+#Normalize counts per contig length
+temp_dataframe <- norDESEQ_counts[,2:25]/contigLength$Length
+
+norDESEQ_counts <- cbind(norDESEQ_counts$contig, temp_dataframe)
+x <- colnames(norDESEQ_counts)
+x[1] <- 'contig'
+colnames(norDESEQ_counts) <- x
+
+#Get relative abundance
+norDESEQ_counts <- as.data.frame(norDESEQ_counts)
+for (x in 2:ncol(norDESEQ_counts)){
+  total_normalized_counts_per_sequence_run <- sum(norDESEQ_counts[,x])
+  norDESEQ_counts[,x] <- norDESEQ_counts[,x]/total_normalized_counts_per_sequence_run
+}
+
+
+norDESEQ_counts <- as.data.table(norDESEQ_counts)
+temperate_DESEQ <- norDESEQ_counts %>% mutate(lifestyle = case_when(str_detect(contig, "NODE") ~ 'lytic', str_detect(contig, "DAS") ~ 'temperate' ) )
+temperate_DESEQ <- temperate_DESEQ %>% filter(lifestyle == 'temperate') %>% select(!(lifestyle))
+temperate_DESEQ_melt <- melt(temperate_DESEQ, id.vars = c("contig"))
+
+
+temperate_DESEQ_melt$variable <- factor(temperate_DESEQ_melt$variable, levels = c(runinfo$SRR))
+temperate_DESEQ_melt$day <- as.factor(runinfo$ID[match(temperate_DESEQ_melt$variable, runinfo$SRR)])
+
+prophage_induction <- temperate_DESEQ_melt[,c('contig','value','variable')]
+colnames(prophage_induction) <- c("contig", "read_count", "SRR_columns")
+
+prophage_induction <- spread(prophage_induction, contig, read_count)
+prophage_induction <- as.data.frame(prophage_induction)
+prophage_induction <- prophage_induction[-25,]
+normalized_coverage <- cbind(runinfo$Time_days, runinfo$week, runinfo$ID, prophage_induction)
+
+x <- colnames(normalized_coverage)
+x[1] <- 'day'
+x[2] <- 'week'
+x[3] <- 'day-replicate'
+colnames(normalized_coverage) <- x
+normalized_coverage$`day-replicate` <- factor(normalized_coverage$`day-replicate`)
+normalized_coverage$SRR_columns <- factor(normalized_coverage$SRR_columns)
+normalized_coverage <- as.data.table(normalized_coverage)
+normalized_coverage_melted <- melt(normalized_coverage, id.vars= c('day', 'week', 'day-replicate', 'SRR_columns'))
+
+normalized_coverage_melted$variable <- str_match(normalized_coverage_melted$variable, "DAS_Check_Bin_..")
+bins_position <- match(normalized_coverage_melted$variable, bacterial_bin$V1)
+bin_names <- bacterial_bin$V9[bins_position]
+normalized_coverage_melted$variable <- bin_names
+p <- ggplot(normalized_coverage_melted, aes(x=day, y=value, col=variable)) + geom_point(aes(size = 15)) + theme(axis.text.x = element_text(size = 20))
+p <- p + facet_wrap(~ week, scales="free_x") + ylab(label = 'Normalized Coverage') + theme(strip.text = element_text(size =20)) + theme(axis.title = element_text(size = 30))
+p
+
+#Now revisit the DESEQ-normalized counts and look at what percentage of the community the active prophages make up
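
# --- Editor's sketch (not part of the original patch): what counts(dds,
# normalized = TRUE) does under the hood — each column of raw counts is divided
# by its DESeq2 size factor. Self-contained, using DESeq2's example generator.
library(DESeq2)
toy_dds <- makeExampleDESeqDataSet(n = 100, m = 6)  # small simulated dataset
toy_dds <- estimateSizeFactors(toy_dds)
manual <- sweep(counts(toy_dds), 2, sizeFactors(toy_dds), "/")
all.equal(manual, counts(toy_dds, normalized = TRUE))  # TRUE
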
+norDESEQ_counts <- counts(dds, normalized=TRUE)
+norDESEQ_counts <- as.data.frame(norDESEQ_counts)
+norDESEQ_counts <- setDT(norDESEQ_counts, keep.rownames = 'contig')[]
+norDESEQ_counts <- norDESEQ_counts[-5891,]
+write.table(norDESEQ_counts, 'norDESEQ_counts.tsv', sep = "\t", row.names=FALSE)
+norDESEQ_counts <- read.table('norDESEQ_counts.tsv', sep = "\t", header = TRUE)
+
+#Before also normalizing for contig length, make sure contigLength and norDESEQ_counts list the contigs in the same order
+all(contigLength$Contig %in% norDESEQ_counts$contig)
+all(contigLength$Contig == norDESEQ_counts$contig)
+
+#Normalize counts per contig length
+temp_dataframe <- norDESEQ_counts[,2:25]/contigLength$Length
+
+norDESEQ_counts <- cbind(norDESEQ_counts$contig, temp_dataframe)
+x <- colnames(norDESEQ_counts)
+x[1] <- 'contig'
+colnames(norDESEQ_counts) <- x
+write.table(norDESEQ_counts, 'norDESEQ_counts_norbylength.tsv', sep = "\t", row.names=FALSE)
+norDESEQ_counts <- read_delim('norDESEQ_counts_norbylength.tsv', delim = "\t")
+norDESEQ_counts <- norDESEQ_counts %>% mutate(lifestyle = case_when(str_detect(contig, "NODE") ~ 'lytic', str_detect(contig, "DAS") ~ 'temperate' ) )
+
+perc_active_proph <- aggregate(norDESEQ_counts[,2:25], by=list(Category=norDESEQ_counts$lifestyle), FUN=sum) #This sums the norDESEQ_counts for all the contigs based on being temperate or lytic
+perc_active_proph <- t(perc_active_proph)
+colnames(perc_active_proph) <- perc_active_proph[1,]
+perc_active_proph <- perc_active_proph[-1,]
+perc_active_proph <- as.data.frame(perc_active_proph, stringsAsFactors = FALSE)
+perc_active_proph$lytic <- as.numeric(perc_active_proph$lytic)
+perc_active_proph$temperate <- as.numeric(perc_active_proph$temperate)
+perc_active_proph$total_norDESEQ_counts <- perc_active_proph$lytic + perc_active_proph$temperate
+perc_active_proph$perc_lytic <- perc_active_proph$lytic / perc_active_proph$total_norDESEQ_counts
+perc_active_proph$perc_temp <- perc_active_proph$temperate / perc_active_proph$total_norDESEQ_counts
+
+#There are far more lytic contigs, so I also explored dividing by the number of contigs
+amnt_temp <- sum(norDESEQ_counts$lifestyle == "temperate")
+amnt_lytic <- sum(norDESEQ_counts$lifestyle == "lytic")
+
+perc_active_proph$lytic_averag <- perc_active_proph$lytic / amnt_lytic
+perc_active_proph$temp_averag <- perc_active_proph$temperate / amnt_temp
+
+perc_active_proph$day <- runinfo$ID[match(row.names(perc_active_proph), runinfo$SRR)]
+perc_active_proph$week <- runinfo$week[match(row.names(perc_active_proph), runinfo$SRR)]
+perc_active_proph$day_group <- runinfo$Time_days[match(row.names(perc_active_proph), runinfo$SRR)]
+perc_active_proph$day <- as.factor(runinfo$ID)
+
+perc_active_proph_graph <- ggplot(perc_active_proph, aes(x=day_group, y=perc_temp, color = perc_temp < 0.010)) + geom_point(size = 1, show.legend=FALSE) + theme(axis.text.x = element_text(size = 25), axis.text.y = element_text(size = 25))
+perc_active_proph_graph <- perc_active_proph_graph + ylab(label = 'Relative Abundance of Active Prophages') + xlab(label = 'Day') + theme(strip.text = element_text(size =35)) + theme(axis.title = element_text(size = 35)) +
+  geom_hline(yintercept=0.010, linetype='dashed', color='#F8766D', size = 0.5) + facet_grid(~ week, scales="free_x", space="free")
+
+#perc_active_proph_graph
+
+#### STATS ON PERCENT ACTIVE ####
+perc_active_stats <- as_tibble(perc_active_proph)
+perc_active_stats <- perc_active_stats %>% select(perc_temp, day_group, temperate) %>%
+  filter(day_group %in% c(0, 182, 851, 852, 853,879, 880, 881))
+perc_active_stats$day_group <- as.factor(perc_active_stats$day_group)
+perc_active_stats %>% group_by(day_group) %>% get_summary_stats(temperate, type = "common")
+perc_active_stats$id <- as.factor(rep(c(1,2), 8))
+friedman_test(perc_active_stats, temperate ~ day_group | id)
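
# --- Editor's sketch (not part of the original patch): the rstatix Friedman
# test used above, on a toy repeated-measures table. `toy_fr` is illustrative:
# two replicate ids, each measured at three time points.
library(rstatix)
toy_fr <- data.frame(id    = factor(rep(1:2, times = 3)),
                     day   = factor(rep(c("d0", "d1", "d2"), each = 2)),
                     value = c(0.01, 0.02, 0.05, 0.04, 0.03, 0.02))
friedman_test(toy_fr, value ~ day | id)  # nonparametric test across paired time points
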
+
+#Add day names on the x-axis
+#Change the x-axis labels to the day they were sampled
+#day_sampled <- c(runinfo$Time_days)
+#perc_active_proph_graphp_2 <- perc_active_proph_graph + scale_x_discrete(labels= day_sampled)
+perc_active_proph_graphp_2 <- perc_active_proph_graph +
+  facet_grid(~ week, scales="free_x", space="free_x", shrink = TRUE) +
+  theme(axis.text.x = element_text(size = 7), axis.text.y = element_text(size = 7), strip.text = element_text(size=7)) +
+  geom_point(size=0.5, position = position_dodge(0.05), show.legend=FALSE) +
+  theme(legend.text = element_text(size = 7)) +
+  theme(legend.title = element_text(size = 7)) +
+  theme(axis.text.x = element_text(size = 7, angle = 270), axis.title = element_text(size = 7)) +
+  theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank())
+
+#perc_active_proph_graphp_2
+qt <- ggplot_gtable(ggplot_build(perc_active_proph_graphp_2))
+qt$widths[5] = 12*qt$widths[5]
+grid.draw(qt)
+ggsave(file="perc_active_proph_graph7.svg", plot=qt, width=7, height=11, units = c("cm"), dpi = 600)
+
+#### DESEQ normalized read abundance Bray-Curtis ####
+dds <- DESeqDataSetFromMatrix(countData = cts, colData = coldata, design = ~ days)
+dds <- estimateSizeFactors(dds)
+
+norDESEQ_counts <- counts(dds, normalized=TRUE)
+norDESEQ_counts <- as.data.frame(norDESEQ_counts)
+norDESEQ_counts <- setDT(norDESEQ_counts, keep.rownames = 'contig')[]
+norDESEQ_counts <- norDESEQ_counts[-5891,]
+
+#Before also normalizing for contig length, make sure contigLength and norDESEQ_counts list the contigs in the same order
+all(contigLength$Contig %in% norDESEQ_counts$contig)
+all(contigLength$Contig == norDESEQ_counts$contig)
+
+#Normalize counts per contig length
+temp_dataframe <- norDESEQ_counts[,2:25]/contigLength$Length
+
+norDESEQ_counts <- cbind(norDESEQ_counts$contig, temp_dataframe)
+x <- colnames(norDESEQ_counts)
+x[1] <- 'contig'
+colnames(norDESEQ_counts) <- x
+
+#Get relative abundance
+norDESEQ_counts <- as.data.frame(norDESEQ_counts)
+for (x in 2:ncol(norDESEQ_counts)){
+  total_normalized_counts_per_sequence_run <- sum(norDESEQ_counts[,x])
+  norDESEQ_counts[,x] <- norDESEQ_counts[,x]/total_normalized_counts_per_sequence_run
+}
+
+
+norDESEQ_counts <- as.data.table(norDESEQ_counts)
+
+temperate_DESEQ <- norDESEQ_counts %>% mutate(lifestyle = case_when(str_detect(contig, "NODE") ~ 'lytic', str_detect(contig, "DAS") ~ 'temperate' ) )
+temperate_DESEQ <- temperate_DESEQ %>% filter(lifestyle == 'temperate') %>% select(!(lifestyle))
+
+
+#Bray-Curtis dissimilarity
+
+norDESEQ_counts_transformed <- as.data.frame(t(norDESEQ_counts))
+colnames(norDESEQ_counts_transformed) <- norDESEQ_counts_transformed[1,]
+norDESEQ_counts_transformed <- norDESEQ_counts_transformed[-1,]
+temp_row_names <- row.names(norDESEQ_counts_transformed)
+norDESEQ_counts_transformed <- sapply(norDESEQ_counts_transformed, as.numeric) #This turns the lifestyle row into NA
+row.names(norDESEQ_counts_transformed) <- runinfo$ID[match(temp_row_names, runinfo$SRR)]
+#norDESEQ_counts_transformed <- norDESEQ_counts_transformed[1:24,1:5] #drop the NA row
+beta_dist <- vegdist(norDESEQ_counts_transformed, method = "bray")
+beta_dist_clust <- hclust(beta_dist, method = "average")
+plot(beta_dist_clust)
+
+adonis(beta_dist ~ week + Time_days, data= runinfo)
+
+PCOA <- pcoa(beta_dist)
+biplot.pcoa(PCOA)
+beta_dist
+
+beta_dist_matrix <- as.matrix(beta_dist)
+beta_dist_matrix[lower.tri(beta_dist_matrix, diag = FALSE)] <- NA
+beta_dist_matrix[beta_dist_matrix==0.0000000] <- NA
+
+beta_dist_matrix <- as.data.frame(beta_dist_matrix)
+beta_dist_matrix <- cbind(row.names(beta_dist_matrix), beta_dist_matrix)
+beta_dist_matrix_long <- melt(as.data.table(beta_dist_matrix), id.vars = colnames(beta_dist_matrix)[1])
+colnames(beta_dist_matrix_long) <- c('time1', 'time2', 'beta_dis')
+beta_dist_matrix_long$time1 <- runinfo$Time_days[match(beta_dist_matrix_long$time1, runinfo$ID)]
+beta_dist_matrix_long$time2 <- runinfo$Time_days[match(beta_dist_matrix_long$time2, runinfo$ID)]
+beta_dist_matrix_long$srt_time_lag <- sqrt(abs((beta_dist_matrix_long$time1 - beta_dist_matrix_long$time2)))
+#### BETADIST FIGURE ####
+plot(beta_dist_matrix_long$srt_time_lag, beta_dist_matrix_long$beta_dis)
+linearmodel_beta_all <- lm(beta_dist_matrix_long$beta_dis ~ beta_dist_matrix_long$srt_time_lag)
+abline(a=0.1849, b=0.0245)
+summary(linearmodel_beta_all)
+resid(linearmodel_beta_all)
+
+beta_dist_plot <- ggplot(beta_dist_matrix_long, aes(x=srt_time_lag, y=beta_dis)) +
+  geom_point(size = 0.75) +
+  geom_abline(intercept=0.1849, slope=0.0245, linetype='dashed', color='#F8766D') +
+  ylab(label = 'Bray Curtis Dissimilarity') +
+  xlab(label = bquote(sqrt(Time~(days)))) +
+  theme(axis.text.x = element_text(size = 7), axis.text.y = element_text(size = 7), axis.title = element_text(size = 7 ))
+
+beta_dist_plot
+
+### REVIEWERS COMMENT ####
+#Transform BC into log(1-BC)
+beta_dist_matrix_long$z <- log(1-beta_dist_matrix_long$beta_dis)
+plot(beta_dist_matrix_long$srt_time_lag, beta_dist_matrix_long$z)
+reviewer_lm_beta_all <- lm(beta_dist_matrix_long$z ~ beta_dist_matrix_long$srt_time_lag)
+summary(reviewer_lm_beta_all)
+resid(reviewer_lm_beta_all)
+x <- beta_dist_matrix_long$srt_time_lag
+newfunc2 <- function(x,a){x*a}
+newFunc <- function(x,a,b){b-exp(a*x)}
+curve(newfunc2(x,-0.075), add=TRUE)
+linearmodel_beta_all <- lm(beta_dist_matrix_long$z ~ beta_dist_matrix_long$srt_time_lag)
+abline(a=-0.0649, b=-0.075)
+
+
+beta_dist_plot <- ggplot(beta_dist_matrix_long, aes(x=srt_time_lag, y=z)) +
+  geom_point(size = 0.75) +
+  geom_abline(intercept=-0.0649, slope=-0.075, linetype='dashed', color='#F8766D') +
+  ylab(label = 'log(1 - Bray Curtis Dissimilarity)') +
+  xlab(label = bquote(sqrt(Time~(days)))) +
+  theme(axis.text.x = element_text(size = 7), axis.text.y = element_text(size = 7), axis.title = element_text(size = 7 ))
+
+
+#Missing points are simply because I made a triangle matrix by setting one side to NAs; there is probably a better way
+ggsave(file="beta_dist_plot3_reviewers.png", plot=beta_dist_plot, width=5, height=6, units = c("cm"), dpi = 600, device = 'png')
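
# --- Editor's sketch (not part of the original patch): why the reviewers'
# log(1 - BC) transform is used above. If community similarity decays
# exponentially with time lag, log(1 - BC) is linear in the lag, so lm() can
# fit it. Toy values only.
lag <- 0:10
bc  <- 1 - exp(-0.1 * lag)   # pretend dissimilarities rising with lag
z   <- log(1 - bc)           # back to a straight line: z = -0.1 * lag
coef(lm(z ~ lag))            # slope recovers the decay rate (-0.1)
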
+
+#Look at prophages
+
+prophage_norDESEQ_counts_transformed <- norDESEQ_counts_transformed[,1:52]
+
+prophage_beta_dist <- vegdist(prophage_norDESEQ_counts_transformed, method = "bray")
+prophage_beta_dist_clust <- hclust(prophage_beta_dist, method = "average")
+plot(prophage_beta_dist_clust)
+adonis(prophage_beta_dist ~ week + Time_days, data= runinfo)
+prophage_PCOA <- pcoa(prophage_beta_dist)
+biplot.pcoa(prophage_PCOA)
+
+prophage_beta_dist_matrix <- as.matrix(prophage_beta_dist)
+prophage_beta_dist_matrix[lower.tri(prophage_beta_dist_matrix, diag = FALSE)] <- NA
+prophage_beta_dist_matrix[prophage_beta_dist_matrix==0.0000000] <- NA
+
+prophage_beta_dist_matrix <- as.data.frame(prophage_beta_dist_matrix)
+prophage_beta_dist_matrix <- cbind(row.names(prophage_beta_dist_matrix), prophage_beta_dist_matrix)
+prophage_beta_dist_matrix_long <- melt(as.data.table(prophage_beta_dist_matrix), id.vars = colnames(prophage_beta_dist_matrix)[1])
+colnames(prophage_beta_dist_matrix_long) <- c('time1', 'time2', 'beta_dis')
+prophage_beta_dist_matrix_long$time1 <- runinfo$Time_days[match(prophage_beta_dist_matrix_long$time1, runinfo$ID)]
+prophage_beta_dist_matrix_long$time2 <- runinfo$Time_days[match(prophage_beta_dist_matrix_long$time2, runinfo$ID)]
+prophage_beta_dist_matrix_long$srt_time_lag <- sqrt(abs((prophage_beta_dist_matrix_long$time1 - prophage_beta_dist_matrix_long$time2)))
+
+plot(prophage_beta_dist_matrix_long$srt_time_lag, prophage_beta_dist_matrix_long$beta_dis)
+linearmodel_beta_prophages <- lm(prophage_beta_dist_matrix_long$beta_dis ~ prophage_beta_dist_matrix_long$srt_time_lag)
+abline(0.584793, 0.003089)
+summary(linearmodel_beta_prophages)
+prophage_beta_dist_plot <- ggplot(prophage_beta_dist_matrix_long, aes(x=srt_time_lag, y=beta_dis)) + geom_point(size = 0.75) +
+  geom_abline(intercept=0.58479, slope=0.003089, linetype='dashed', color='#F8766D') +
+  ylab(label = 'Bray Curtis Dissimilarity') +
+  xlab(label = bquote(sqrt(Time~(days)))) +
+  theme(axis.text.x = element_text(size = 7), axis.text.y = element_text(size = 7), axis.title = element_text(size = 7 ))
+prophage_beta_dist_plot
+ggsave(file="prophage_beta_dist_plot3.svg", plot=prophage_beta_dist_plot, width=5, height=6, units = c("cm"), dpi = 600)
+
+#### REVIEWERS COMMENT ####
+
+#Transform BC into log(1-BC)
+prophage_beta_dist_matrix_long$z <- log(1-prophage_beta_dist_matrix_long$beta_dis)
+plot(prophage_beta_dist_matrix_long$srt_time_lag, prophage_beta_dist_matrix_long$z)
+reviewer_lm_prophage_beta_all <- lm(prophage_beta_dist_matrix_long$z ~ prophage_beta_dist_matrix_long$srt_time_lag)
+
+summary(reviewer_lm_prophage_beta_all)
+resid(reviewer_lm_prophage_beta_all)
+x <- prophage_beta_dist_matrix_long$srt_time_lag
+newfunc2 <- function(x,a){x*a}
+newFunc <- function(x,a,b){b-exp(a*x)}
+curve(newfunc2(x,-0.0075), add=TRUE)
+abline(a=-1.037166, b=-0.008292)
+
+beta_dist_plot <- ggplot(prophage_beta_dist_matrix_long, aes(x=srt_time_lag, y=z)) +
+  geom_point(size = 0.75) +
+  geom_abline(intercept=-1.037166, slope=-0.008292, linetype='dashed', color='#F8766D') +
+  ylab(label = 'log(1 - Bray Curtis Dissimilarity)') +
+  xlab(label = bquote(sqrt(Time~(days)))) +
+  theme(axis.text.x = element_text(size = 7), axis.text.y = element_text(size = 7), axis.title = element_text(size = 7 ))
+
+
+#Missing points are simply because I made a triangle matrix by setting one side to NAs; there is probably a better way
+ggsave(file="prophage_beta_dist_plot3_reviewers.png", plot=beta_dist_plot, width=5, height=6, units = c("cm"), dpi = 600, device = 'png')
+
+
+#Now I want to compare beta_dist_matrix_long (all phages) to prophage_beta_dist_matrix_long (prophages)
+comparing_beta_dist_matrix_long <- beta_dist_matrix_long
+names(comparing_beta_dist_matrix_long)[names(comparing_beta_dist_matrix_long) == 'beta_dis'] <- 'beta_dis_all'
+comparing_beta_dist_matrix_long$beta_dis_prophage <- prophage_beta_dist_matrix_long$beta_dis
+compared_model <- lm(beta_dis_all ~ srt_time_lag + beta_dis_prophage, data = comparing_beta_dist_matrix_long)
+summary(compared_model)
+anova(compared_model)
+
+linearHypothesis(compared_model, "beta_dis_all = beta_dis_prophage")
+
+#Next I want to see if the high beta-diversity over time is due to the larger number of contigs
+random_contigs <- floor(runif(59, min=0, max=5890))
+random_norDESEQ_counts_transformed <- norDESEQ_counts_transformed[,random_contigs]
+random_beta_dist <- vegdist(random_norDESEQ_counts_transformed, method = "bray")
+
+random_beta_dist_matrix <- as.matrix(random_beta_dist)
+random_beta_dist_matrix[lower.tri(random_beta_dist_matrix, diag = FALSE)] <- NA
+random_beta_dist_matrix[random_beta_dist_matrix==0.0000000] <- NA
+
+random_beta_dist_matrix <- as.data.frame(random_beta_dist_matrix)
+random_beta_dist_matrix <- cbind(row.names(random_beta_dist_matrix), random_beta_dist_matrix)
+random_beta_dist_matrix_long <- melt(random_beta_dist_matrix, id.vars = colnames(random_beta_dist_matrix)[1])
+
+colnames(random_beta_dist_matrix_long) <- c('time1', 'time2', 'beta_dis')
+random_beta_dist_matrix_long$time1 <- runinfo$Time_days[match(random_beta_dist_matrix_long$time1, runinfo$ID)]
+random_beta_dist_matrix_long$time2 <- runinfo$Time_days[match(random_beta_dist_matrix_long$time2, runinfo$ID)]
+random_beta_dist_matrix_long$srt_time_lag <- sqrt(abs((random_beta_dist_matrix_long$time1 - random_beta_dist_matrix_long$time2)))
+
+plot(random_beta_dist_matrix_long$srt_time_lag, random_beta_dist_matrix_long$beta_dis)
+linearmodel_beta_subsample <- lm(random_beta_dist_matrix_long$beta_dis ~ random_beta_dist_matrix_long$srt_time_lag)
+#abline(a=0.1849, b=0.0245)
+summary(linearmodel_beta_subsample)
+
+
+##### DESEQ taxonomic abundance #####
+norDESEQ_counts <- read.table('../norDESEQ_counts_norbylength.tsv', sep = "\t", header = TRUE)
+
+#I want relative abundance, so divide each column by its total
+
+for (column in 2:ncol(norDESEQ_counts)){
+  total_normalized_counts_per_sequence_run <- sum(norDESEQ_counts[,column])
+  norDESEQ_counts[,column] <- norDESEQ_counts[,column]/total_normalized_counts_per_sequence_run
+}
+
+
+norDESEQ_melt <- melt(as.data.table(norDESEQ_counts), id.vars = c("contig"))
+
+#Add in the Demovir taxonomy for phages that have it
+
+norDESEQ_melt$Order <- demovir$Order[match(norDESEQ_melt$contig, demovir$Sequence_ID)]
+norDESEQ_melt$Family <- demovir$Family[match(norDESEQ_melt$contig, demovir$Sequence_ID)]
+#Make NAs "Unknown"
+norDESEQ_melt$Order <- as.character(norDESEQ_melt$Order)
+norDESEQ_melt$Order[is.na(norDESEQ_melt$Order)] <- "Unknown"
+
+norDESEQ_melt$Family <- as.character(norDESEQ_melt$Family)
+norDESEQ_melt$Family[is.na(norDESEQ_melt$Family)] <- "Unknown"
+
+#I still prefer to have samples ordered by sequence run ID
+
+norDESEQ_melt$day <- runinfo$ID[match(norDESEQ_melt$variable, runinfo$SRR)]
+norDESEQ_melt$week <- runinfo$week[match(norDESEQ_melt$variable, runinfo$SRR)]
+
+norDESEQ_melt$Family <- as.factor(norDESEQ_melt$Family)
+#Making a side-step so that I can later look at the number of unclassified phages
+contig_with_taxonomy <- norDESEQ_melt
+
+norDESEQ_family_melt <- norDESEQ_melt %>% group_by(day, Family, week) %>% summarise(value = sum(value))
+
+family_relaytive_abund <- ggplot(norDESEQ_family_melt, aes(fill=Family, y=value, x=day)) +
+  geom_bar(position = "stack", stat="identity", colour=NA) +
+  theme(axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust=1))
+family_relaytive_abund <- family_relaytive_abund +
+  facet_wrap(~ week, scales="free_x") +
+  ylab(label = 'Relative Abundance by Family') +
+  xlab(label = 'Day') +
+  theme(strip.text = element_text(size =20)) +
+  theme(axis.title = element_text(size = 30))
+
+family_relaytive_abund
+ggsave(file="family_relaytive_abund.svg", plot=family_relaytive_abund, width=10, height=10)
+
+
+norDESEQ_melt %>% filter(Family == "Unknown") -> unknown_phages
+norDESEQ_melt %>% filter(Family == "Unknown") %>% filter(value >= 1e-1) -> abundant_unknown_phages
+
+#Make a graph for %Known vs %Unknown
+w <- norDESEQ_melt
+w$Family <- as.character(w$Family)
+w$Family[w$Family != 'Unknown'] <- "Known"
+#Another side table to see how many phages are known vs unknown
+contig_with_taxonomy2 <- as_tibble(w)
+
+w <- as_tibble(w %>% group_by(variable, Family) %>% summarise((sum(value))))
+w$day <- runinfo$ID[match(w$variable, runinfo$SRR)]
+w$week <- runinfo$week[match(w$variable, runinfo$SRR)]
+names(w)[names(w) == colnames(w[3])] <- "value"
+
+perc_known <- ggplot(w, aes(fill=Family, y=value, x=day)) + geom_bar(position = "stack", stat="identity") #+ theme(axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust=1))
+perc_known <- perc_known +
+  #facet_wrap(~ week, scales="free_x") +
+  ylab(label = 'Relative Abundance') + xlab(label = 'Day') +
+  theme(strip.text = element_text(size =20)) + theme(axis.title = element_text(size = 30))
+
+#Remove the default ggplot background
+perc_known <- perc_known + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
+                                              panel.grid.minor = element_blank())
+
+
+#Relabel the x-axis with the days sampled, and reformat the look
+day_sampled <- c(runinfo$Time_days)
+perc_known2 <- perc_known + scale_x_discrete(labels= day_sampled) + xlab(label = 'Day') + theme(axis.text.x = element_text(size = 25), axis.text.y = element_text(size = 25)) + theme(strip.text = element_text(size =35)) + theme(axis.title = element_text(size = 35))
+perc_known2
+ggsave(file="perc_known2.svg", plot=perc_known2, width=15, height=10)
+
+#I will make a version that shows a line per contig
+contig_with_taxonomy2$day <- runinfo$ID[match(contig_with_taxonomy2$variable, runinfo$SRR)]
+contig_with_taxonomy2$week <- runinfo$week[match(contig_with_taxonomy2$variable, runinfo$SRR)]
+
+perc_known3 <- ggplot(contig_with_taxonomy2, aes(fill=Family, y=value, x=day)) + geom_bar(position = "stack", stat="identity") #+ theme(axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust=1))
+perc_known3 <- perc_known3 +
+  #facet_wrap(~ week, scales="free_x") +
+  ylab(label = 'Relative Abundance') + xlab(label = 'Day') +
+  theme(strip.text = element_text(size =20)) +
+  theme(axis.title = element_text(size = 30))
+
+#Remove the default ggplot background
+perc_known3 <- perc_known3 + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
+                                                panel.grid.minor = element_blank())
+
+perc_known3
+#Relabel the x-axis with the days sampled, and reformat the look
+day_sampled <- c(runinfo$Time_days)
+perc_known4 <- perc_known3 +
+  scale_x_discrete(labels= day_sampled) +
+  xlab(label = 'Day') +
+  theme(axis.text.x = element_text(size = 7, angle = 270), axis.text.y = element_text(size = 7)) +
+  theme(strip.text = element_text(size =7)) + theme(axis.title = element_text(size = 7)) +
+  theme(legend.key.size = unit(0.25, 'cm'), #change legend key size
+        legend.key.height = unit(0.25, 'cm'), #change legend key height
+        legend.key.width = unit(0.25, 'cm'), #change legend key width
+        legend.title = element_text(size=7), #change legend title font size
+        legend.text = element_text(size=7)) #change legend text font size
+perc_known4
+ggsave(file="perc_known_w_lines2.svg", plot=perc_known4, width=11, height=11, units = c("cm"), dpi = 600) + + + +#I will now try run the analysis without the Unknown Family/Order category to remove non-demovir hits bias in week 2 / week 3 + +norDESEQ_melt_known_phages <- norDESEQ_melt %>% filter(Family != "Unknown") +known_phages <- unique(norDESEQ_melt_known_phages$contig) +known_norDESEQ <- norDESEQ_counts[match(known_phages, norDESEQ_counts$contig), ] + + +for (column in 2:ncol(known_norDESEQ)){ + total_normalized_counts_per_sequence_run <- sum(known_norDESEQ[,column]) + known_norDESEQ[,column] <- known_norDESEQ[,column]/total_normalized_counts_per_sequence_run +} + +known_norDESEQ_melt <- melt(as.data.table(known_norDESEQ), id.vars = c("contig")) + +#Add in the Demovir taxonomy for phages that have it + +known_norDESEQ_melt$Order <- demovir$Order[match(known_norDESEQ_melt$contig, demovir$Sequence_ID)] +known_norDESEQ_melt$Family <- demovir$Family[match(known_norDESEQ_melt$contig, demovir$Sequence_ID)] +#Make NAs "Unknown" +known_norDESEQ_melt$Order <- as.character(known_norDESEQ_melt$Order) +known_norDESEQ_melt$Order[is.na(known_norDESEQ_melt$Order)] <- "Unknown" + +known_norDESEQ_melt$Family <- as.character(known_norDESEQ_melt$Family) +known_norDESEQ_melt$Family[is.na(known_norDESEQ_melt$Family)] <- "Unknown" + +#### Reviewers Comment #### + +known_norDESEQ_melt <- known_norDESEQ_melt %>% filter(Family == "Myoviridae" | Family == "Siphoviridae" | Family == "Unassigned" | Family == "Podoviridae" | Family =="Microviridae") +Family_Colours <- c("Myoviridae" = "#EEA236", "Siphoviridae" = "#46B8DA", "Unassigned" = "#B8B8B8" , "Podoviridae" = "#357EBD", Microviridae = "#5CB85C") +#I still prefer to have samples order by sequence run ID + +known_norDESEQ_melt$day <- runinfo$ID[match(known_norDESEQ_melt$variable, runinfo$SRR)] +known_norDESEQ_melt$week <- runinfo$week[match(known_norDESEQ_melt$variable, runinfo$SRR)] + + +#Relative abundance by family is fragmented into bars for each contig. 
+
+#### Reviewers Comment ####
+
+known_norDESEQ_melt <- known_norDESEQ_melt %>% filter(Family == "Myoviridae" | Family == "Siphoviridae" | Family == "Unassigned" | Family == "Podoviridae" | Family =="Microviridae")
+Family_Colours <- c("Myoviridae" = "#EEA236", "Siphoviridae" = "#46B8DA", "Unassigned" = "#B8B8B8" , "Podoviridae" = "#357EBD", Microviridae = "#5CB85C")
+#I still prefer to have samples ordered by sequence run ID
+
+known_norDESEQ_melt$day <- runinfo$ID[match(known_norDESEQ_melt$variable, runinfo$SRR)]
+known_norDESEQ_melt$week <- runinfo$week[match(known_norDESEQ_melt$variable, runinfo$SRR)]
+
+
+#Relative abundance by family is fragmented into bars for each contig, so sum them by family
+known_norDESEQ_melt2 <- as_tibble(known_norDESEQ_melt %>% group_by(variable, Family) %>% summarise((sum(value))))
+known_norDESEQ_melt2$day <- runinfo$ID[match(known_norDESEQ_melt2$variable, runinfo$SRR)]
+known_norDESEQ_melt2$week <- runinfo$week[match(known_norDESEQ_melt2$variable, runinfo$SRR)]
+known_norDESEQ_melt2$day_group <- runinfo$Time_days[match(known_norDESEQ_melt2$variable, runinfo$SRR)]
+names(known_norDESEQ_melt2)[names(known_norDESEQ_melt2) == colnames(known_norDESEQ_melt2[3])] <- "value"
+known_norDESEQ_melt2$day <- factor(known_norDESEQ_melt2$day, levels= runinfo$ID)
+
+
+day_sampled <- c(runinfo$Time_days)
+ID_sampled <- c(runinfo$ID)
+knownfamily_relative_abund <- ggplot(known_norDESEQ_melt2, aes(fill=Family, y=value, x=day)) +
+  geom_bar(position = "stack", stat="identity", show.legend=TRUE) + scale_x_discrete(breaks = ID_sampled, labels = day_sampled) + scale_fill_manual(values = Family_Colours) #Add in when generating the no-label figure
+knownfamily_relative_abund <- knownfamily_relative_abund +
+  #facet_wrap(~ week, scales="free_x") +
+  ylab(label = 'Relative Abundance by Family') +
+  xlab(label = 'Day') + theme(strip.text = element_text(size =20)) +
+  theme(axis.title = element_text(size = 30))
+
+#Remove the default ggplot background
+knownfamily_relative_abund <- knownfamily_relative_abund +
+  theme_bw() +
+  theme(panel.border = element_blank(),
+        panel.grid.major = element_blank(),
+        panel.grid.minor = element_blank())
+#Relabel the x-axis with the days sampled, and reformat the look
+
+knownfamily_relative_abund2 <- knownfamily_relative_abund +
+  xlab(label = 'Day') +
+  theme(axis.text.x = element_text(size = 25), axis.text.y = element_text(size = 25)) +
+  theme(strip.text = element_text(size =35)) +
+  theme(axis.title = element_text(size = 35))
+knownfamily_relative_abund2 <- knownfamily_relative_abund2 +
+  facet_grid(~ week, scales="free_x", space="free_x", shrink = TRUE) +
+  theme(axis.text.x = element_text(size = 7, angle = 270), axis.text.y = element_text(size = 7), strip.text = element_text(size=7)) +
+  theme(legend.title = element_text(size = 7)) +
+  theme(axis.text.x = element_text(size = 7), axis.title = element_text(size = 7)) +
+  theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
+  theme(legend.text = element_text(size = 7)) +
+  theme(legend.key.size = unit(0.25, 'cm'), #change legend key size
+        legend.key.height = unit(0.25, 'cm'), #change legend key height
+        legend.key.width = unit(0.25, 'cm'), #change legend key width
+        legend.title = element_text(size=7), #change legend title font size
+        legend.text = element_text(size=7)) #change legend text font size
+knownfamily_relative_abund2
+ggsave(file="knownfamily_relative_abund5.svg", plot=knownfamily_relative_abund2, width=17, height=13, units = c("cm"), dpi = 600)
+ggsave(file="knownfamily_relative_abund_nolegend5.svg", plot=knownfamily_relative_abund2, width=8, height=10, units = c("cm"), dpi = 600)
+
+#Make a figure for known vs unknown
+norDESEQ_family_melt
+
+# # Alejandro's request: a figure of how many contigs belong to known vs unknown, to see if it is one or two highly abundant ones
+#
+# contig_with_taxonomy2 <- as_tibble(w)
+# counts_per_category <- contig_with_taxonomy2 %>% group_by(variable, Family) %>% summarise(contig_number = n())
+#
+# w <- as_tibble(w %>% group_by(variable, Family) %>% summarise((sum(value))))
+# w$day <- runinfo$ID[match(w$variable, runinfo$SRR)]
+# w$week <- runinfo$week[match(w$variable, runinfo$SRR)]
+# names(w)[names(w) == colnames(w[3])] <- "value"
+#
+# perc_known <- ggplot(w, aes(fill=Family, y=value, x=day)) + geom_bar(position = "stack", stat="identity") #+ theme(axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust=1))
+# perc_known <- perc_known + facet_wrap(~ week, scales="free_x") + ylab(label = 'Relative Abundance') + xlab(label = 'Day') + theme(strip.text = element_text(size =20)) + theme(axis.title = element_text(size = 30))
+#
+# #Remove the default ggplot background
+# perc_known <- perc_known + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
+#                                               panel.grid.minor = element_blank())
+# #Relabel the x-axis with the days sampled, and reformat the look
+# day_sampled <- c(runinfo$Time_days)
+# perc_known2 <- perc_known + scale_x_discrete(labels= day_sampled) + xlab(label = 'Day') + theme(axis.text.x = element_text(size = 25), axis.text.y = element_text(size = 25)) + theme(strip.text = element_text(size =35)) + theme(axis.title = element_text(size = 35))
diff --git a/checkV_coverage_script.R b/checkV_coverage_script.R
new file mode 100644
index 0000000..a8e721c
--- /dev/null
+++ b/checkV_coverage_script.R
@@ -0,0 +1,80 @@
+#Open packages
+library(dplyr) #Version ‘1.0.9’
+library(ggplot2) #Version ‘3.3.6’
+library(tidyr) #Version ‘1.2.0’
+library(stringr) #Version ‘1.4.0’
+library(reshape2) #Version ‘1.4.4’
+
+
+setwd("/Users/Sutcliffe/Library/CloudStorage/OneDrive-McGillUniversity/PhD_OneDrive/Publication_3_Viral_Metagenomics_Collab/Writing/12_draft_revisions/Scripts")
+
+#Load in the data!
+
+SRR <- read.csv("Data_For_Scripts/sequence_runs.csv", header=TRUE) #File has the sequence run meta-data
+runinfo <- read.csv("Data_For_Scripts/sequence_runs.csv", header = TRUE)
+bacterial_bin <- read.csv(file = 'Data_For_Scripts/GTDB-Tk_DAS_check_bin_modified.csv', header = FALSE)
+bacphlip <- read.delim('Data_For_Scripts/CheckV_Coverage/checkv_contigs.fasta.bacphlip', header = TRUE, sep = '\t')
+setwd("Data_For_Scripts/CheckV_Coverage/")
+temp <- list.files(pattern="*_RPKM_coverage") #Make a temp variable for each of the coverage files per sequence run. The all-viral-reads file was not included unless I need it.
+myfiles <- lapply(temp, function(x) read.delim(x, header = TRUE)) #Opens up each file
+coverage_data <- do.call(rbind, myfiles) #Binds all the files together in one dataframe
+
+##### DESEQ normalized reads ####
+
+#I will run the analysis on DESEQ normalized reads
+#Load in the normalized reads
+DESEQ_normalized_abundance <- read.csv("../../Data_For_Scripts/norDESEQ_counts_norbylength.tsv", sep = "\t", header=TRUE) #DESEQ normalized read counts
+
+#Then select the 462 contigs we have a Bacphlip lifestyle prediction for
+bacphlip_contigs <- match(DESEQ_normalized_abundance$contig, bacphlip$X)
+bacphlip_contigs <- bacphlip_contigs[!is.na(bacphlip_contigs)]
+bacphlip_norm_abun <- DESEQ_normalized_abundance[bacphlip_contigs,]
+
+#Now add the 48 prophages not in that list, as we know they are temperate too
+bacphlip_norm_abun <- rbind(DESEQ_normalized_abundance[1:52,], bacphlip_norm_abun)
+#Now we have some duplicates, because I added the 52 prophages but four were already there
+bacphlip_norm_abun <- bacphlip_norm_abun[!duplicated(bacphlip_norm_abun$contig),]
+
+#Now add a column for lifestyle
+
+#Use the bacphlip data to determine the lifestyle
+temperate_phages <- bacphlip$X[(which(bacphlip$Temperate > 0.5))]
+bacphlip_norm_abun <- bacphlip_norm_abun %>% mutate(lifestyle = case_when((!(contig %in% temperate_phages) & !(str_detect(contig, "DAS"))) ~ 'lytic', ((contig %in% temperate_phages) | ( str_detect(contig, "DAS"))) ~ 'temperate' ) )
+row.names(bacphlip_norm_abun) <- bacphlip_norm_abun[,1]
+bacphlip_norm_abun <- bacphlip_norm_abun[,-1]
+
+temperate_contigs3 <- bacphlip_norm_abun %>% filter(lifestyle == "temperate")
+lytic_contigs3 <- bacphlip_norm_abun %>% filter(lifestyle == "lytic")
+
+relative_by_lifstyl <- c()
+for (column in 1:24){
+  temperate <- sum(temperate_contigs3[,column])/sum(bacphlip_norm_abun[,column])
+  lytic <- sum(lytic_contigs3[,column])/sum(bacphlip_norm_abun[,column])
+  relative_by_lifstyl <- rbind(relative_by_lifstyl, c(temperate, lytic))
+}
+
+relative_by_lifstyl <- as.data.frame(relative_by_lifstyl)
+relative_by_lifstyl <- cbind(colnames(bacphlip_norm_abun)[1:24], relative_by_lifstyl)
+colnames(relative_by_lifstyl) <- c('SRR','temperate', 'lytic')
+
+relative_by_lifstyl$ID <- runinfo$ID[match(relative_by_lifstyl$SRR,runinfo$SRR)]
+relative_by_lifstyl_melt <- reshape2::melt(relative_by_lifstyl, id.vars=c("SRR", 'ID'))
+
+DESEQ_lifestyle_abund <- ggplot(relative_by_lifstyl_melt, aes(fill=variable, y=value, x=ID)) + geom_bar(position="stack", stat="identity") + theme(axis.text.x = element_text(angle = 90)) + labs( y = 'Normalized Relative Abundance (%)')
+DESEQ_lifestyle_abund <- DESEQ_lifestyle_abund + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
+                                                                    panel.grid.minor = element_blank())
+
+#Change the day titles on the x-axis and reformat to make them easier to read
+day_sampled <- c(runinfo$Time_days)
+DESEQ_lifestyle_abund3 <- DESEQ_lifestyle_abund + scale_x_discrete(labels= day_sampled) + xlab(label = 'Day') + theme(axis.text.x = element_text(size = 25), axis.text.y = element_text(size = 25)) + theme(strip.text = element_text(size =35)) + theme(axis.title = element_text(size = 35)) + scale_fill_discrete(name = "")
+DESEQ_lifestyle_abund3
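
# --- Editor's sketch (not part of the original patch): the BACPHLIP-based
# lifestyle rule used above, as a small standalone example. `toy_bp` mimics a
# BACPHLIP output table (contig id plus a Temperate probability).
toy_bp <- data.frame(X = c("NODE_1", "NODE_2", "DAS_Check_Bin_01_p1"),
                     Temperate = c(0.10, 0.93, 0.80))
toy_bp$lifestyle <- ifelse(toy_bp$Temperate > 0.5 | grepl("DAS", toy_bp$X),
                           "temperate", "lytic")  # prophages (DAS...) are temperate by definition
toy_bp
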
+
+
+##### STATS #####
+#Note: couldn't get the stats to work in R, so I finished it in PRISM
+relative_by_lifstyl_melt <- as_tibble(relative_by_lifstyl_melt)
+relative_by_lifstyl_melt$day <- runinfo$ID[match(relative_by_lifstyl_melt$SRR, runinfo$SRR)]
+relative_by_lifstyl_melt$week <- runinfo$week[match(relative_by_lifstyl_melt$SRR, runinfo$SRR)]
+relative_by_lifstyl_melt$day_group <- runinfo$Time_days[match(relative_by_lifstyl_melt$SRR, runinfo$SRR)]
+stats <- relative_by_lifstyl_melt %>% select(value, day_group, variable) %>% filter(day_group %in% c(0, 182, 851, 852, 853,879, 880, 881))
+stats <- filter(stats, variable == 'temperate') #filter() needs the data as its first argument
diff --git a/prophage_coverage_script.R b/prophage_coverage_script.R
new file mode 100644
index 0000000..74640df
--- /dev/null
+++ b/prophage_coverage_script.R
@@ -0,0 +1,277 @@
+#Open packages
+library(dplyr) #Version ‘1.0.9’
+library(ggplot2) #Version ‘3.3.6’
+library(tidyr) #Version ‘1.2.0’
+library(stringr) #Version ‘1.4.0’
+library(reshape2) #Version ‘1.4.4’
+library(cartography) #Version ‘3.0.1’
+library(grid) #Version ‘4.2.1’
+library(ggrepel) #Version ‘0.9.1’
+
+#Each sequence run's information
+setwd('Data_For_Scripts/Prophage_Coverage/')
+SRR <- read.csv("../sequence_runs.csv", header=TRUE)
+contig_bed <- read.delim("merged_phage_contigs.bed", header=FALSE, blank.lines.skip = TRUE)
+contig_bed <- contig_bed[,1:3]
+colnames(contig_bed) <- c('contig', 'start', 'length')
+
+setwd('samtools_coverage_coverage')
+temp <- list.files(pattern="*_samtool_coverage") #Make a temp variable for each of the coverage files
+myfiles <- lapply(temp, function(x) read.delim(x, header = TRUE)) #Opens up each file
+coverage_data <- do.call(rbind, myfiles) #Binds all the files together in one dataframe
+setwd('..')
+runinfo <- read.csv("../sequence_runs.csv", header = TRUE)
+bacterial_bin <- read.csv(file = '../GTDB-Tk_DAS_check_bin_modified.csv', header = FALSE)
+DESEQ_size_factors <- read.csv("../DESEQ_sizefactors.csv", header = TRUE)
+#An important file: this matches each prophage contig name to a prophage ID based on the bacterial bin
+prophage_host_match <- read.csv(file = "../prophage_host_naming.csv", header = TRUE)
+
+#I need to make a column with the SRR it is from
+SRR_names <- substring(temp,1,9) #Make a vector of the SRR runs by name, in the order of the files in temp
+#E.g., replicate(5, SRR_names[1]) would replicate the first sequence run 5 times, but I need it 5890 times
+SRR_columns <- c()
+for (i in 1:24)
+{
+  SRR_columns <- c(SRR_columns,(replicate(5890,SRR_names[i])))
+}
+
+runinfo$permillionreads <- runinfo$totalNumReads/1000000
+
+#Now I can add the SRR run into my coverage data
+coverage_data <- cbind(coverage_data, SRR_columns)
+
+#Change the name of the contig column
+x <- colnames(coverage_data)
+x[1] <- "contig"
+colnames(coverage_data) <- x
+
+#Make a weak cut-off for presence (i.e., any reads mapping to the contig)
+coverage_data$minot_present <- (coverage_data$coverage > 0)
+
+#Make a column for prophage or not
+coverage_data <- coverage_data %>% mutate(lifestyle = case_when(str_detect(contig, "NODE") ~ 'lytic', str_detect(contig, "DAS") ~ 'temperate' ) )
+
+coverage_data$present <- (coverage_data$coverage >= 75) #For the first results I used 25, because I thought the value was the percentage of uncovered bases, not the other way around
+
+present_contigs <- coverage_data %>% filter(present == TRUE)
+present_contigs <- unique(present_contigs$contig)
+length(present_contigs)
+
+#Check prophages
+
+present_temperate <- coverage_data %>% filter(present == TRUE)
+present_temperate <- present_temperate %>% filter(lifestyle == 'temperate')
+length(unique(present_temperate$contig))
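
# --- Editor's sketch (not part of the original patch): the presence call above
# keeps a contig when its breadth of coverage is at least 75%. A toy version;
# `toy_cov` mimics a samtools coverage table (percent of bases covered).
toy_cov <- data.frame(contig   = c("NODE_1", "DAS_1", "NODE_2"),
                      coverage = c(99.2, 80.1, 12.5))
toy_cov$present <- toy_cov$coverage >= 75   # breadth-of-coverage presence cut-off
subset(toy_cov, present)$contig             # contigs called present
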
+
+#I want to get some metrics, like how many prophages were found over multiple weeks, at all time points, etc.
+
+present_temperate_sorted <- present_temperate[ , c("contig","SRR_columns")]
+present_temperate_sorted$week <- runinfo$week[match(present_temperate_sorted$SRR_columns, runinfo$SRR)]
+present_temperate_sorted$day <- runinfo$Time_days[match(present_temperate_sorted$SRR_columns, runinfo$SRR)]
+
+active_prophage_list <- unique(present_temperate_sorted$contig)
+metrics_present_temperate <- c()
+
+for (i in active_prophage_list){
+  temp_matrix <- present_temperate_sorted %>% filter(contig == i)
+  weeks <- length(unique(temp_matrix$week))
+  days <- length(unique(temp_matrix$day))
+  temp_variable <- c(i, weeks, days)
+  metrics_present_temperate <- rbind(metrics_present_temperate, temp_variable)
+}
+colnames(metrics_present_temperate) <- c('contig', 'Number_of_Weeks', 'Number_of_Days')
+metrics_present_temperate <- as.data.frame(metrics_present_temperate)
+metrics_present_temperate$Number_of_Weeks <- as.numeric(metrics_present_temperate$Number_of_Weeks)
+metrics_present_temperate$Number_of_Days <- as.numeric(metrics_present_temperate$Number_of_Days)
+summary(metrics_present_temperate)
+
+#Let's see how the non-prophages compare
+
+#Stopping point: output the 75% coverage table (reload it to resume from here)
+write.table(coverage_data, 'prophage_coverage_data.tsv', sep = "\t")
+coverage_data <- read.table('prophage_coverage_data.tsv', sep = "\t")
+
+#Replace the SRR with days
+coverage_data$days <- SRR$ID[(match(coverage_data$SRR_columns, SRR$SRR))]
+
+
+#To compare the coverages of each prophage between dates, to see if prophage induction has occurred
+
+prophage_induction <- present_prophages[,c('contig','meandepth','SRR_columns')]
+prophage_induction <- spread(prophage_induction, contig, meandepth)
+
+##### DESEQ SIZE FACTOR ####
+#Note: run at least the first 201 lines to get here
+
+runinfo$size_factors <- DESEQ_size_factors$sizeFactors[(match(runinfo$SRR, DESEQ_size_factors$SRR))]
+
+normalized_coverage2 <- c()
+for (row in 1:nrow(prophage_induction))
+{
+  x <- (prophage_induction[row,-1]/runinfo$size_factors[row])
+  normalized_coverage2 <- rbind(normalized_coverage2, x)
+}
+
+normalized_coverage2 <- cbind(runinfo$Time_days, runinfo$week, runinfo$ID, normalized_coverage2)
+x <- colnames(normalized_coverage2)
+x[1] <- 'day'
+x[2] <- 'week'
+x[3] <- 'day-replicate'
+colnames(normalized_coverage2) <- x
+normalized_coverage2$`day-replicate` <- factor(normalized_coverage2$`day-replicate`)
+normalized_coverage2$sample_num <- c(1:24)
+normalized_coverage2_melted <- melt(normalized_coverage2, id.vars = c('day', 'week', 'day-replicate', 'sample_num'))
+
+normalized_coverage3_melted <- normalized_coverage2_melted
+#normalized_coverage3_melted$variable <- prophage_host_match$prophage_names[match(normalized_coverage3_melted$variable, prophage_host_match$prophage_ID)]
+normalized_coverage3_melted$`day-replicate` <- as.factor(runinfo$ID)
+
+#Add the bacterial names of the prophages
+normalized_coverage3_melted$phage_name <- prophage_host_match$prophage_names[match(normalized_coverage3_melted$variable, prophage_host_match$prophage_ID)]
+
+#I will try to label all those above 100x coverage
+p <- ggplot(normalized_coverage3_melted, aes(x=day, y=value, col=variable)) + geom_point(size=1, position = position_dodge(0.7)) + theme(axis.text.x = element_text(size = 20), axis.text.y = element_text(size = 20)) + geom_text(aes(label=ifelse(value>100, as.character(variable), '')), hjust=0, vjust=0, size=4, show.legend=FALSE)
+p <- p + facet_grid(~ week, scales="free_x", space="free") + ylab(label = 'Normalized Coverage') + theme(strip.text = element_text(size = 20)) + theme(axis.title = element_text(size = 30)) + theme(legend.text = element_text(face = "italic"))
+p
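+
+#Sketch (illustration only) of how DESeq-style size factors normalize for sequencing depth:
+#each sample's factor is the median ratio of its counts to a per-feature geometric-mean
+#reference (the actual factors here come from the DESEQ_sizefactors.csv produced elsewhere).
+toy_counts <- matrix(c(10, 20, 40, 30, 60, 120), nrow = 3) #3 features x 2 samples; sample 2 has 3x depth
+geo_mean <- exp(rowMeans(log(toy_counts))) #geometric mean of each feature across samples
+toy_size_factors <- apply(toy_counts/geo_mean, 2, median) #1/sqrt(3) ~ 0.58 and sqrt(3) ~ 1.73
+sweep(toy_counts, 2, toy_size_factors, '/') #after division, the two columns agree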
+
+#Now I will try z-scores to flag significant coverage
+nozero_coverage <- subset(normalized_coverage2_melted, value > 0)
+nozero_coverage$logCov <- log(nozero_coverage$value)
+nozero_coverage$zscore <- scale(nozero_coverage$logCov)
+#nozero_coverage$variable <- prophage_host_match$prophage_names[match(nozero_coverage$variable, prophage_host_match$prophage_ID)]
+one_hundred_coverage <- subset(nozero_coverage, value > 100)
+significant_coverage <- subset(nozero_coverage, zscore > 1.96)
+
+normalized_coverage_w_zscore <- left_join(normalized_coverage3_melted, nozero_coverage[,c(1,2,3,4,5,7,8)], by = c('day', "week", "day-replicate", "sample_num", "variable"))
+q <- ggplot(normalized_coverage_w_zscore, aes(x=day, y=value, col=variable)) +
+  geom_point(size=1, position = position_dodge(0.1), show.legend=FALSE) +
+  theme(axis.text.x = element_text(size = 7), axis.text.y = element_text(size = 7)) +
+  geom_text(aes(label=ifelse(zscore>1.95, as.character(variable), '')), hjust=0, vjust=0, size=2, show.legend=FALSE)
+q <- q + facet_grid(~ week, scales="free_x", space="free_x", shrink = TRUE) + ylab(label = 'Normalized Coverage') + theme(strip.text = element_text(size = 7)) + theme(axis.title = element_text(size = 7)) + theme(legend.text = element_text(face = "italic"))
+q <- q + theme(legend.text = element_text(size = 7)) + theme(legend.title = element_text(size = 7)) + theme(axis.text.x = element_text(size = 7), axis.title = element_text(size = 7)) + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank())
+q
+qt <- ggplot_gtable(ggplot_build(q))
+qt$widths[5] = 12*qt$widths[5]
+grid.draw(qt)
+
+ggsave(file="prophage_coverage_DESEQ_norm6.svg", plot=q, width=30, height=13, units = c("cm"), dpi = 600)
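+
+#Sketch (illustration only) of the z-score threshold used above: on log-transformed coverage,
+#a z-score above 1.96 sits in the upper ~2.5% of a standard normal distribution.
+toy_cov <- c(2, 3, 4, 5, 3, 4, 400) #one sample with a large coverage spike
+toy_z <- scale(log(toy_cov)) #centre and scale the log coverage
+toy_cov[toy_z > 1.96] #only the 400x spike is flagged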
+
+#Note: I couldn't figure out how to make day 0, which has just one time point, render like the other
+#weeks with facet_grid or ggplot_gtable. So, after trying different things, I am making a fake dataset
+#in which week 'day0' also has days 1, 2, 3, 4; I can later remove those in Inkscape so the panels look the same.
+
+day1 <- filter(normalized_coverage_w_zscore, week == 'day0')
+day1$day <- 1
+
+day2 <- filter(normalized_coverage_w_zscore, week == 'day0')
+day2$day <- 2
+
+day3 <- filter(normalized_coverage_w_zscore, week == 'day0')
+day3$day <- 3
+
+day4 <- filter(normalized_coverage_w_zscore, week == 'day0')
+day4$day <- 4
+
+normalized_coverage_w_zscore2 <- rbind(normalized_coverage_w_zscore, day1, day2, day3, day4)
+
+qt <- ggplot(normalized_coverage_w_zscore2, aes(x=day, y=value, col=variable)) +
+  geom_point(size=1, position = position_dodge(0.1), show.legend=FALSE) +
+  theme(axis.text.x = element_text(size = 7, angle = 270), axis.text.y = element_text(size = 7)) +
+  geom_text_repel(aes(label=ifelse(zscore>1.95, as.character(variable), '')), hjust=0, vjust=0, size=2, show.legend=FALSE)
+qt <- qt + facet_grid(~ week, scales="free_x", space="free_x", shrink = TRUE) + ylab(label = 'Normalized Coverage') + theme(strip.text = element_text(size = 7)) + theme(axis.title = element_text(size = 7)) + theme(legend.text = element_text(face = "italic"))
+qt <- qt + theme(legend.text = element_text(size = 7)) + theme(legend.title = element_text(size = 7)) + theme(axis.text.x = element_text(size = 7), axis.title = element_text(size = 7)) + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank())
+qt
+ggsave(file="prophage_coverage_DESEQ_norm7.svg", plot=qt, width=8, height=11, units = c("cm"), dpi = 600)
+
+#Make a heatmap with normalized coverage
+normalized_coverage4 <- c()
+for (row in 1:nrow(present_prophages)){
+  if (present_prophages$coverage[row] < 75){
+    coverage = 0
+  } else {
+    x <- match(present_prophages$SRR_columns[row], runinfo$SRR)
+    coverage = present_prophages$meandepth[row]/runinfo$size_factors[x]
+  }
+  y <- c(present_prophages$contig[row], as.character(present_prophages$day[row]), as.numeric(coverage))
+  normalized_coverage4 <- rbind(normalized_coverage4, y)
+}
+
+normalized_coverage4 <- as.data.frame(normalized_coverage4)
+colnames(normalized_coverage4) <- c("contig", "day", "coverage")
+normalized_coverage4$day <- as.factor(present_prophages$day)
+normalized_coverage4$coverage <- as.numeric(normalized_coverage4$coverage)
+normalized_coverage4[normalized_coverage4 == 0] <- NA
+normalized_coverage4$week <- runinfo$week[(match(normalized_coverage4$day, runinfo$ID))]
+normalized_coverage4$number <- prophage_host_match$prophage_numbers[(match(normalized_coverage4$contig, prophage_host_match$prophage_names))]
+
+prophage_coverage.heatmap <- ggplot(data = normalized_coverage4, aes(x = day, y = contig, fill = coverage)) +
+  geom_tile() + scale_fill_gradientn(colours = carto.pal(pal1 = "orange.pal", n1 = 5), na.value = 'white') +
+  theme(axis.text.x = element_text(size = 7), axis.text.y = element_text(size = 7, face = "italic")) +
+  xlab(label = "Day") + ylab(label = 'Active Prophages') + theme(axis.text.x = element_text(size = 7), axis.title = element_text(size = 7))
+prophage_coverage.heatmap_grided <- prophage_coverage.heatmap +
+  facet_grid(~ week) + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank())
+prophage_coverage.heatmap_grided
+prophage_coverage.heatmap
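+
+#Sketch (illustration only): the row-wise loop above can also be written vectorised with dplyr,
+#assuming the same columns (coverage, meandepth, SRR_columns) and runinfo$size_factors.
+normalized_coverage4_alt <- present_prophages %>%
+  mutate(size_factor = runinfo$size_factors[match(SRR_columns, runinfo$SRR)]) %>%
+  mutate(norm_coverage = ifelse(coverage < 75, NA, meandepth/size_factor)) %>% #NA where breadth < 75%
+  select(contig, day, norm_coverage)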
+
+#Change the x-axis labels to the day they were sampled.
+day_sampled <- c(runinfo$Time_days)
+prophage_coverage.heatmap_2 <- prophage_coverage.heatmap + scale_x_discrete(labels = day_sampled) + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank())
+prophage_coverage.heatmap_2 <- prophage_coverage.heatmap_2 + theme(legend.text = element_text(size = 7)) + theme(legend.title = element_text(size = 7))
+prophage_coverage.heatmap_2 <- prophage_coverage.heatmap_2 + theme(legend.key.size = unit(2, 'mm'), #change legend key size
+                                                                   legend.key.height = unit(2, 'mm'), #change legend key height
+                                                                   legend.key.width = unit(2, 'mm'), #change legend key width
+                                                                   legend.title = element_text(size=7), #change legend title font size
+                                                                   legend.text = element_text(size=7)) #change legend text font size
+prophage_coverage.heatmap_2
+ggsave(file='prophage_coverage.heatmap_3.svg', plot=prophage_coverage.heatmap_2, width=21, height=13, units = c("cm"), dpi = 600)
+ggsave(file='prophage_coverage.heatmap_grided.svg', plot=prophage_coverage.heatmap_grided, width=13, height=8)
+
+##### Check Induced Prophages ####
+#Import the prophages that went through a significant fold increase in the DESEQ analysis
+#Method 1: use the significantly increased DESEQ prophages
+DESEQ_induced_prophages <- read.csv(file = '../DESEQ_induced_prophages.tsv', sep = "\t")
+DESEQ_induced_prophages$Phage <- prophage_host_match$prophage_names[match(DESEQ_induced_prophages$Phage, prophage_host_match$prophage_ID)]
+
+#Method 2: use the normalized coverage with a z-score over 1.96
+nozero_coverage <- subset(normalized_coverage2_melted, value > 0)
+nozero_coverage$logCov <- log(nozero_coverage$value)
+nozero_coverage$zscore <- scale(nozero_coverage$logCov)
+#nozero_coverage$variable <- prophage_host_match$prophage_names[match(nozero_coverage$variable, prophage_host_match$prophage_ID)]
+one_hundred_coverage <- subset(nozero_coverage, value > 100)
+significant_coverage <- subset(nozero_coverage, zscore > 1.96)
+
+# bins_with_induced <- unique((significant_coverage$variable))
+# bins_position <- match(bins_with_induced, bacterial_bin$V1)
+# bin_names <- bacterial_bin$V9[bins_position]
+# bin_names
+
+#Compare the two methods
+method1 <- DESEQ_induced_prophages$Phage
+method2 <- significant_coverage$variable
+intersect(method1, method2)
+
+##### Misclassified Prophage-Host #####
+
+coverage_data %>% filter(str_detect(contig, "DAS")) %>% filter(coverage > 95) %>% select(contig) -> high_covered_prophages
+unique(sort(high_covered_prophages[,]))
+
+#We see that 28 of 52 prophages could fall into this 'misclassified to host' area.
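+
+#Sketch (illustration only): besides the overlap, setdiff() shows the prophages each method
+#flags on its own, which is a quick way to gauge how much the two calls disagree.
+setdiff(method1, method2) #induced by DESEQ only
+setdiff(method2, method1) #z-score > 1.96 only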
+
diff --git a/prophage_merger_v2.py b/prophage_merger_v2.py
new file mode 100644
index 0000000..2fe307e
--- /dev/null
+++ b/prophage_merger_v2.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Apr 8 10:13:33 2021
+
+@author: steven
+"""
+
+import pandas as pd
+
+#Merges prophage coordinate intervals that overlap with each other
+
+#Load the coordinates: column 0 = contig, column 1 = start, column 2 = end
+prophage_coordinates = pd.read_csv("path/to/prophage_coordinates.txt", sep='\t', header=None)
+
+#Sort by contig, then by start position, so that overlapping intervals are adjacent
+double_sorted = prophage_coordinates.sort_values(by=[0, 1])
+double_sorted = double_sorted.reset_index(drop=True)
+
+#Iterate through the dataframe: for each row, extend its end coordinate over any later
+#interval on the same contig whose start falls inside it
+for i in range(0, len(double_sorted) - 1):
+    for x in range(i + 1, len(double_sorted)):
+        if str(double_sorted.loc[i, 0]) != str(double_sorted.loc[x, 0]):
+            break  #rows are sorted by contig, so no later row can overlap row i
+        if int(double_sorted.loc[i, 1]) <= int(double_sorted.loc[x, 1]) <= int(double_sorted.loc[i, 2]):
+            if int(double_sorted.loc[i, 2]) <= int(double_sorted.loc[x, 2]):
+                double_sorted.loc[i, 2] = double_sorted.loc[x, 2]
+
+
+def remove_duplicates(z):
+    #Iterate through the dataframe collecting the indices of intervals whose start
+    #falls inside the interval on the previous row (i.e. already merged above)
+    indices_to_delete = []
+    for i in range(1, len(z)):
+        if str(z.loc[i, 0]) == str(z.loc[i - 1, 0]):
+            if int(z.loc[i - 1, 1]) <= int(z.loc[i, 1]) <= int(z.loc[i - 1, 2]):
+                indices_to_delete.append(i)
+    #Create the final dataframe with the merged prophages only
+    z = z.drop(indices_to_delete)
+    return z
+
+
+#Check for remaining overlaps and keep removing them until none are left
+check = True
+while check:
+    check = False
+    double_sorted = remove_duplicates(double_sorted)
+    double_sorted = double_sorted.reset_index(drop=True)
+    for i in range(1, len(double_sorted)):
+        if str(double_sorted.loc[i, 0]) == str(double_sorted.loc[i - 1, 0]):
+            if int(double_sorted.loc[i - 1, 1]) <= int(double_sorted.loc[i, 1]) <= int(double_sorted.loc[i - 1, 2]):
+                check = True
+
+double_sorted.to_csv('final', sep='\t', header=False, index=False)
\ No newline at end of file