From d91a02a74495a56c321c05628aee3cfdb26baa66 Mon Sep 17 00:00:00 2001
From: sgsutcliffe <47899883+sgsutcliffe@users.noreply.github.com>
Date: Thu, 22 Dec 2022 15:17:57 -0500
Subject: [PATCH] Scripts used in publication

---
 BowtieProphageFinding.py     |  120 ++++
 MAGS_relative_abundance.R    |  106 ++++
 Modified-PropagAtE_script.py |  968 +++++++++++++++++++++++++++++++
 VLP_analysis.R               | 1042 ++++++++++++++++++++++++++++++++++
 checkV_coverage_script.R     |   80 +++
 prophage_coverage_script.R   |  277 +++++++++
 prophage_merger_v2.py        |   71 +++
 7 files changed, 2664 insertions(+)
 create mode 100644 BowtieProphageFinding.py
 create mode 100644 MAGS_relative_abundance.R
 create mode 100644 Modified-PropagAtE_script.py
 create mode 100644 VLP_analysis.R
 create mode 100644 checkV_coverage_script.R
 create mode 100644 prophage_coverage_script.R
 create mode 100644 prophage_merger_v2.py

diff --git a/BowtieProphageFinding.py b/BowtieProphageFinding.py
new file mode 100644
index 0000000..88ee6b2
--- /dev/null
+++ b/BowtieProphageFinding.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Nov 19 15:54:15 2020
+
+@author: steven
+"""
+
+from Bio import SeqIO
+
+import csv
+
+#Load the mpileup output with three columns: (1) bacterial bin contig name, (2) base position, (3) coverage
+path_mpileup="Data_For_Scripts/three_columns_mpileup_DAScheckbins"
+tsv_file = open(path_mpileup)
+
+read_tsv = csv.reader(tsv_file, delimiter="\t")
+
+#turn the tsv into a list of lists; each inner list is a row
+mpileup = []
+for row in read_tsv:
+
+    mpileup.append(row)
+
+tsv_file.close()
+
+contig_list = []
+
+
+#make a list of all contigs in the tsv
+for n in mpileup:
+    if n[0] not in contig_list:
+        contig_list.append(n[0])
+
+print('made contig list')
+
+#iterate through all the contigs, check for prophage regions, and add them to the list
+
+prophage_regions = []
+
+#contig by contig
+for contig in contig_list:
+
+
+    #take all rows that belong to this specific contig and make a contig-specific pileup list
+    mpileup_contig = []
+
+    #collect all base-pairs or rows for a contig
+    for i in mpileup:
+
+        if i[0] == contig:
+
+            mpileup_contig.append(i)
+
+    #For regions where there were no hits mpileup skips these lines, so add a zero coverage value to fill in the 'gaps'
+    #(a while loop is used here because the list grows as rows are inserted)
+    k = 1
+    while k < len(mpileup_contig):
+
+        if (int(mpileup_contig[k-1][1]) + 1) != (int(mpileup_contig[k][1])):
+
+            mpileup_contig.insert(k, [contig, (int(mpileup_contig[k-1][1]) + 1), 0])
+        k += 1
+
+    #Now calculate coverage over a sliding window of defined size
+    sliding_window = 1000
+
+    #Set a minimum coverage threshold for flagging a window
+    min_coverage = 10
+    contig_coverage = 0
+
+    #Slide along the contig and sum the per-base coverage within each window
+    for d in range(0,(len(mpileup_contig) - sliding_window)):
+        contig_reads = 0
+        for q in range(d,(d + sliding_window)):
+            contig_reads += int(mpileup_contig[q][2])
+
+        #calculate coverage
+        contig_coverage = contig_reads/sliding_window
+
+        if contig_coverage >= min_coverage:
+            # print(contig)
+            # print(str(d) + ":" + str(d + sliding_window))
+            # print(contig_coverage)
+            # print(d)
+            prophage_regions.append([contig, d, (d+sliding_window)])
+
+#write the raw candidate windows to file
+with open("prophage_regions_step1.txt", "a") as outF:
+    for b in range(0,len(prophage_regions)):
+        outF.write(str(prophage_regions[b][0]) + ' ' + str(prophage_regions[b][1]) + ' ' + str(prophage_regions[b][2]) + '\n')
+
+
+print('found prophage regions')
+#Merge overlapping windows on the same contig into single continuous regions
+for n in range(0, len(prophage_regions)):
+
+
+    while (len(prophage_regions) > (n + 1)) and (prophage_regions[n][0] == prophage_regions[n+1][0]) and (int(prophage_regions[n][2]) > int(prophage_regions[n+1][1])):
+        prophage_regions[n][2] = prophage_regions[n+1][2]
+        prophage_regions.pop(n+1)
+
+
+#append the merged regions to the same file, after the raw windows
+with open("prophage_regions_step1.txt", "a") as outF:
+    for op in range(0,len(prophage_regions)):
+        outF.write(str(prophage_regions[op][0]) + ' ' + str(prophage_regions[op][1]) + ' ' + str(prophage_regions[op][2]) + '\n')
+
+print('Regions')
+
+#Here I load in the concatenated fasta file of all the bacterial bins involved in the study
+bacterial_bin = {}
+for seq_record1 in SeqIO.parse("Data_For_Scripts/Final_Collection_DAS_check_MAGs.fa", "fasta"):
+    bacterial_bin[seq_record1.id] = seq_record1.seq
+
+#write the sequence of each merged region to a multi-FASTA file
+for b1 in range(0, len(prophage_regions)):
+    if prophage_regions[b1][0] in bacterial_bin:
+
+        outF = open("bowtie_prophages_DASins.fa", "a")
+        outF.write(">" + prophage_regions[b1][0] + '\n')
+        outF.write(str(bacterial_bin[prophage_regions[b1][0]][(int(prophage_regions[b1][1])):(int(prophage_regions[b1][2]))]) + '\n')
+        outF.close()
+
+print('End')
+
diff --git a/MAGS_relative_abundance.R b/MAGS_relative_abundance.R
new file mode 100644
index 0000000..86c4c86
--- /dev/null
+++ b/MAGS_relative_abundance.R
@@ -0,0 +1,106 @@
+#Open packages
+library(dplyr)
+library(readr)
+library(ggplot2)
+library(tidyr)
+library(stringr)
+library("ggsci")
+
+
+
+#Relative abundances from CheckM
+SRR828645_profile <- read.delim("SRR828645_profile", header = TRUE, col.names = c('BinId', 'BinSize', 'MappedReads', 'Perc_Mapped_Reads', 'Percen_binned_population', 'Perc_community'))
+SRR828660_profile <- read.delim("SRR828660_profile", header = TRUE, col.names = c('BinId', 'BinSize', 'MappedReads', 'Perc_Mapped_Reads', 'Percen_binned_population', 'Perc_community'))
+SRR828661_profile <- read.delim("SRR828661_profile", header = TRUE, col.names = c('BinId', 'BinSize', 'MappedReads', 'Perc_Mapped_Reads', 'Percen_binned_population', 'Perc_community'))
+
+#Elsewhere I turned the GTDB-Tk output into a phylogenetically sorted list
+phylo_GTDB_tk <- read.csv("phylogeny_of_bins.csv", header = TRUE)
+phylo_GTDB_tk$GTDB_taxa <- as.factor(phylo_GTDB_tk$GTDB_taxa)
+
+#Create a percentage of mapped reads normalized for genome size
+
+SRR828645_profile$norm_reads <- (SRR828645_profile$MappedReads/(SRR828645_profile$BinSize*100000000))
+SRR828645_norm_reads <- sum(SRR828645_profile$norm_reads)
+SRR828645_profile$norm_perc_reads <- (SRR828645_profile$norm_reads/SRR828645_norm_reads)*100
+SRR828645_mini <- cbind(SRR828645_profile$BinId, SRR828645_profile$norm_reads, (replicate(25, "week1")))
+colnames(SRR828645_mini) <- c('BinID', 'NormPercReads', 'Week')
+SRR828645_mini <- as.data.frame(SRR828645_mini)
+
+SRR828661_profile$norm_reads <- (SRR828661_profile$MappedReads/(SRR828661_profile$BinSize*100000000))
+SRR828661_norm_reads <- sum(SRR828661_profile$norm_reads)
+SRR828661_profile$norm_perc_reads <- (SRR828661_profile$norm_reads/SRR828661_norm_reads)*100
+SRR828661_mini <- cbind(SRR828661_profile$BinId, SRR828661_profile$norm_reads, (replicate(25, "week2")))
+colnames(SRR828661_mini) <- c('BinID', 'NormPercReads', 'Week')
+SRR828661_mini <- as.data.frame(SRR828661_mini)
+
+SRR828660_profile$norm_reads <- (SRR828660_profile$MappedReads/(SRR828660_profile$BinSize*100000000))
+SRR828660_norm_reads <- sum(SRR828660_profile$norm_reads)
+SRR828660_profile$norm_perc_reads <- (SRR828660_profile$norm_reads/SRR828660_norm_reads)*100
+SRR828660_mini <- cbind(SRR828660_profile$BinId, SRR828660_profile$norm_reads, (replicate(25, "week3")))
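+#Note on the three normalization blocks above: each scales mapped reads by
+#bin size times a fixed constant (1e8) and carries the per-bin value into a
+#small per-week table. A sketch of an equivalent helper (hypothetical name,
+#shown for clarity only and not called elsewhere in this script):
+normalize_profile <- function(profile, week_label) {
+  norm_reads <- profile$MappedReads / (profile$BinSize * 100000000)
+  data.frame(BinID = profile$BinId,
+             NormPercReads = norm_reads,
+             Week = week_label)
+}
+#e.g. normalize_profile(SRR828660_profile, "week3"); data.frame() recycles the
+#scalar week label, avoiding the hard-coded replicate(25, ...). As in the
+#blocks above, the 'NormPercReads' column holds norm_reads rather than
+#norm_perc_reads; the stacked plots below use geom_bar(position = "fill"),
+#which rescales within each week, so the plotted proportions are the same
+#either way.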
+colnames(SRR828660_mini) <- c('BinID', 'NormPercReads', 'Week')
+SRR828660_mini <- as.data.frame(SRR828660_mini)
+
+all_weeks <- rbind(SRR828645_mini,SRR828660_mini,SRR828661_mini)
+all_weeks <- transform(all_weeks, NormPercReads = as.numeric(NormPercReads))
+all_weeks$BinID <- phylo_GTDB_tk[(match(all_weeks$BinID,phylo_GTDB_tk$Bin)),9]
+all_weeks$BinID <- factor(all_weeks$BinID, levels = phylo_GTDB_tk$GTDB_taxa)
+
+stacked_norm_perc <- ggplot(all_weeks, aes(fill=BinID, y=NormPercReads, x=Week)) +
+  geom_bar(position="fill", stat="identity") + xlab(label = element_blank()) + ylab(label = "Relative Abundance") + scale_fill_igv() + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(panel.border = element_blank())
+
+stacked_norm_perc <- stacked_norm_perc + labs(fill = 'Bacteria')
+stacked_norm_perc
+#save as .svg
+ggsave(file="stacked_norm_perc.svg", plot=stacked_norm_perc, width=10, height=8)
+
+#Alejandro wanted me to try an alternative order, specifically in order of the most abundant bacteria.
+
+#I interpreted this as the most abundant bacteria over the three days, then normalized by each day as a percentage of reads.
+
+abundance_order <- rbind(SRR828645_mini,SRR828660_mini,SRR828661_mini)
+abundance_order$NormPercReads <- as.numeric(abundance_order$NormPercReads)
+binorder <- abundance_order %>% group_by(BinID) %>% summarise(total_abund = sum(NormPercReads)) %>% arrange(desc(total_abund)) %>% select(BinID)
+abundance_order <- transform(abundance_order, NormPercReads = as.numeric(NormPercReads))
+abundance_order$BinID <- factor(abundance_order$BinID, levels = binorder$BinID)
+abundance_order$Bacteria <- phylo_GTDB_tk[(match(abundance_order$BinID,phylo_GTDB_tk$Bin)),9]
+abundance_order$Bacteria <- as.character(abundance_order$Bacteria)
+abundance_order$Bacteria <- factor(abundance_order$Bacteria, levels = phylo_GTDB_tk[(match(binorder$BinID,phylo_GTDB_tk$Bin)),9])
+
+stacked_norm_perc2 <- ggplot(abundance_order, aes(fill=Bacteria, y=NormPercReads, x=Week)) +
+  geom_bar(position="fill", stat="identity") + xlab(label = element_blank()) + ylab(label = "Relative Abundance") + scale_fill_igv() + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(panel.border = element_blank()) +
+  theme(text = element_text(size = 10))
+stacked_norm_perc2 + theme(text = element_text(size = 10)) + theme(axis.title = element_text(size = 20)) + theme(legend.text = element_text(size = 20))
+stacked_norm_perc2 <- stacked_norm_perc2 + theme(text = element_text(size = 10)) + theme(axis.title = element_text(size = 20)) + theme(legend.text = element_text(size = 10)) + theme(axis.title.x = element_text(size = 15, angle=90, hjust=1, vjust=1))
+stacked_norm_perc2
+ggsave(file="stacked_norm_perc2.svg", plot=stacked_norm_perc2) #, width=10, height=8
+
+#I will try it based on abundance of week 1
+
+abundance_order2 <- rbind(SRR828645_mini,SRR828660_mini,SRR828661_mini)
+abundance_order2$NormPercReads <- as.numeric(abundance_order2$NormPercReads)
+binorder2 <- abundance_order2 %>% group_by(BinID) %>% filter(Week == 'week1') %>% arrange(desc(NormPercReads))
+abundance_order2 <- transform(abundance_order2, NormPercReads = as.numeric(NormPercReads))
+abundance_order2$BinID <- factor(abundance_order2$BinID, levels = binorder2$BinID)
+abundance_order2$Bacteria <- phylo_GTDB_tk[(match(abundance_order2$BinID,phylo_GTDB_tk$Bin)),9]
+abundance_order2$Bacteria <- as.character(abundance_order2$Bacteria)
+abundance_order2$Bacteria <- 
factor(abundance_order2$Bacteria, levels = phylo_GTDB_tk[(match(binorder2$BinID,phylo_GTDB_tk$Bin)),9]) +stacked_norm_perc3 <- ggplot(abundance_order2, aes(fill=Bacteria, y=NormPercReads, x=Week,)) + + geom_bar(position="fill", stat="identity") + xlab(label = element_blank()) + ylab(label = "Relative Abundance") + scale_fill_igv() + theme_bw() +theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +theme(panel.border = element_blank()) + +stacked_norm_perc3 + +#The order for organizing bacteria for figures is +bacteria_order <- as.data.frame(binorder$BinID) +bacteria_order$`binorder$BinID` <- phylo_GTDB_tk[(match(bacteria_order$`binorder$BinID`,phylo_GTDB_tk$Bin)),9] +write_csv(bacteria_order, file = 'bacteria_order.csv') + +#Alternative colour schemes I could use +library(pals) +stacked_norm_perc + scale_fill_manual(values = rev(cols25(n=25))) +stacked_norm_perc + scale_fill_manual(values = cols25(n=26)) +stacked_norm_perc + scale_fill_manual(values = (glasbey(n=25))) +stacked_norm_perc + scale_fill_manual(values = rev(glasbey(n=25))) +stacked_norm_perc + scale_fill_manual(values = rev(unname(polychrome(n=25)))) +stacked_norm_perc + scale_fill_manual(values = rev(unname(alphabet(n=25)))) +stacked_norm_perc + scale_fill_manual(values = (unname(alphabet(n=25)))) + diff --git a/Modified-PropagAtE_script.py b/Modified-PropagAtE_script.py new file mode 100644 index 0000000..072dcf8 --- /dev/null +++ b/Modified-PropagAtE_script.py @@ -0,0 +1,968 @@ +#! /usr/bin/env python3 +# Author: Kristopher Kieft, UW-Madison +# kieft@wisc.edu + + +# PropagAtE: Prophage Activity Estimator +# Version: v1.0.0 +# Release date: January 21 2021 +# dev: 22 + +# Modified by Steven Sutcliffe +# Allows prophages with no flanking regions to be included using the bacterial MAG mean coverage + +# Imports +try: + import warnings + warnings.filterwarnings("ignore") + import argparse + import subprocess + import sys + import random + import time + import datetime + from datetime import date + from scipy import stats + import statistics + from scipy.stats import mannwhitneyu + from scipy.stats import combine_pvalues + import math + import uuid + import logging + import os + from Bio.SeqIO.FastaIO import SimpleFastaParser +except Exception as e: + sys.stderr.write("\nError: please verify dependancy imports are installed and up to date:\n\n") + sys.stderr.write(str(e) + "\n\n") + exit(1) + +from sys import exit +# Set up variables +start = time.time() +start_time = str(datetime.datetime.now().time()).rsplit(".",1)[0] +u = str(uuid.uuid1()).split("-")[0] + +# propagate = argparse.ArgumentParser(description='Example 1: python3 PropagAtE_run.py -f scaffolds.fasta -r forward.fastq reverse.fastq -o output.tsv -v prophage_coordinates.tsv -t threads -i ID . . . 
Example 2: python3 PropagAtE_run.py -s alignment.sam -o output.tsv -v prophage_coordinates.tsv') +# propagate.add_argument('--version', action='version', version='PropagAtE v1.0.0') + +# # Input required +# propagate.add_argument('-s', type=str, nargs=1, default = [''], help='input SAM (.sam) sequence alignment file (skip -sb/-b/-r/-f).') +# propagate.add_argument('-b', type=str, nargs=1, default = [''], help='input BAM (.bam) sequence alignment file (skip -sb/-s/-r/-f).') +# propagate.add_argument('-sb', type=str, nargs=1, default = [''], help='input sorted BAM (.bam) sequence alignment file (skip -b/-s/-r/-f).') +# propagate.add_argument('-r', type=str, nargs=2, default = ['',''], help='input forward and reverse read files (.fastq) separated by a space (skip -sb/-b/-s, requires -f)') +# propagate.add_argument('-v', type=str, nargs=1, required = True, help='VIBRANT "integrated_prophage_coordinates" file or custom prophage coordinates file.') + +# # Optional +# propagate.add_argument('-o', type=str, nargs=1, default = ['PropagAtE_result.' + str(u) + '.tsv'], help='name of output tab-separated file for results.') +# propagate.add_argument('-f', type=str, nargs=1, default = [''], help='if input is -r provide genomes/scaffolds (.fna, .fastq, .fa, .fsa).') +# propagate.add_argument('-t', type=str, nargs=1, default = ['1'], help='threads used for Bowtie2 mapping (if input is -r and -f).') +# propagate.add_argument('-i', type=str, nargs=1, default = [''], help='unique ID to append to Bowtie2 SAM output for reference to read sets used (not requied, useful when analyzing multiple read sets).') +# propagate.add_argument('-p', type=str, nargs=1, default = ['0.05'], help='p-value cutoff for Mann-Whitney statistical test (default=0.05).') +# propagate.add_argument('-g', type=str, nargs=1, default = ['1'], choices=["0", "1", "2", "3"], help='number of gap extensions allowed in read alignment (per read) [default=1, options=0-3].') +# propagate.add_argument('-m', type=str, nargs=1, default = ['3'], choices=["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"], help='number of mismatches allowed in read alignment (per read) [default=3, options=0-10].') +# propagate.add_argument('-n', type=str, nargs=1, default = ['5'], help='number of coverage sub-samples to take for Mann-Whitney statistical analysis [default=5, minimum=5].') +# propagate.add_argument('-a', type=str, nargs=1, default = ['4'], choices=["0", "1", "2", "3", "4"], help='remove outlier coverage values "a" standard deviations from the average [default=4].') +# propagate.add_argument('-e', type=str, nargs=1, default = ['0.75'], help="minimum effect size for significance by Cohen's d test [default=0.75, minimum=0.6].") +# propagate.add_argument('-c', type=str, nargs=1, default = ['1.65'], help="minimum prophage:host coverage ratio for significance [default=1.65, minimum=1.5].") +# propagate.add_argument('-y', type=str, nargs=1, default = [''], help='term to distinguish host from prophage name in genome/scaffold definition lines [default="_fragment_"].') +# propagate.add_argument('-x', type=str, nargs=1, default = [''], help="path to Samtools executable if not in PATH.") +# propagate.add_argument('-clean', action='store_true', help='use this setting to remove (if applicable) any generated SAM, unsorted BAM and Bowtie2 index files. 
+#   Will retain any user input files, raw reads and sorted BAM files (default=off).')
+# propagate.add_argument('-all', action='store_true', help='use this setting to keep all aligned reads (ignore -g and -m, default=off).')
+# propagate.add_argument('-z', type=str, nargs=1, default = [''], help='character(s) used to replace spaces in genome/scaffold names. Use this setting if spaces were replaced in the alignment file (SAM/BAM) but not in the prophage coordinates file.')
+
+# Parse arguments (inputs are hard-coded below in this modified copy)
+args = False
+samfile = ''
+bamfile = ''
+sortbamfile = "SRR828661_das_check_bin_abundance.gap-1_mm-3.sorted.bam"
+fasta = ''
+forward = ''
+reverse = ''
+threads = ''
+vibe = "propagate_coordinates"
+id = ''
+pval = 0.05
+gaps = 1
+mismatches = 3
+subs = 5
+outliers = 4
+effect = 0.75
+ratio_cutoff = 1.65
+termsplit = ''
+###Making a split for the bin as well:
+binsplit = '_k119_'
+outfile = "test.tsv"
+outpath = "Test_Bowtie"  # concatenated directly as a filename prefix below
+clean = False
+args_all = False
+
+try:
+    temp_out = str(outfile).rsplit("/",1)[1]
+except Exception:
+    temp_out = str(outfile)
+outfile = outpath + str(temp_out)
+sampath = '/usr/local/bin/samtools'
+spaces = ''
+replace = False
+if termsplit == '':
+    termsplit = "_fragment_"
+if spaces == '':
+    spaces = '~~'
+else:
+    replace = True
+if sampath != '':
+    if "samtools" == sampath[-8:]:
+        sampath = sampath[:-8]
+    if sampath[-1] != '/':
+        sampath += '/'
+
+# log file
+logfilename = str(outfile).rsplit(".",1)[0] + ".log"
+subprocess.run("rm " + logfilename + " 2> /dev/null", shell=True)
+logging.basicConfig(filename=logfilename, level=logging.INFO, format='%(message)s')
+
+# verify inputs and versions
+if termsplit[0] == " ":
+    sys.stderr.write("\nThe -y input term cannot start with a space. If you tried '-y -gene' then try '-y-gene' instead. Exiting.\n")
+    logging.info("\nThe -y input term cannot start with a space. If you tried '-y -gene' then try '-y-gene' instead. Exiting.\n")
+    exit(1)
+if outfile.rsplit(".",1)[1] == "log":
+    sys.stderr.write("\nThe output file cannot end in .log, suggested is .tsv. Exiting.\n")
+    logging.info("\nThe output file cannot end in .log, suggested is .tsv. Exiting.\n")
+    exit(1)
+if (samfile != '' and bamfile != '') or (samfile != '' and forward != '') or (bamfile != '' and forward != '') or (samfile != '' and sortbamfile != '') or (sortbamfile != '' and bamfile != '') or (forward != '' and sortbamfile != ''):
+    sys.stderr.write("\nOnly one input file (-s, -sb, -b, -r) is allowed. Exiting.\n")
+    logging.info("\nOnly one input file (-s, -sb, -b, -r) is allowed. Exiting.\n")
+    exit(1)
+try:
+    if (forward.rsplit('.',1)[1] != 'fastq' or reverse.rsplit('.',1)[1] != 'fastq') and ('.'.join(forward.rsplit('.',2)[1:]) != 'fastq.gz' or '.'.join(reverse.rsplit('.',2)[1:]) != 'fastq.gz'):
+        sys.stderr.write("\nError: Provided reads files must both have the extension .fastq or .fastq.gz. Exiting.\n")
+        logging.info("\nError: Provided reads files must both have the extension .fastq or .fastq.gz. Exiting.\n")
+        exit(1)
+except Exception:
+    pass
+try:
+    if forward != '' and reverse != '' and fasta.rsplit('.',1)[1] != 'fa' and fasta.rsplit('.',1)[1] != 'fna' and fasta.rsplit('.',1)[1] != 'fasta' and fasta.rsplit('.',1)[1] != 'fsa':
+        sys.stderr.write("\nError: Provided fasta file must have the extension .fasta, .fna, .fa or .fsa. Exiting.\n")
+        logging.info("\nError: Provided fasta file must have the extension .fasta, .fna, .fa or .fsa. 
Exiting.\n") + exit(1) +except Exception: + pass +if forward != '' and reverse != '' and fasta == '': + sys.stderr.write("\nError: Reads (-r) were provided but a sequence file (-f) was not. Exiting.\n") + logging.info("\nError: Reads (-r) were provided but a sequence file (-f) was not. Exiting.\n") + exit(1) +try: + if samfile.rsplit('.',1)[1] != 'sam': + sys.stderr.write("\nError: Provided sam file must have the extension .sam. Exiting.\n") + logging.info("\nError: Provided sam file must have the extension .sam. Exiting.\n") + exit(1) +except Exception: + pass +try: + if bamfile.rsplit('.',1)[1] != 'bam': + sys.stderr.write("\nError: Provided bam file must have the extension .bam. Exiting.\n") + logging.info("\nError: Provided bam file must have the extension .bam. Exiting.\n") + exit(1) +except Exception: + pass +try: + if sortbamfile.rsplit('.',1)[1] != 'bam': + sys.stderr.write("\nError: Provided bam (sorted) file must have the extension .bam. Exiting.\n") + logging.info("\nError: Provided bam (sorted) file must have the extension .bam. Exiting.\n") + exit(1) +except Exception: + pass + +if not os.path.exists(vibe): + sys.stderr.write("\nError: could not identify prophage coordinates file (-v). Exiting.\n") + logging.info("\nError: could not identify prophage coordinates file (-v). Exiting.\n") + exit(1) +if effect < 0.6: + sys.stderr.write("\nError: Cohen's d effect size should not be set below 0.6. Exiting.\n") + logging.info("\nError: Cohen's d effect size should not be set below 0.6. Exiting.\n") + exit(1) +if ratio_cutoff < 1.5: + sys.stderr.write("\nError: ratio cutoff should not be set below 1.5. Exiting.\n") + logging.info("\nError: ratio cutoff should not be set below 1.5. Exiting.\n") + exit(1) +if subs < 5: + sys.stderr.write("\nError: subsample number should not be set below 5. Exiting.\n") + logging.info("\nError: subsample number should not be set below 5. Exiting.\n") + exit(1) +try: + subprocess.check_output("which " + sampath + "samtools", shell=True) +except Exception: + sys.stderr.write("\nError: samtools does not appear to be installed, is not in the system's PATH or is not in the indicated path (-x). Exiting.\n") + logging.info("\nError: samtools does not appear to be installed, is not in the system's PATH or is not in the indicated path (-x). Exiting.\n") + exit(1) +try: + subprocess.check_output(sampath + "samtools depth -a placeholder.bam 2> " + outpath + u + "_samtools_checkfile.txt", shell=True) +except Exception as e: + with open(outpath + u + "_samtools_checkfile.txt",'r') as checkfile: + for line in checkfile: + if "depth: invalid option -- 'a'" in line: + sys.stderr.write("\nError: samtools version is incorrect. Please update and verify samtools '-a' flag is available.") + sys.stderr.write("\n Alternatively use the '-x' flag to provide the location of the correct samtools executable.") + sys.stderr.write("\n Example update using conda: 'conda install -c bioconda samtools'. Exiting.\n") + logging.info("\nError: samtools version is incorrect. 
Please update and verify samtools '-a' flag is available.") + logging.info("\n Alternatively use the '-x' flag to provide the location of the correct samtools executable.") + subprocess.run("rm " + outpath + u + "_samtools_checkfile.txt 2> /dev/null", shell=True) + exit(1) +subprocess.run("rm " + outpath + u + "_samtools_checkfile.txt 2> /dev/null", shell=True) + +if forward == '' and fasta != '': + logging.info("Note: Reads not provided, ignoring input fasta file.\n") +if forward == '' and threads != '1': + logging.info("Note: Reads not provided, using 1 thread.\n") +if forward == '' and id != '': + logging.info("Note: Reads not provided, ignoring identifier ID input.\n") + +# Statistics +def MWdownsampling(phage, background): + n = 1 + pvalue_list = [] + while n <= subs: + cov_p_100 = random.sample(phage, 100) + cov_h_100 = random.sample(background, 100) + if sum(cov_p_100) != 0 and sum(cov_h_100) != 0: + stat, pvalue = mannwhitneyu(cov_p_100, cov_h_100) + pvalue_list.append(pvalue) + n += 1 + result = combine_pvalues(pvalue_list) + return result + +def cohenD(phage_mean, phage_sd, host_mean, host_sd): + pool = math.sqrt((phage_sd**2+host_sd**2)/2) + d = (host_mean-phage_mean)/pool + return abs(d) + + +##### ----------------------------------------------------------------------------------------------------------------------- ##### +logging.info("Command: %s" % ' '.join(sys.argv)) +logging.info("Outfolder: %s" % outpath) +logging.info("") +logging.info("Date: %s" % str(date.today())) +logging.info("Time: %s" % str(datetime.datetime.now().time()).rsplit(".",1)[0]) +logging.info("Program: PropagAtE v1.0.0") +logging.info("Run ID: %s\n" % u) + +logging.info("Time (min) | Log ") +logging.info("--------------------------------------------------------------------") + +reads_input = False +sam_input = False +bam_input = False +sortbam_input = False + +# If input is reads/fasta run Bowtie2 +if forward != '' and fasta != '': + reads_input = True + logging.info("%s Reads input identified, using %s threads to run Bowtie2." % (str(round((time.time() - float(start))/60,1)),threads)) + + try: + subprocess.check_output("which bowtie2", shell=True) + except Exception: + sys.stderr.write("\nError: Bowtie2 does not appear to be installed or is not in the system's PATH. Exiting.\n") + logging.info("\nError: Bowtie2 does not appear to be installed or is not in the system's PATH. Exiting.\n") + exit(1) + try: + temp = fasta.rsplit("/",1)[1] + base = temp.rsplit(".",1)[0] + except Exception: + base = fasta.rsplit(".",1)[0] + + logging.info("%s Checking format of FASTA file, replacing spaces if necessary." % str(round((time.time() - float(start))/60,1))) + with open(fasta, 'r') as fasta_in, open(outpath + base + ".no-spaces.fasta", 'w') as fasta_out: + for name,seq in SimpleFastaParser(fasta_in): + if " " in name: + replace = True + name = name.replace(" ", str(spaces)) + fasta_out.write(">" + str(name) + "\n" + str(seq) + "\n") + if replace == True: + logging.info('%s Spaces identified in sequence names. Replaced with "~~".' % str(round((time.time() - float(start))/60,1))) + fasta = outpath + base + ".no-spaces.fasta" + + if not os.path.exists(outpath + str(base)+".bowtie2.index.1.bt2"): + logging.info("%s Building index ..." % str(round((time.time() - float(start))/60,1))) + build = True + subprocess.run("bowtie2-build " + str(fasta) + " " + outpath + str(base)+".bowtie2.index > /dev/null 2> /dev/null", shell=True) + else: + build = False + logging.info("%s Existing index identified, skipping build ..." 
% str(round((time.time() - float(start))/60,1))) + logging.info("%s Mapping reads ..." % str(round((time.time() - float(start))/60,1))) + if id != '': + subprocess.run("bowtie2 -x " + outpath + str(base)+".bowtie2.index -1 " + str(forward) + " -2 " + str(reverse) + " -S " + outpath + str(base)+"." + str(id) + ".sam -q -p " + str(threads) + " --no-unal > /dev/null 2> /dev/null", shell=True) + else: + subprocess.run("bowtie2 -x " + outpath + str(base)+".bowtie2.index -1 " + str(forward) + " -2 " + str(reverse) + " -S " + outpath + str(base)+".sam -q -p " + str(threads) + " --no-unal > /dev/null 2> /dev/null", shell=True) + if clean == True and build == True: + subprocess.run("rm " + outpath + str(base)+".bowtie2.index.* 2> /dev/null", shell=True) + logging.info("%s Bowtie2 done." % str(round((time.time() - float(start))/60,1))) + if str(id) != "": + samfile = outpath + str(base)+"." + str(id) + ".sam" + else: + samfile = outpath + str(base)+".sam" + subprocess.run("rm " + outpath + base + ".no-spaces.fasta 2> /dev/null", shell=True) + +# Read in prophage coordinate data +prophage_dict = {} +prophage_dict_frags = {} +vibrant_genomes_full = [] +logging.info("%s Generating a list of all prophage regions ..." % str(round((time.time() - float(start))/60,1))) +logging.info("%s Distinguishing host from prophage names by the term '%s' ..." % (str(round((time.time() - float(start))/60,1)),str(termsplit))) +with open(vibe, 'r') as vibrant: + phage_list = vibrant.read().replace("\n","\t").split("\t") + if phage_list[0] == "scaffold" and phage_list[1] == "fragment" and phage_list[5] == "nucleotide start" and phage_list[6] == "nucleotide stop": + n = 9 + while n < len(phage_list): + name = str(phage_list[n]).split(termsplit)[0] + if replace == True: + name = name.replace(" ", str(spaces)) + if name[0] == "'" or name[0] == '"': + name = name[1:] + if replace == True: + frag = str(phage_list[n]).replace(" ", str(spaces)) + else: + frag = str(phage_list[n]) + if frag[0] == "'" or frag[0] == '"': + frag = frag[1:-1] + + if n == 9: # in case of error + save_name = name + save_frag = frag + if name != '' and frag != '' and name == frag: # names were unable to be split + sys.stderr.write("\nError: prophage names could not be distinguished from host names. Check accuracy of -y flag input. Below are the names that caused the issue. Exiting.\n") + sys.stderr.write("Host name detected: %s\n" % name) + sys.stderr.write("Prophage name detected: %s\n" % frag) + sys.stderr.write("-y flag input detected: %s\n" % termsplit) + logging.info("\nError: prophage names could not be distinguished from host names. Check accuracy of -y flag input. Below are the names that caused the issue. 
Exiting.\n") + logging.info("Host name detected: %s" % name) + logging.info("Prophage name detected: %s" % frag) + logging.info("-y flag input detected: %s" % termsplit) + exit(1) + + prophage_dict.update({str(name) + "~" + str(phage_list[n+4]):str(frag)}) + prophage_dict.update({str(name) + "~" + str(phage_list[n+5]):str(frag)}) + prophage_dict_frags.update({str(name):str(frag)}) + vibrant_genomes_full.append(str(name)) + + n += 8 + + elif phage_list[0] == "scaffold" and phage_list[1] == "fragment" and phage_list[2] == "start" and phage_list[3] == "stop": + n = 5 + while n < len(phage_list): + name = str(phage_list[n]).split(termsplit)[0] + if replace == True: + name = name.replace(" ", str(spaces)) + if name[0] == "'" or name[0] == '"': + name = name[1:] + if replace == True: + frag = str(phage_list[n]).replace(" ", str(spaces)) + else: + frag = str(phage_list[n]) + if frag[0] == "'" or frag[0] == '"': + frag = frag[1:-1] + + if n == 5: # in case of error + save_name = name + save_frag = frag + if name != '' and frag != '' and name == frag: # names were unable to be split + sys.stderr.write("\nError: prophage names could not be distinguished from host names. Check accuracy of -y flag input. Below are the names that caused the issue. Exiting.\n") + sys.stderr.write("Host name detected: %s\n" % name) + sys.stderr.write("Prophage name detected: %s\n" % frag) + sys.stderr.write("-y flag input detected: %s\n" % termsplit) + logging.info("\nError: prophage names could not be distinguished from host names. Check accuracy of -y flag input. Below are the names that caused the issue. Exiting.\n") + logging.info("Host name detected: %s" % name) + logging.info("Prophage name detected: %s" % frag) + logging.info("-y flag input detected: %s" % termsplit) + exit(1) + + prophage_dict.update({str(name) + "~" + str(phage_list[n+1]):str(frag)}) + prophage_dict.update({str(name) + "~" + str(phage_list[n+2]):str(frag)}) + prophage_dict_frags.update({str(name):str(frag)}) + vibrant_genomes_full.append(str(name)) + + n += 4 + + else: + sys.stderr.write("\nError: -v coordinates file appears to be incorrect. Either input a VIBRANT 'integrated_prophage_coordinates' file or a custom tab separated coordinates file with the header 'scaffold/fragment/start/stop'. Exiting.\n") + logging.info("\nError: -v coordinates file appears to be incorrect. Either input a VIBRANT 'integrated_prophage_coordinates' file or a custom tab separated coordinates file with the header 'scaffold/fragment/start/stop'. Exiting.\n") + exit(1) + + number_hosts = list(set(vibrant_genomes_full)) + logging.info("%s Number of prophage regions identified: %s" % (str(round((time.time() - float(start))/60,1)),len(vibrant_genomes_full))) + logging.info("%s Number of unique host regions identified: %s" % (str(round((time.time() - float(start))/60,1)),len(number_hosts))) + vibrant_genomes_full = list(set(vibrant_genomes_full)) + +###### Make a bin list not scaffold list + +bin_list = [] + +for i in vibrant_genomes_full: + name = str(i).split(binsplit)[0] + bin_list.append(str(name)) + +bin_list = list(set(bin_list)) + +# process sam/bam files +logging.info("%s Processing alignment (.sam/.bam) file." 
% str(round((time.time() - float(start))/60,1))) + +# Gap/mismatch processing for sam file, convert sam to bam +processed = False +if samfile != '': + sam_input = True + try: + temp = samfile.rsplit("/",1)[1] + base = temp.rsplit(".",1)[0] + except Exception: + base = samfile.rsplit(".",1)[0] + + if args_all == False: + processed = True + logging.info("%s Alignment .sam input identified, processing gaps/mismatches ..." % str(round((time.time() - float(start))/60,1))) + tossed = 0 + kept = 0 + with open(samfile, 'r') as sf, open(outpath + str(base)+'.gap-' + str(gaps) + '_mm-' + str(mismatches) + '.sam', 'w') as psf: + for line in sf: + if "XG:i:" in line and "XM:i:" in line: + xg = line.split("\t")[15] + xg = xg.split(":")[2] + xm = line.split("\t")[13] + xm = xm.split(":")[2] + if int(xg) <= gaps and int(xm) <= mismatches: + psf.write(line) + kept += 1 + else: + tossed += 1 + else: + psf.write(line) + if reads_input == True and clean == True: + subprocess.run("rm " + str(samfile), shell=True) # remove non-processed sam + samfile = outpath + str(base)+'.gap-' + str(gaps) + '_mm-' + str(mismatches) + '.sam' + try: + temp = samfile.rsplit("/",1)[1] + base = temp.rsplit(".",1)[0] + except Exception: + base = samfile.rsplit(".",1)[0] + logging.info("%s %s of %s total aligned reads were tossed for having too many gaps/mismatches ..." % (str(round((time.time() - float(start))/60,1)),str(tossed),str(tossed+kept))) + else: + logging.info("%s Alignment .sam input identified, skipping gaps/mismatches processing ..." % str(round((time.time() - float(start))/60,1))) + + logging.info("%s Converting .sam to .bam ..." % str(round((time.time() - float(start))/60,1))) + subprocess.run(sampath + "samtools view -S -b " + str(samfile) + " > " + outpath + str(base) + ".bam 2> /dev/null", shell=True) + logging.info("%s Conversion done." % str(round((time.time() - float(start))/60,1))) + bamfile = outpath + str(base) + ".bam" + +samfile_cleaned = False +if reads_input == True and clean == True: + samfile_cleaned = True + subprocess.run("rm " + str(samfile), shell=True) # remove processed or non-processed sam + +# Gap/mismatch processing for bam file +if bamfile != '': + bam_input = True + if sam_input == False and args_all == False: + try: + temp = bamfile.rsplit("/",1)[1] + base = temp.rsplit(".",1)[0] + except Exception: + base = bamfile.rsplit(".",1)[0] + logging.info("%s Alignment .bam input identified, coverting to .sam for gap/mismatch filtering ..." % str(round((time.time() - float(start))/60,1))) + subprocess.run(sampath + "samtools view -h -o " + outpath + str(base) + ".sam " + str(bamfile) + " 2> /dev/null", shell=True) + + samfile = outpath + str(base) + ".sam" + processed = True + logging.info("%s Alignment .sam input identified, processing gaps/mismatches ..." 
% str(round((time.time() - float(start))/60,1))) + tossed = 0 + kept = 0 + with open(samfile, 'r') as sf, open(outpath + str(base)+'.gap-' + str(gaps) + '_mm-' + str(mismatches) + '.sam', 'w') as psf: + for line in sf: + if "XG:i:" in line and "XM:i:" in line: + xg = line.split("\t")[15] + xg = xg.split(":")[2] + xm = line.split("\t")[13] + xm = xm.split(":")[2] + if int(xg) <= gaps and int(xm) <= mismatches: + psf.write(line) + kept += 1 + else: + tossed += 1 + else: + psf.write(line) + if clean == True: + subprocess.run("rm " + str(samfile), shell=True) # remove non-processed sam + samfile = outpath + str(base)+'.gap-' + str(gaps) + '_mm-' + str(mismatches) + '.sam' + try: + temp = samfile.rsplit("/",1)[1] + base = temp.rsplit(".",1)[0] + except Exception: + base = samfile.rsplit(".",1)[0] + logging.info("%s %s of %s total aligned reads were tossed for having too many gaps/mismatches ..." % (str(round((time.time() - float(start))/60,1)),str(tossed),str(tossed+kept))) + logging.info("%s Converting processed .sam to .bam ..." % str(round((time.time() - float(start))/60,1))) + subprocess.run(sampath + "samtools view -S -b " + str(samfile) + " > " + outpath + str(base) + ".bam 2> /dev/null", shell=True) + logging.info("%s Conversion done." % str(round((time.time() - float(start))/60,1))) + bamfile = outpath + str(base) + ".bam" + + if clean == True: + samfile_cleaned = True + subprocess.run("rm " + str(samfile), shell=True) # remove processed or non-processed sam + + if sam_input == False and args_all == True: + logging.info("%s Alignment .bam input identified, sorting .bam file ..." % str(round((time.time() - float(start))/60,1))) + try: + temp = bamfile.rsplit("/",1)[1] + base = temp.rsplit(".",1)[0] + except Exception: + base = bamfile.rsplit(".",1)[0] + subprocess.run(sampath + "samtools sort -o " + outpath + str(base) + ".sorted.bam " + str(bamfile) + " 2> /dev/null", shell=True) + logging.info("%s Sorting done." % str(round((time.time() - float(start))/60,1))) + sortbam = outpath + str(base) + ".sorted.bam" + +bamfile_cleaned = False +if sam_input == True and clean == True: + bamfile_cleaned = True + subprocess.run("rm " + str(bamfile), shell=True) + +# Gap/mismatch processing for sorted bam file +if sortbamfile != '': + if args_all == False and processed == False: + try: + temp = sortbamfile.rsplit("/",1)[1] + base = temp.rsplit(".",1)[0] + except Exception: + base = sortbamfile.rsplit(".",1)[0] + + logging.info("%s Sorted .bam input identified, coverting to .sam for gap/mismatch filtering ..." % str(round((time.time() - float(start))/60,1))) + subprocess.run(sampath + "samtools view -h -o " + outpath + str(base) + ".sam " + str(sortbamfile) + " 2> /dev/null", shell=True) + samfile = outpath + str(base) + ".sam" + processed = True + logging.info("%s Processing gaps/mismatches ..." 
% str(round((time.time() - float(start))/60,1))) + tossed = 0 + kept = 0 + with open(samfile, 'r') as sf, open(outpath + str(base)+'.gap-' + str(gaps) + '_mm-' + str(mismatches) + '.sam', 'w') as psf: + for line in sf: + if "XG:i:" in line and "XM:i:" in line: + xg = line.split("\t")[15] + xg = xg.split(":")[2] + xm = line.split("\t")[13] + xm = xm.split(":")[2] + if int(xg) <= gaps and int(xm) <= mismatches: + psf.write(line) + kept += 1 + else: + tossed += 1 + else: + psf.write(line) + + if clean == True: + subprocess.run("rm " + str(samfile), shell=True) # remove non-processed sam + samfile = outpath + str(base)+'.gap-' + str(gaps) + '_mm-' + str(mismatches) + '.sam' + try: + temp = samfile.rsplit("/",1)[1] + base = temp.rsplit(".",1)[0] + except Exception: + base = samfile.rsplit(".",1)[0] + logging.info("%s %s of %s total aligned reads were tossed for having too many gaps/mismatches ..." % (str(round((time.time() - float(start))/60,1)),str(tossed),str(tossed+kept))) + + logging.info("%s Converting processed .sam to .bam ..." % str(round((time.time() - float(start))/60,1))) + subprocess.run(sampath + "samtools view -S -b " + str(samfile) + " > " + outpath + str(base) + ".bam 2> /dev/null", shell=True) + logging.info("%s Conversion done." % str(round((time.time() - float(start))/60,1))) + bamfile = outpath + str(base) + ".bam" + + if clean == True: + samfile_cleaned = True + subprocess.run("rm " + str(samfile), shell=True) # remove processed or non-processed sam + + logging.info("%s Sorting .bam file ..." % str(round((time.time() - float(start))/60,1))) + try: + temp = bamfile.rsplit("/",1)[1] + base = temp.rsplit(".",1)[0] + except Exception: + base = bamfile.rsplit(".",1)[0] + subprocess.run(sampath + "samtools sort -o " + outpath + str(base) + ".sorted.bam " + str(bamfile) + " 2> /dev/null", shell=True) + logging.info("%s Sorting done." % str(round((time.time() - float(start))/60,1))) + sortbam = outpath + str(base) + ".sorted.bam" + if clean == True: + samfile_cleaned = True + subprocess.run("rm " + str(bamfile), shell=True) # remove processed or non-processed sam + + sortbam = sortbamfile + if args_all == True: + try: + temp = sortbam.rsplit("/",1)[1] + base = temp.rsplit(".",1)[0] + except Exception: + base = sortbam.rsplit(".",1)[0] + logging.info("%s Sorted alignment .bam input identified, skipping sorting and gap/mismatch processing." % str(round((time.time() - float(start))/60,1))) + +# Extract coverage for each nucleotide +temp = outpath + str(base) + ".temp-list.txt" +subprocess.run("echo '" + str(sortbam) + "' > " + str(temp), shell=True) +cov_file = str(sortbam).rsplit(".",1)[0] + ".coverage-by-nt.tsv" +logging.info("%s Indexing sorted .bam file ..." % str(round((time.time() - float(start))/60,1))) +subprocess.run(sampath + "samtools index " + str(sortbam), shell=True) # index bam file +logging.info("%s Extracting coverage information from sorted .bam file ..." % str(round((time.time() - float(start))/60,1))) + +# take the whole scaffold +for genome in vibrant_genomes_full: + + subprocess.run(sampath + "samtools depth -r " + str(genome) + ":0-999999999 -a -f " + str(temp) + " >> " + cov_file + " 2> /dev/null", shell=True) # extract coverage info + +#subprocess.run("rm " + str(temp) + " 2> /dev/null", shell=True) +#subprocess.run("rm " + str(sortbam) + ".bai 2> /dev/null", shell=True) # remove index +subprocess.run("echo 'next\t0\t0\n' >> " + str(cov_file), shell=True) +logging.info("%s Coverage extraction done." 
% str(round((time.time() - float(start))/60,1))) + +##### Make a list of all the scafolds + +scaffold_file = str(sortbam).rsplit(".",1)[0] + ".scaffold_list.tsv" +scaffold_cov_file = str(sortbam).rsplit(".",1)[0] + ".scaffold-coverage-by-nt.tsv" +subprocess.run(sampath + "samtools idxstats " + str(sortbam) + " | cut -f 1 >> " + scaffold_file + " 2> /dev/null", shell=True) +scaffold_list = [] + +a_file = open(scaffold_file, "r") + +for line in a_file: + stripped_line = line.strip() + scaffold_list.append(stripped_line) + +a_file.close() + + +##### Take coverage for all scaffolds not just those that have prophages + + +for scaffold in scaffold_list: + + subprocess.run(sampath + "samtools depth -r " + str(scaffold) + ":0-999999999 -a -f " + str(temp) + " >> " + scaffold_cov_file + " 2> /dev/null", shell=True) # extract coverage info + +subprocess.run("echo 'next\t0\t0' >> " + str(scaffold_cov_file), shell=True) + +if int(os.stat(cov_file).st_size) < 50: # Exit if file is empty (< 50 bytes) + if samfile == "": + samfile = "N/A" + if bamfile == "": + bamfile = "N/A" + subprocess.run("rm " + str(cov_file), shell=True) + logging.info("%s No coverage identified for scaffolds with integrated prophages. Analysis finished." % str(round((time.time() - float(start))/60,1))) + logging.info("") + logging.info("") + logging.info("CAUTION:") + logging.info(" 'No coverage identified' may be due to an incorrect -y flag.") + logging.info(" Please verify -y input correctly distinguishes host from prophage sequences.") + logging.info(" This can be verified by checking the -v or -f inputs.") + logging.info(" Example host name detected: %s" % save_name) + logging.info(" Example prophage name detected: %s" % save_frag) + logging.info(" -y flag input: %s" % termsplit) + logging.info("") + logging.info("") + logging.info("Log file: %s" % logfilename) + logging.info("Output results file: N/A") + if samfile_cleaned == False: + logging.info("sam file: %s" % samfile) + else: + logging.info("sam file: N/A") + if bamfile_cleaned == False: + logging.info("bam file: %s" % bamfile) + else: + logging.info("bam file: N/A") + logging.info("sorted bam file: %s" % sortbam) + logging.info("") + logging.info("") + logging.info("Number of active prophages identified: 0") + logging.info("") + logging.info(' ##') + logging.info(' ## ##') + logging.info(' ## ##') + logging.info('###### ## ## ## ####### ###### ##### ## ##') + logging.info('## ## ## ## ## ## ## ## ## ## ##') + logging.info('###### ###### ###### ## ### ###### ### ##') + logging.info('## ## ## ## ## ## ## ## ## ##') + logging.info('## ## ## ## ## ####### ###### ##### ##') + logging.info(' # ## #') + logging.info(' # # ## # #') + logging.info(' # # # #') + logging.info(' # #') + logging.info("") + exit(0) + +# calculate average read length +total = subprocess.check_output(sampath + "samtools view -c " + str(sortbam), shell=True) # if there are less than 10k reads, use total reads to length check +total = int(float(str(total.strip()).split("'")[1])) +if int(total) < 10000: + head = total +else: + head = 10000 +length = subprocess.check_output(sampath + "samtools view " + str(sortbam) + " | head -n " + str(head) + " | cut -f 10 | wc -c", shell=True) +length = int(float(str(length.strip()).split("'")[1])/int(head)) +logging.info("%s Number of valid reads used: %s." % (str(round((time.time() - float(start))/60,1)),str(total))) +logging.info("%s Estimated read length to trim by: %s." 
% (str(round((time.time() - float(start))/60,1)),str(length))) + + + + +### Calculate the cover for every scaffold +scaffold_coverage = {} +b_file = open(scaffold_cov_file, "r") + +prev_genome = '' +host = False +temp_host = [] + +for line in b_file: + line = line.strip() + + + if str(line.split("\t")[0]) != str(prev_genome): + + if host == True: # scaffold is done. Update coverage + + if len(temp_host) > 0: + scaffold_coverage.update({str(prev_genome):(temp_host[:])}) # trim the last N nucleotides from the last section added + host = False + temp_host = [] + prev_genome = '' + + + if (line.split("\t")[0] != "next"): + + prev_genome = str(line.split("\t")[0]) + temp_host.append(str(line.split("\t")[0])) + + temp_host.append(str(line.split("\t")[2]).strip("\n")) + host = True + + + + else: + + temp_host.append(str(line.split("\t")[2]).strip("\n")) + +b_file.close() + +###Add up all the coverages of scaffolds per bin +bin_coverage = {} +for i in bin_list: + bin_coverage.update({str(i):[]}) + +for key in scaffold_coverage.keys(): + host = str(key.split(binsplit)[0]) + + if host in bin_coverage.keys(): + + old_line = bin_coverage[host][:] + new_line = scaffold_coverage[key][1:] + combined_list = old_line + new_line + bin_coverage[host] = combined_list + + +##I will still run this to get a dictionary of "prophage coverage" + +#grab only relevant genomes and count genome lengths +prophage_coverage = {} +host_coverage = {} +temp_host = [] +temp_phage = [] +with open(cov_file, 'r') as covfile: + logging.info("%s Identifying coverages for host/prophage regions ..." % str(round((time.time() - float(start))/60,1))) + prev_genome = '' + n = 0 + host = False + phage = False + host_start = False + phage_start = False + host_name = False + phage_name = False + for line in covfile: + if str(line.split("\t")[0]) != str(prev_genome): + # scaffold is done. Update coverage + if host == True: + if len(temp_host) > 0: + host_coverage.update({str(prev_genome):temp_host[:-length]}) # trim the last N nucleotides from the last section added + if len(temp_phage) > 0: + prophage_coverage.update({prophage_dict_frags[str(prev_genome)]:temp_phage}) + elif phage == True: + if len(temp_phage) > 0: + prophage_coverage.update({prophage_dict_frags[str(prev_genome)]:temp_phage[:-length]}) # trim the last N nucleotides from the last section added + if len(temp_host) > 0: + host_coverage.update({str(prev_genome):temp_host}) + n = 0 + temp_phage = [] + temp_host = [] + host = False + phage = False + host_name = False + phage_name = False + prev_genome = str(line.split("\t")[0]) + + n += 1 # needs to come after previous statement of writing and length counting + + if (line.split("\t")[0] in vibrant_genomes_full or line.split("\t")[0] == "next"): # trim by first length, only include phages, "next" is for including last scaffold + if str(line.split("\t")[0]) + "~" + str(line.split("\t")[1]) not in prophage_dict.keys() and n == 1: # is the first nucleotide host? 
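+                # Note on the parsing below: prophage_dict is keyed by
+                # "scaffold~position" for each recorded prophage start/stop
+                # coordinate, so membership tests against it mark the boundaries
+                # where the state machine switches between host and prophage
+                # regions; at n == 1 they decide whether a scaffold opens in a
+                # host or a prophage region.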
+ if host_name == False: + temp_host.append(str(line.split("\t")[0])) + host_name = True + temp_host.append(str(line.split("\t")[2]).strip("\n")) + host = True + phage = False + elif str(line.split("\t")[0]) + "~" + str(line.split("\t")[1]) in prophage_dict.keys() and n == 1: + if phage_name == False: + temp_phage.append(prophage_dict[str(line.split("\t")[0]) + "~" + str(line.split("\t")[1])]) + phage_name = True + temp_phage.append(str(line.split("\t")[2]).strip("\n").strip("\n")) + phage = True + host = False + + if str(line.split("\t")[0]) + "~" + str(line.split("\t")[1]) not in prophage_dict.keys() and n > 1: + if phage == True: + if phage_name == False: + temp_phage.append(prophage_dict[str(line.split("\t")[0]) + "~" + str(line.split("\t")[1])]) + phage_name = True + temp_phage.append(str(line.split("\t")[2]).strip("\n")) + elif host == True: + if host_name == False: + temp_host.append(str(line.split("\t")[0])) + host_name = True + temp_host.append(str(line.split("\t")[2]).strip("\n")) + + if str(line.split("\t")[0]) + "~" + str(line.split("\t")[1]) in prophage_dict.keys() and n > 1: # is the nucleotide phage? + if host == True: # must mean that host is finished and phage starting + host = False + phage = True + if phage_name == False: + temp_phage.append(prophage_dict[str(line.split("\t")[0]) + "~" + str(line.split("\t")[1])]) + phage_name = True + temp_phage.append(str(line.split("\t")[2]).strip("\n")) + + elif phage == True: # phage must be done + if phage_name == False: + temp_phage.append(prophage_dict[str(line.split("\t")[0]) + "~" + str(line.split("\t")[1])]) + phage_name = True + temp_phage.append(str(line.split("\t")[2]).strip("\n")) + prophage_coverage.update({prophage_dict[str(line.split("\t")[0]) + "~" + str(line.split("\t")[1])]:temp_phage}) # write to phage coverage, reset phage + temp_phage = [] + host = True + phage = False + phage_name = False + +# Statistics and metrics + +### First I will make the statistics per bacterial bin to save some time: +bin_stats = {} +for key in bin_coverage: + + cov_h = ([int(i) for i in bin_coverage[key][1:]]) + avg_h = (statistics.mean(cov_h)) + sd_h = (statistics.stdev(cov_h)) + med_h = (statistics.median(cov_h)) + stat_collection = [cov_h, avg_h, sd_h, med_h] + bin_stats.update({str(key):stat_collection}) + #host_coverage.update({str(prev_genome):temp_host}) + + + +###I modified this section so that I can add in my host statistics + +logging.info("%s Coverage identification done." % str(round((time.time() - float(start))/60,1))) +logging.info("%s Performing statistical tests ..." 
% str(round((time.time() - float(start))/60,1)))
+total = []
+total_adj = []
+with open(outfile, 'w') as output:
+    output.write("prophage\thost\tactive\tMW_pvalue\tCohenD\tprophage-host_ratio\tmean_difference\tprophage_len\tprophage_mean_cov\tprophage_median_cov\tprophage_sd_cov\thost_len\thost_mean_cov\thost_median_cov\thost_sd_cov\n")
+    for key in prophage_coverage.keys():
+        host = str(key.split(binsplit)[0])
+        if host in bin_coverage.keys():
+            cov_p = [int(i) for i in prophage_coverage[key][1:]]
+            cov_h = bin_stats[host][0]
+
+            avg_p = statistics.mean(cov_p)
+            sd_p = statistics.stdev(cov_p)
+            avg_h = bin_stats[host][1]
+            sd_h = bin_stats[host][2]
+            # if outliers > 0:
+            #     cov_p = [x for x in cov_p if x >= (avg_p-outliers*sd_p) and x <= (avg_p+outliers*sd_p)]
+            #     cov_h = [x for x in cov_h if x >= (avg_h-outliers*sd_h) and x <= (avg_h+outliers*sd_h)]
+            #     #
+            #     avg_p = statistics.mean(cov_p)
+            #     sd_p = statistics.stdev(cov_p)
+            #     avg_h = statistics.mean(cov_h)
+            #     sd_h = statistics.stdev(cov_h)
+            med_h = bin_stats[host][3]  # index 3 of stat_collection is the median (index 2 is the standard deviation)
+            med_p = statistics.median(cov_p)
+            mean_diff = avg_p - avg_h
+            result = MWdownsampling(cov_p, cov_h)
+            if mean_diff > 0:
+                cd = cohenD(avg_p, sd_p, avg_h, sd_h)
+            else:
+                cd = 0
+            host_zero_ratio = False
+            if avg_h > 0:
+                ratio = avg_p/avg_h
+            else:
+                ratio = avg_p
+                if avg_p >= 0.5:
+                    host_zero_ratio = True
+            #
+            active = "no"
+            if (float(result[1]) <= float(pval) or str(result[1]) == 'nan') and cd >= effect and mean_diff > 0 and (ratio >= ratio_cutoff or host_zero_ratio == True):
+                active = "yes"
+                total.append("yes")
+            if len(cov_h) < 1000 or len(cov_p) < 1000: # minimum host/phage length is 1000 bp
+                active = 'NA'
+                active_adj = 'NA'
+            if replace == True:
+                output.write(str(key).replace(str(spaces), " ")+"\t"+str(host).replace(str(spaces), " ")+"\t"+str(active)+"\t"+str(result[1])+"\t"+str(cd)+"\t"+str(ratio)+"\t"+str(mean_diff)+"\t"+str(len(cov_p))+"\t"+str(avg_p)+"\t"+str(med_p)+"\t"+str(sd_p)+"\t"+str(len(cov_h))+"\t"+str(avg_h)+"\t"+str(med_h)+"\t"+str(sd_h)+"\n")
+            else:
+                output.write(str(key)+"\t"+str(host)+"\t"+str(active)+"\t"+str(result[1])+"\t"+str(cd)+"\t"+str(ratio)+"\t"+str(mean_diff)+"\t"+str(len(cov_p))+"\t"+str(avg_p)+"\t"+str(med_p)+"\t"+str(sd_p)+"\t"+str(len(cov_h))+"\t"+str(avg_h)+"\t"+str(med_h)+"\t"+str(sd_h)+"\n")
+
+
+
+        else:
+            if replace == True:
+                output.write(str(key).replace(str(spaces), " ")+"\t"+str(host).replace(str(spaces), " ")+"\tno_coverage\tna\tna\tna\tna\tna\tna\tna\tna\tna\tna\tna\tna\n")
+            else:
+                output.write(str(key)+"\t"+str(host)+"\tno_coverage\tna\tna\tna\tna\tna\tna\tna\tna\tna\tna\tna\tna\n")
+
+if samfile == "":
+    samfile = "N/A"
+if bamfile == "":
+    bamfile = "N/A"
+
+subprocess.run("rm " + str(cov_file), shell=True)
+logging.info("%s Statistical tests done. Analysis finished." 
% str(round((time.time() - float(start))/60,1))) +logging.info("") +logging.info("") +logging.info("Log file: %s" % logfilename) +logging.info("Output results file: %s" % outfile) +if samfile_cleaned == False: + logging.info("sam file: %s" % samfile) +else: + logging.info("sam file: N/A") +if bamfile_cleaned == False: + logging.info("bam file: %s" % bamfile) +else: + logging.info("bam file: N/A") +logging.info("sorted bam file: %s" % sortbam) +logging.info("") +logging.info("") +logging.info("Number of active prophages identified: %s" % len(total)) +logging.info("") +logging.info(' ##') +logging.info(' ## ##') +logging.info(' ## ##') +logging.info('###### ## ## ## ####### ###### ##### ## ##') +logging.info('## ## ## ## ## ## ## ## ## ## ##') +logging.info('###### ###### ###### ## ### ###### ### ##') +logging.info('## ## ## ## ## ## ## ## ## ##') +logging.info('## ## ## ## ## ####### ###### ##### ##') +logging.info(' # ## #') +logging.info(' # # ## # #') +logging.info(' # # # #') +logging.info(' # #') +logging.info("") + + +# +# +# diff --git a/VLP_analysis.R b/VLP_analysis.R new file mode 100644 index 0000000..18b7445 --- /dev/null +++ b/VLP_analysis.R @@ -0,0 +1,1042 @@ +#Open packages +library(dplyr) #Version ‘1.0.9’ +library(ggplot2) #Version ‘3.3.6’ +library(tidyr) #Version ‘1.2.0’ +library(stringr) #Version ‘1.4.0’ +library(permute) #Version ‘0.9.7’ +library(vegan) #Version ‘2.6.2’ +library(data.table) #Version ‘1.14.2’ +library(lattice) #Version ‘0.20.45’ +library(rstatix) #Version ‘0.7.0’ +library(grid) #Version ‘4.2.1’ +library(DESeq2) #Version 1.36.0’ +library(car) #Version ‘3.1.0’ +library(ape) #Version ‘5.6.2’ +library(reshape2) #Version ‘1.4.4’ +library(IRanges) #Version ‘2.30.0’ +library(GenomicRanges) #Version ‘1.48.0’ + +#Working from data +setwd("VLP_analysis/") +#Open all files in a data directory +temp <- list.files(pattern="*_readcount") #Make a temp variable for each of the readcount files +SRR_names <- substring(temp,1,9) #Make a vector of the SRR run by name in order files of temp +myfiles <- lapply(temp, function(x) read.delim(x, header = FALSE)) #Opens up each file +myfiles2 <- do.call(cbind, myfiles) #Binds all the files together in one dataframe +colnames(myfiles2) <- 1:48 #Give each column a number +counts_df <- myfiles2[,seq(2,48,2)] #Only odd numbered columns have read data for some reason +colnames(counts_df) <- SRR_names #The runs are in ordered based on temp file so name them +Contig <- myfiles2[,1] #use first column to get contigs name +bacterial_bin <- read.csv(file = '../GTDB-Tk_DAS_check_bin_modified.csv', header = FALSE) +demovir <- read.table(file = "demovir_taxonomy.tabular", sep = '\t', header = TRUE) +runinfo <- read.csv("../sequence_runs.csv", header = TRUE) + +count_matrix <- cbind(Contig, counts_df) + +totalNumReads <- read.delim("totalNumReads", header = FALSE, col.names = c("SRR", "Num_Reads")) +contigLength <- read.delim("ContigLength", header = FALSE, col.names = c("Contig", "Length")) + +### RPKM #### + +#reads per million +totalNumReads <- totalNumReads %>% + mutate(readsPM = Num_Reads/1000000) + +#contig per KB +contigLength <- contigLength %>% + mutate(contigLengthperKB = Length/1000) + +#Merge based on contig data +merged_count_matrix <- merge(contigLength, count_matrix, by="Contig") + +#Divide counts by contigLength per Kb +RPK <- merged_count_matrix[,4:27]/merged_count_matrix$contigLengthperKB +contignames <- merged_count_matrix$Contig +temp <- RPK +for (row in 1:nrow(totalNumReads)){ + index_name <- (totalNumReads$SRR[row]) + 
+
+#Merge based on contig data
+merged_count_matrix <- merge(contigLength, count_matrix, by="Contig")
+
+#Divide counts by contig length per kb
+RPK <- merged_count_matrix[,4:27]/merged_count_matrix$contigLengthperKB
+contignames <- merged_count_matrix$Contig
+temp <- RPK
+#Divide each sample's column by that sample's reads-per-million to finish the RPKM calculation
+for (row in 1:nrow(totalNumReads)){
+  index_name <- (totalNumReads$SRR[row])
+  index_number <- match(index_name, colnames(RPK))
+  new_column <- RPK[,index_number]/totalNumReads$readsPM[row]
+  temp[,index_number] <- new_column}
+
+RPKM <- cbind(contignames,temp)
+
+setwd('../results')
+write.table(RPKM, 'RPKM.tsv', sep = "\t", row.names=FALSE)
+
+
+#Heatmap
+
+#1 Get it in the right format (x-axis: day/seqrun y-axis: contig z: RPKM)
+RPKM <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+
+RPKM.long <- pivot_longer(data = RPKM, cols = -c(1), names_to = 'SRR', values_to = 'RPKM')
+colnames(RPKM.long)[which(names(RPKM.long) == 'contignames')] <- 'contig'
+RPKM.long <- RPKM.long %>%
+  mutate(lifestyle = case_when(str_detect(contig, "NODE") ~ 'lytic', str_detect(contig, "DAS") ~ 'prophage' ) )
+
+
+write.table(RPKM.long, 'RPKM_long.tsv', sep = "\t")
+RPKM.long <- read.table('RPKM_long.tsv', sep = "\t")
+
+##### log-transform RPKM abundances #####
+
+#Replace all zeros with NA so the log transformation is defined
+RPKM.long[RPKM.long == 0] <- NA
+
+RPKM.long$log.abundance <- log(RPKM.long$RPKM)
+
+write.table(RPKM.long, 'logRPKM_long.tsv', sep = "\t")
+RPKM.long <- read.table('logRPKM_long.tsv', sep = "\t")
+
+logRPKM.heatmap <- ggplot(data = RPKM.long, aes(x = SRR, y = contig, fill = log.abundance)) +
+  geom_tile() +
+  scale_fill_gradient2(low = "blue",
+                       mid = "white",
+                       high ="red",
+                       midpoint = 0,
+                       space = "Lab",
+                       na.value = "blue") +
+  theme(axis.text.x = element_text(angle = 90), axis.text.y = element_text(size = 0.4)) +
+  xlab(label = "Sequence Run")
+
+
+logRPKM.heatmap
+
+##### RELATIVE ABUNDANCE ####
+
+relative_abundance <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+
+
+for (column in 2:ncol(relative_abundance)){
+  total_normalized_counts_per_sequence_run <- sum(relative_abundance[,column])
+  relative_abundance[,column] <- relative_abundance[,column]/total_normalized_counts_per_sequence_run
+}
+
+#### RELATIVE ABUNDANCE ###
+write.table(relative_abundance, 'relative_abundance.tsv', sep = "\t")
+relative_abundance <- read.table('relative_abundance.tsv', sep = "\t")
+
+#### 1% RELATIVE ABUNDANCE ####
+
+#Check how many contigs actually contribute more than 1% of reads to at least one sample
+number_of_one_percent_contigs <- 0
+list_contigs_greater_than_1percent <- c()
+for (row in 1:nrow(relative_abundance)){
+
+  if (max(relative_abundance[row,-1]) > 0.01) {
+    number_of_one_percent_contigs <- number_of_one_percent_contigs + 1
+    list_contigs_greater_than_1percent <- c(list_contigs_greater_than_1percent, relative_abundance[row,1] )
+  }
+
+}
+
+#### 0.1% RELATIVE ABUNDANCE ####
+
+#Same check, but for contigs contributing more than 0.1% of reads to at least one sample
+number_of_0.1_percent_contigs <- 0
+list_contigs_greater_than_0.1percent <- c()
+
+for (row in 1:nrow(relative_abundance)){
+
+  if (max(relative_abundance[row,-1]) > 0.001) {
+    number_of_0.1_percent_contigs <- number_of_0.1_percent_contigs + 1
+    list_contigs_greater_than_0.1percent <- c(list_contigs_greater_than_0.1percent, relative_abundance[row,1] )
+  }
+
+}
+
+number_of_0.1_percent_contigs
+list_contigs_greater_than_0.1percent
+
+#0.1%
+logRPKM.long <- read.table('logRPKM_long.tsv', sep = "\t")
+logRPKM.long <- as_tibble(logRPKM.long)
+zero_one_percent_RPKM <- logRPKM.long[logRPKM.long$contig %in% list_contigs_greater_than_0.1percent,]
+write.table(zero_one_percent_RPKM, 'zero_one_percent_sam.tsv', sep = "\t")
+
+##Make the sequence runs in order using factors
+zero_one_percent_RPKM$SRR <- factor(zero_one_percent_RPKM$SRR, levels = c(runinfo$SRR))
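
# --- Editor's sketch (not part of the original patch): the RPKM normalization
# used above, reduced to a self-contained toy example. The object names
# (`toy_counts`, `lengths_bp`, `lib_sizes`) are illustrative, not from the scripts.
toy_counts <- data.frame(s1 = c(10, 200), s2 = c(5, 80))   # raw read counts per sample
lengths_bp <- c(2000, 50000)                               # contig lengths in bp
lib_sizes  <- c(1e6, 2e6)                                  # total reads per sample
rpk  <- toy_counts / (lengths_bp / 1000)                   # reads per kilobase
rpkm <- sweep(rpk, 2, lib_sizes / 1e6, "/")                # ...per million mapped reads
# Each value is now length- and library-size-corrected, matching the loop above.
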
+
+
+logRPKM.long.heatmap <- ggplot(data = logRPKM.long, aes(x = SRR, y = contig, fill = log.abundance)) +
+  geom_tile() +
+  scale_fill_gradient2(low = "blue",
+                       mid = "white",
+                       high ="red",
+                       midpoint = 0,
+                       space = "Lab",
+                       na.value = "blue") +
+  theme(axis.text.x = element_text(angle = 90), axis.text.y = element_text(size = 0.4)) +
+  xlab(label = "Sequence Run")
+
+
+#logRPKM.long.heatmap
+
+##Look at only prophages and 0.1%
+zero_one_percent_RPKM <- logRPKM.long[logRPKM.long$contig %in% list_contigs_greater_than_0.1percent,]
+zero_one_percent_RPKM$SRR <- factor(zero_one_percent_RPKM$SRR, levels = c(runinfo$SRR))
+prophage_zero_one_percent_RPKM <- zero_one_percent_RPKM %>% filter(zero_one_percent_RPKM$lifestyle == "prophage")
+
+prophage_zero_one_percent_RPKM.heatmap <- ggplot(data = prophage_zero_one_percent_RPKM, aes(x = SRR, y = contig, fill = log.abundance)) +
+  geom_tile() +
+  scale_fill_gradient2(low = "blue",
+                       mid = "white",
+                       high ="red",
+                       midpoint = 0,
+                       space = "Lab",
+                       na.value = "blue") +
+  theme(axis.text.x = element_text(angle = 90), axis.text.y = element_text(size = 0.4)) +
+  xlab(label = "Sequence Run")
+
+
+#prophage_zero_one_percent_RPKM.heatmap
+
+#Diversity
+
+#Change the SRR names to days
+x <- runinfo$ID[match(colnames(RPKM), runinfo$SRR)]
+colnames(RPKM) <- replace_na(x,'contig')
+shannon_diversity <- c()
+for (column in 2:(ncol(RPKM)-1)) {
+  temp <- diversity(RPKM[,column], MARGIN = 2, index = 'shannon')
+  shannon_diversity <- c(shannon_diversity, temp)
+}
+
+#Bray-Curtis dissimilarity
+RPKM <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+row.names(RPKM) <- RPKM[,1]
+RPKM <- RPKM[,-1]
+colnames(RPKM) <- runinfo$ID[match(colnames(RPKM), runinfo$SRR)]
+RPKM_transposed <- as.data.frame(t(RPKM))
+beta_dist <- vegdist(RPKM_transposed, method = "bray")
+beta_dist_clust <- hclust(beta_dist, method = "average")
+plot(beta_dist_clust)
+
+colnames(RPKM) <- runinfo$Time_days[match(colnames(RPKM), runinfo$SRR)]
+RPKM_transposed <- as.data.frame(t(RPKM))
+
+#Euclidean distance
+RPKM <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+row.names(RPKM) <- RPKM[,1]
+RPKM <- RPKM[,-1]
+colnames(RPKM) <- runinfo$Time_days[match(colnames(RPKM), runinfo$SRR)]
+RPKM_transposed <- as.data.frame(t(RPKM))
+euclid_dist <- as.matrix(dist(RPKM_transposed, method = "euclidean"))
+colnames(euclid_dist) <- colnames(RPKM)
+row.names(euclid_dist) <- colnames(RPKM)
+euclid_matrix <- as.data.frame(euclid_dist)
+euclid_matrix <- cbind(colnames(euclid_matrix), euclid_matrix)
+euclid_long <- pivot_longer(euclid_matrix, cols = euclid_matrix$`colnames(euclid_matrix)`)
+colnames(euclid_long) <- c('time1', 'time2', 'euc_dis')
+euclid_long$time1 <- as.numeric(euclid_long$time1)
+euclid_long$time2 <- as.numeric(euclid_long$time2)
+euclid_long$srt_time_lag <- sqrt(abs((euclid_long$time1 - euclid_long$time2)))
+plot(euclid_long$srt_time_lag, euclid_long$euc_dis)
+lm(euclid_long$euc_dis ~ euclid_long$srt_time_lag)
+abline(9921, 1557)
+##The result was 'directional change' in the virome, unlike what I would expect
+#So I will look at the top most abundant
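
# --- Editor's sketch (not part of the original patch): the time-decay analysis
# above regresses pairwise community distance on the square root of the time lag.
# A toy version with made-up distances; the `toy` data frame is illustrative.
toy <- data.frame(time1 = c(0, 0, 182), time2 = c(182, 851, 851))
toy$dist <- c(0.3, 0.6, 0.55)                      # pretend pairwise dissimilarities
toy$sqrt_lag <- sqrt(abs(toy$time1 - toy$time2))   # sqrt time lag, as in the script
fit <- lm(dist ~ sqrt_lag, data = toy)
coef(fit)  # a positive slope indicates directional change (distance grows with lag)
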
+
+RPKM <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+top100 <- c()
+for (column in 2:25){
+  x <- head(arrange(RPKM, desc(RPKM[,column])), n =175)[,1]
+  top100 <- c(top100, x)
+}
+
+most_abundant <- unique(top100)
+rpkm_most_abundant <- RPKM_transposed[, most_abundant]
+most_abdnt_euclid_dist <- as.matrix(dist(rpkm_most_abundant, method = "euclidean"))
+
+RPKM <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+row.names(RPKM) <- RPKM[,1]
+RPKM <- RPKM[,-1]
+colnames(RPKM) <- runinfo$Time_days[match(colnames(RPKM), runinfo$SRR)]
+
+colnames(most_abdnt_euclid_dist) <- colnames(RPKM)
+row.names(most_abdnt_euclid_dist) <- colnames(RPKM)
+most_abd_euclid_matrix <- as.data.frame(most_abdnt_euclid_dist)
+most_abd_euclid_matrix <- cbind(colnames(most_abd_euclid_matrix), most_abd_euclid_matrix)
+most_abd_euclid_long <- pivot_longer(most_abd_euclid_matrix, cols = most_abd_euclid_matrix$`colnames(most_abd_euclid_matrix)`)
+colnames(most_abd_euclid_long) <- c('time1', 'time2', 'euc_dis')
+most_abd_euclid_long$time1 <- as.numeric(most_abd_euclid_long$time1)
+most_abd_euclid_long$time2 <- as.numeric(most_abd_euclid_long$time2)
+most_abd_euclid_long$srt_time_lag <- sqrt(abs((most_abd_euclid_long$time1 - most_abd_euclid_long$time2)))
+plot(most_abd_euclid_long$srt_time_lag, most_abd_euclid_long$euc_dis)
+lm(most_abd_euclid_long$euc_dis ~ most_abd_euclid_long$srt_time_lag)
+
+abline(9919, 1557)
+
+
+##Next I will look at how the active prophages compare
+
+RPKM <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+RPKM <- as_tibble(RPKM)
+RPKM <- RPKM %>% mutate(lifestyle = case_when(str_detect(contignames, "NODE") ~ 'lytic', str_detect(contignames, "DAS") ~ 'temperate' ) )
+
+#I know it's a bad move, but the first 52 columns of RPKM_transposed are active prophages and the rest are the regular ones
+active_prophage_RPKM <- RPKM_transposed[,1:52]
+non_prophage_RPKM <- RPKM_transposed[,53:5890]
+
+active_prophage_euclid_dist <- as.matrix(dist(active_prophage_RPKM, method = "euclidean"))
+
+RPKM <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+row.names(RPKM) <- RPKM[,1]
+RPKM <- RPKM[,-1]
+colnames(RPKM) <- runinfo$Time_days[match(colnames(RPKM), runinfo$SRR)]
+
+colnames(active_prophage_euclid_dist) <- colnames(RPKM)
+row.names(active_prophage_euclid_dist) <- colnames(RPKM)
+active_prophage_euclid_matrix <- as.data.frame(active_prophage_euclid_dist)
+active_prophage_euclid_matrix <- cbind(colnames(active_prophage_euclid_matrix), active_prophage_euclid_matrix)
+active_prophage_euclid_long <- pivot_longer(active_prophage_euclid_matrix, cols = active_prophage_euclid_matrix$`colnames(active_prophage_euclid_matrix)`)
+colnames(active_prophage_euclid_long) <- c('time1', 'time2', 'euc_dis')
+active_prophage_euclid_long$time1 <- as.numeric(active_prophage_euclid_long$time1)
+active_prophage_euclid_long$time2 <- as.numeric(active_prophage_euclid_long$time2)
+active_prophage_euclid_long$srt_time_lag <- sqrt(abs((active_prophage_euclid_long$time1 - active_prophage_euclid_long$time2)))
+plot(active_prophage_euclid_long$srt_time_lag, active_prophage_euclid_long$euc_dis)
+lm(active_prophage_euclid_long$euc_dis ~ active_prophage_euclid_long$srt_time_lag)
+abline(139.8442,-0.7071)
+
+#Non-active prophages
+non_prophage_euclid_dist <- as.matrix(dist(non_prophage_RPKM, method = "euclidean"))
+
+RPKM <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+row.names(RPKM) <- RPKM[,1]
+RPKM <- RPKM[,-1]
+colnames(RPKM) <- runinfo$Time_days[match(colnames(RPKM), runinfo$SRR)]
+
+colnames(non_prophage_euclid_dist) <- colnames(RPKM)
+row.names(non_prophage_euclid_dist) <- colnames(RPKM)
+non_prophage_euclid_matrix <- as.data.frame(non_prophage_euclid_dist)
+non_prophage_euclid_matrix <- cbind(colnames(non_prophage_euclid_matrix), non_prophage_euclid_matrix)
+non_prophage_euclid_long <- pivot_longer(non_prophage_euclid_matrix, cols = non_prophage_euclid_matrix$`colnames(non_prophage_euclid_matrix)`)
+colnames(non_prophage_euclid_long) <- c('time1', 'time2', 'euc_dis')
+non_prophage_euclid_long$time1 <- as.numeric(non_prophage_euclid_long$time1)
+non_prophage_euclid_long$time2 <- as.numeric(non_prophage_euclid_long$time2)
+non_prophage_euclid_long$srt_time_lag <- sqrt(abs((non_prophage_euclid_long$time1 - non_prophage_euclid_long$time2)))
+plot(non_prophage_euclid_long$srt_time_lag, non_prophage_euclid_long$euc_dis)
+lm(non_prophage_euclid_long$euc_dis ~ non_prophage_euclid_long$srt_time_lag)
+abline(9919,1557)
+
+
+##Now try the same with Bray-Curtis distance
+
+#Bray-Curtis distance
+RPKM <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+row.names(RPKM) <- RPKM[,1]
+RPKM <- RPKM[,-1]
+colnames(RPKM) <- runinfo$Time_days[match(colnames(RPKM), runinfo$SRR)]
+RPKM_transposed <- as.data.frame(t(RPKM))
+beta_dist <- as.matrix(vegdist(RPKM_transposed, method = "bray"))
+colnames(beta_dist) <- colnames(RPKM)
+row.names(beta_dist) <- colnames(RPKM)
+beta_matrix <- as.data.frame(beta_dist)
+beta_matrix <- cbind(colnames(beta_matrix), beta_matrix)
+beta_long <- pivot_longer(beta_matrix, cols = beta_matrix$`colnames(beta_matrix)`)
+colnames(beta_long) <- c('time1', 'time2', 'beta_dis')
+beta_long$time1 <- as.numeric(beta_long$time1)
+beta_long$time2 <- as.numeric(beta_long$time2)
+beta_long$srt_time_lag <- sqrt(abs((beta_long$time1 - beta_long$time2)))
+plot(beta_long$srt_time_lag, beta_long$beta_dis)
+lm(beta_long$beta_dis ~ beta_long$srt_time_lag)
+abline(0.18549, 0.02483)
+
+
+
+#Look at the most abundant contigs
+
+RPKM <- read.table('RPKM.tsv', sep = "\t", header = TRUE)
+top100 <- c()
+for (column in 2:25){
+  x <- head(arrange(RPKM, desc(RPKM[,column])), n =100)[,1]
+  top100 <- cbind(top100, x)
+}
+
+colnames(top100) <- colnames(RPKM)[-1]
+
+#Look at the abundance of active prophages compared to non-active
+
+#Make a column for prophage or not
+RPKM <- as_tibble(RPKM)
+RPKM <- RPKM %>% mutate(lifestyle = case_when(str_detect(contignames, "NODE") ~ 'lytic', str_detect(contignames, "DAS") ~ 'temperate' ) )
+test <- aggregate(RPKM[,2:25], by=list(Category=RPKM$lifestyle), FUN=sum) #This sums the RPKM for all the contigs based on being temperate or lytic
+test <- t(test)
+colnames(test) <- test[1,]
+test <- test[-1,]
+test <- as.data.frame(test)
+test$lytic <- as.numeric(test$lytic)
+test$temperate <- as.numeric(test$temperate)
+test$total_RPKM <- test$lytic + test$temperate
+test$perc_lytic <- test$lytic / test$total_RPKM
+test$perc_temp <- test$temperate / test$total_RPKM
+
+#There are far more lytic contigs, so I also explored dividing by the number of contigs
+amnt_temp <- sum(RPKM$lifestyle == "temperate")
+amnt_lytic <- sum(RPKM$lifestyle == "lytic")
+
+test$lytic_averag <- test$lytic / amnt_lytic
+test$temp_averag <- test$temperate / amnt_temp
+
+test$day <- runinfo$Time_days[match(row.names(test), runinfo$SRR)]
+test$week <- runinfo$week[match(row.names(test), runinfo$SRR)]
+test_graph <- ggplot(test, aes(x=day, y=perc_temp)) + geom_point(aes(size = 30)) + theme(axis.text.x = element_text(size = 20))
+test_graph <- test_graph + facet_wrap(~ week, scales="free_x") + ylab(label = 'Relative Abundance of Active Prophages') + xlab(label = 'Day') + theme(strip.text = element_text(size =20)) + theme(axis.title = element_text(size = 30))
+
+test_graph
+ggsave(file="/Users/steven/GutMicrobiome/Publication_3_Viral_Metagenomics_Collab/Minot_Data/Third_Round_bins/Viral_Analysis/Reads_Per_Contig/figures/active_prophage_relative_abundance.svg", plot=test_graph, width=10, height=10)
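
# --- Editor's sketch (not part of the original patch): the aggregate() call above
# computes the temperate (active-prophage) share of total abundance per sample.
# The same idea in a toy dplyr form; all names here are illustrative.
library(dplyr)
toy <- data.frame(contig    = c("NODE_1", "NODE_2", "DAS_1"),
                  lifestyle = c("lytic", "lytic", "temperate"),
                  s1 = c(5, 3, 2), s2 = c(1, 1, 8))
shares <- toy %>%
  group_by(lifestyle) %>%
  summarise(across(c(s1, s2), sum)) %>%
  mutate(across(c(s1, s2), ~ .x / sum(.x)))  # per-sample fraction by lifestyle
shares  # the temperate row is the fraction of abundance from active prophages
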
+
+##### DESEQ2 #####
+
+#Format count data for DESEQ2
+#DESeqDataSetFromMatrix(countData = cts, colData = coldata, design = ~ condition)
+#countData
+cts <- count_matrix[,-1]
+rownames(cts) <- count_matrix[,1]
+head(cts,2)
+
+#colData
+orderSamples <- match(colnames(cts), runinfo$SRR)
+days <- c((runinfo$Time_days[orderSamples]))
+weeks <- c(runinfo$week[orderSamples])
+coldata <- cbind(days, weeks)
+colnames(coldata) <- c('days', 'week')
+rownames(coldata) <- runinfo$SRR[orderSamples]
+
+coldata <- as.data.frame(coldata)
+
+coldata$days <- factor(coldata$days, levels = unique(sort(coldata$days)))
+coldata$week <- factor(coldata$week, levels = unique(sort(coldata$week)))
+#Test that the order is correct, which is important for DESEQ
+all(rownames(coldata) %in% colnames(cts))
+all(rownames(coldata) == colnames(cts))
+
+#Now we can run it as stated above
+dds <- DESeqDataSetFromMatrix(countData = cts, colData = coldata, design = ~ days)
+
+#Now we can run it
+test <- DESeq(dds)
+res <- results(test)
+res
+summary(res)
+subset(res[order(row.names(res)),], padj<0.05)
+head(res[order(res$padj),])
+by_contig <- subset((subset(res[order(row.names(res)),], padj<0.05)), log2FoldChange>1)
+
+##This last step highlights the active prophages
+by_prophage <- subset(by_contig, str_detect(row.names(by_contig), "DAS"))
+prophages_induced <- data.frame(
+  row.names(by_prophage),
+  by_prophage$baseMean,
+  by_prophage$log2FoldChange,
+  by_prophage$lfcSE,
+  by_prophage$stat,
+  by_prophage$pvalue,
+  by_prophage$padj)
+colnames(prophages_induced) <- c("Phage", "baseMean", "log2FoldChange", "lfcSE",
+                                 "stat", "P-value","Adjusted p-value")
+#write.table(prophages_induced, 'DESEQ_induced_prophages.tsv', sep = "\t")
+
+vsdata <- varianceStabilizingTransformation(dds)
+day_PCA <- plotPCA(vsdata, intgroup="days")
+week_PCA <- plotPCA(vsdata, intgroup="week") + geom_point(size=0.5) +
+  theme(axis.text.x = element_text(size = 7), axis.text.y = element_text(size = 7), axis.title = element_text(size = 7)) +
+  theme(legend.key.size = unit(0.25, 'cm'), #change legend key size
+        legend.key.height = unit(0.25, 'cm'), #change legend key height
+        legend.key.width = unit(0.25, 'cm'), #change legend key width
+        legend.title = element_text(size=7), #change legend title font size
+        legend.text = element_text(size=7)) #change legend text font size
+week_PCA
+ggsave(file="day_PCA2.svg", plot=day_PCA, width=10, height=10)
+ggsave(file="week_PCA2_2.svg", plot=week_PCA, width=8, height=8, units = c("cm"), dpi = 600)
+
+#Now look at only the active prophages
+
+dds_prophages <- dds[1:52]
+vsdata_prophages <- varianceStabilizingTransformation(dds_prophages)
+
+plot_PCA_prophages <- plotPCA(vsdata_prophages, intgroup="week") +
+  geom_point(size=0.5) +
+  theme(axis.text.x = element_text(size = 7), axis.text.y = element_text(size = 7), axis.title = element_text(size = 7)) +
+  theme(legend.key.size = unit(0.25, 'cm'), #change legend key size
+        legend.key.height = unit(0.25, 'cm'), #change legend key height
+        legend.key.width = unit(0.25, 'cm'), #change legend key width
+        legend.title = element_text(size=7), #change legend title font size
+        legend.text = element_text(size=7)) #change legend text font size
+plot_PCA_prophages
+
+
+#ggsave(file="plot_PCA_all_phages.svg", plot=plot_PCA_all_phages, width=10, height=10) #plot_PCA_all_phages is never defined in this script
+ggsave(file="plot_PCA_prophages2.svg", plot=plot_PCA_prophages, width=8, height=10, units = c("cm"), dpi = 600)
+
+#Normalizing counts by the DESEQ size factors
+dds <- estimateSizeFactors(dds)
+sizeFactors(dds)
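
# --- Editor's sketch (not part of the original patch): the DESeq2 pattern used
# above, run end-to-end on a self-contained toy matrix (assumes DESeq2 is
# installed). The names `toy_cts`, `toy_col`, `toy_dds` are illustrative.
library(DESeq2)
set.seed(1)
toy_cts <- matrix(rnbinom(120, mu = 50, size = 2), nrow = 20,
                  dimnames = list(paste0("contig", 1:20), paste0("s", 1:6)))
toy_col <- data.frame(row.names = colnames(toy_cts),
                      day = factor(rep(c("d0", "d1", "d2"), each = 2)))
stopifnot(all(rownames(toy_col) == colnames(toy_cts)))  # same order check as above
toy_dds <- DESeqDataSetFromMatrix(countData = toy_cts, colData = toy_col, design = ~ day)
toy_res <- results(DESeq(toy_dds))
head(toy_res[order(toy_res$padj), ])  # contigs ranked by adjusted p-value
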
+
+norDESEQ_counts <- counts(dds, normalized=TRUE)
+norDESEQ_counts <- as.data.frame(norDESEQ_counts)
+norDESEQ_counts <- setDT(norDESEQ_counts, keep.rownames = 'contig')[]
+norDESEQ_counts <- norDESEQ_counts[-5891,]
+
+#Before also normalizing for contig length, make sure contigLength and norDESEQ_counts list the contigs in the same order
+all(contigLength$Contig %in% norDESEQ_counts$contig)
+all(contigLength$Contig == norDESEQ_counts$contig)
+
+#Normalize counts per contig length
+temp_dataframe <- norDESEQ_counts[,2:25]/contigLength$Length
+
+norDESEQ_counts <- cbind(norDESEQ_counts$contig, temp_dataframe)
+x <- colnames(norDESEQ_counts)
+x[1] <- 'contig'
+colnames(norDESEQ_counts) <- x
+
+#Get relative abundance
+norDESEQ_counts <- as.data.frame(norDESEQ_counts)
+for (x in 2:ncol(norDESEQ_counts)){
+  total_normalized_counts_per_sequence_run <- sum(norDESEQ_counts[,x])
+  norDESEQ_counts[,x] <- norDESEQ_counts[,x]/total_normalized_counts_per_sequence_run
+}
+
+
+norDESEQ_counts <- as.data.table(norDESEQ_counts)
+temperate_DESEQ <- norDESEQ_counts %>% mutate(lifestyle = case_when(str_detect(contig, "NODE") ~ 'lytic', str_detect(contig, "DAS") ~ 'temperate' ) )
+temperate_DESEQ <- temperate_DESEQ %>% filter(lifestyle == 'temperate') %>% select(!(lifestyle))
+temperate_DESEQ_melt <- melt(temperate_DESEQ, id.vars = c("contig"))
+
+
+temperate_DESEQ_melt$variable <- factor(temperate_DESEQ_melt$variable, levels = c(runinfo$SRR))
+temperate_DESEQ_melt$day <- as.factor(runinfo$ID[match(temperate_DESEQ_melt$variable, runinfo$SRR)])
+
+prophage_induction <- temperate_DESEQ_melt[,c('contig','value','variable')]
+colnames(prophage_induction) <- c("contig", "read_count", "SRR_columns")
+
+prophage_induction <- spread(prophage_induction, contig, read_count)
+prophage_induction <- as.data.frame(prophage_induction)
+prophage_induction <- prophage_induction[-25,]
+normalized_coverage <- cbind(runinfo$Time_days, runinfo$week, runinfo$ID, prophage_induction)
+
+x <- colnames(normalized_coverage)
+x[1] <- 'day'
+x[2] <- 'week'
+x[3] <- 'day-replicate'
+colnames(normalized_coverage) <- x
+normalized_coverage$`day-replicate` <- factor(normalized_coverage$`day-replicate`)
+normalized_coverage$SRR_columns <- factor(normalized_coverage$SRR_columns)
+normalized_coverage <- as.data.table(normalized_coverage)
+normalized_coverage_melted <- melt(normalized_coverage, id.vars= c('day', 'week', 'day-replicate', 'SRR_columns'))
+
+normalized_coverage_melted$variable <- str_match(normalized_coverage_melted$variable, "DAS_Check_Bin_..")
+bins_position <- match(normalized_coverage_melted$variable, bacterial_bin$V1)
+bin_names <- bacterial_bin$V9[bins_position]
+normalized_coverage_melted$variable <- bin_names
+p <- ggplot(normalized_coverage_melted, aes(x=day, y=value, col=variable)) + geom_point(aes(size = 15)) + theme(axis.text.x = element_text(size = 20))
+p <- p + facet_wrap(~ week, scales="free_x") + ylab(label = 'Normalized Coverage') + theme(strip.text = element_text(size =20)) + theme(axis.title = element_text(size = 30))
+p
+
+#Now revisit the DESEQ-normalized counts and look at what percentage of the community the active prophages make up
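
# --- Editor's sketch (not part of the original patch): what counts(dds,
# normalized = TRUE) does under the hood — each column of raw counts is divided
# by its DESeq2 size factor. Self-contained, using DESeq2's example generator.
library(DESeq2)
toy_dds <- makeExampleDESeqDataSet(n = 100, m = 6)  # small simulated dataset
toy_dds <- estimateSizeFactors(toy_dds)
manual <- sweep(counts(toy_dds), 2, sizeFactors(toy_dds), "/")
all.equal(manual, counts(toy_dds, normalized = TRUE))  # TRUE
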
+norDESEQ_counts <- counts(dds, normalized=TRUE)
+norDESEQ_counts <- as.data.frame(norDESEQ_counts)
+norDESEQ_counts <- setDT(norDESEQ_counts, keep.rownames = 'contig')[]
+norDESEQ_counts <- norDESEQ_counts[-5891,]
+write.table(norDESEQ_counts, 'norDESEQ_counts.tsv', sep = "\t", row.names=FALSE)
+norDESEQ_counts <- read.table('norDESEQ_counts.tsv', sep = "\t", header = TRUE)
+
+#Before also normalizing for contig length, make sure contigLength and norDESEQ_counts list the contigs in the same order
+all(contigLength$Contig %in% norDESEQ_counts$contig)
+all(contigLength$Contig == norDESEQ_counts$contig)
+
+#Normalize counts per contig length
+temp_dataframe <- norDESEQ_counts[,2:25]/contigLength$Length
+
+norDESEQ_counts <- cbind(norDESEQ_counts$contig, temp_dataframe)
+x <- colnames(norDESEQ_counts)
+x[1] <- 'contig'
+colnames(norDESEQ_counts) <- x
+write.table(norDESEQ_counts, 'norDESEQ_counts_norbylength.tsv', sep = "\t", row.names=FALSE)
+norDESEQ_counts <- read_delim('norDESEQ_counts_norbylength.tsv', delim = "\t")
+norDESEQ_counts <- norDESEQ_counts %>% mutate(lifestyle = case_when(str_detect(contig, "NODE") ~ 'lytic', str_detect(contig, "DAS") ~ 'temperate' ) )
+
+perc_active_proph <- aggregate(norDESEQ_counts[,2:25], by=list(Category=norDESEQ_counts$lifestyle), FUN=sum) #This sums the norDESEQ_counts for all the contigs based on being temperate or lytic
+perc_active_proph <- t(perc_active_proph)
+colnames(perc_active_proph) <- perc_active_proph[1,]
+perc_active_proph <- perc_active_proph[-1,]
+perc_active_proph <- as.data.frame(perc_active_proph, stringsAsFactors = FALSE)
+perc_active_proph$lytic <- as.numeric(perc_active_proph$lytic)
+perc_active_proph$temperate <- as.numeric(perc_active_proph$temperate)
+perc_active_proph$total_norDESEQ_counts <- perc_active_proph$lytic + perc_active_proph$temperate
+perc_active_proph$perc_lytic <- perc_active_proph$lytic / perc_active_proph$total_norDESEQ_counts
+perc_active_proph$perc_temp <- perc_active_proph$temperate / perc_active_proph$total_norDESEQ_counts
+
+#There are far more lytic contigs, so I also explored dividing by the number of contigs
+amnt_temp <- sum(norDESEQ_counts$lifestyle == "temperate")
+amnt_lytic <- sum(norDESEQ_counts$lifestyle == "lytic")
+
+perc_active_proph$lytic_averag <- perc_active_proph$lytic / amnt_lytic
+perc_active_proph$temp_averag <- perc_active_proph$temperate / amnt_temp
+
+perc_active_proph$day <- runinfo$ID[match(row.names(perc_active_proph), runinfo$SRR)]
+perc_active_proph$week <- runinfo$week[match(row.names(perc_active_proph), runinfo$SRR)]
+perc_active_proph$day_group <- runinfo$Time_days[match(row.names(perc_active_proph), runinfo$SRR)]
+perc_active_proph$day <- as.factor(runinfo$ID)
+
+perc_active_proph_graph <- ggplot(perc_active_proph, aes(x=day_group, y=perc_temp, color = perc_temp < 0.010)) + geom_point(size = 1, show.legend=FALSE) + theme(axis.text.x = element_text(size = 25), axis.text.y = element_text(size = 25))
+perc_active_proph_graph <- perc_active_proph_graph + ylab(label = 'Relative Abundance of Active Prophages') + xlab(label = 'Day') + theme(strip.text = element_text(size =35)) + theme(axis.title = element_text(size = 35)) +
+  geom_hline(yintercept=0.010, linetype='dashed', color='#F8766D', size = 0.5) + facet_grid(~ week, scales="free_x", space="free")
+
+#perc_active_proph_graph
+
+#### STATS ON PERCENT ACTIVE ####
+perc_active_stats <- as_tibble(perc_active_proph)
+perc_active_stats <- perc_active_stats %>% select(perc_temp, day_group, temperate) %>%
+  filter(day_group %in% c(0, 182, 851, 852, 853,879, 880, 881))
+perc_active_stats$day_group <- as.factor(perc_active_stats$day_group)
+perc_active_stats %>% group_by(day_group) %>% get_summary_stats(temperate, type = "common")
+perc_active_stats$id <- as.factor(rep(c(1,2), 8))
+friedman_test(perc_active_stats, temperate ~ day_group | id)
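
# --- Editor's sketch (not part of the original patch): the rstatix Friedman
# test used above, on a toy repeated-measures table. `toy_fr` is illustrative:
# two replicate ids, each measured at three time points.
library(rstatix)
toy_fr <- data.frame(id    = factor(rep(1:2, times = 3)),
                     day   = factor(rep(c("d0", "d1", "d2"), each = 2)),
                     value = c(0.01, 0.02, 0.05, 0.04, 0.03, 0.02))
friedman_test(toy_fr, value ~ day | id)  # nonparametric test across paired time points
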
+
+#Add day names on the x-axis
+#Change the x-axis labels to the day they were sampled
+#day_sampled <- c(runinfo$Time_days)
+#perc_active_proph_graphp_2 <- perc_active_proph_graph + scale_x_discrete(labels= day_sampled)
+perc_active_proph_graphp_2 <- perc_active_proph_graph +
+  facet_grid(~ week, scales="free_x", space="free_x", shrink = TRUE) +
+  theme(axis.text.x = element_text(size = 7), axis.text.y = element_text(size = 7), strip.text = element_text(size=7)) +
+  geom_point(size=0.5, position = position_dodge(0.05), show.legend=FALSE) +
+  theme(legend.text = element_text(size = 7)) +
+  theme(legend.title = element_text(size = 7)) +
+  theme(axis.text.x = element_text(size = 7, angle = 270), axis.title = element_text(size = 7)) +
+  theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank())
+
+#perc_active_proph_graphp_2
+qt <- ggplot_gtable(ggplot_build(perc_active_proph_graphp_2))
+qt$widths[5] = 12*qt$widths[5]
+grid.draw(qt)
+ggsave(file="perc_active_proph_graph7.svg", plot=qt, width=7, height=11, units = c("cm"), dpi = 600)
+
+#### DESEQ normalized read abundance Bray-Curtis ####
+dds <- DESeqDataSetFromMatrix(countData = cts, colData = coldata, design = ~ days)
+dds <- estimateSizeFactors(dds)
+
+norDESEQ_counts <- counts(dds, normalized=TRUE)
+norDESEQ_counts <- as.data.frame(norDESEQ_counts)
+norDESEQ_counts <- setDT(norDESEQ_counts, keep.rownames = 'contig')[]
+norDESEQ_counts <- norDESEQ_counts[-5891,]
+
+#Before also normalizing for contig length, make sure contigLength and norDESEQ_counts list the contigs in the same order
+all(contigLength$Contig %in% norDESEQ_counts$contig)
+all(contigLength$Contig == norDESEQ_counts$contig)
+
+#Normalize counts per contig length
+temp_dataframe <- norDESEQ_counts[,2:25]/contigLength$Length
+
+norDESEQ_counts <- cbind(norDESEQ_counts$contig, temp_dataframe)
+x <- colnames(norDESEQ_counts)
+x[1] <- 'contig'
+colnames(norDESEQ_counts) <- x
+
+#Get relative abundance
+norDESEQ_counts <- as.data.frame(norDESEQ_counts)
+for (x in 2:ncol(norDESEQ_counts)){
+  total_normalized_counts_per_sequence_run <- sum(norDESEQ_counts[,x])
+  norDESEQ_counts[,x] <- norDESEQ_counts[,x]/total_normalized_counts_per_sequence_run
+}
+
+
+norDESEQ_counts <- as.data.table(norDESEQ_counts)
+
+temperate_DESEQ <- norDESEQ_counts %>% mutate(lifestyle = case_when(str_detect(contig, "NODE") ~ 'lytic', str_detect(contig, "DAS") ~ 'temperate' ) )
+temperate_DESEQ <- temperate_DESEQ %>% filter(lifestyle == 'temperate') %>% select(!(lifestyle))
+
+
+#Bray-Curtis dissimilarity
+
+norDESEQ_counts_transformed <- as.data.frame(t(norDESEQ_counts))
+colnames(norDESEQ_counts_transformed) <- norDESEQ_counts_transformed[1,]
+norDESEQ_counts_transformed <- norDESEQ_counts_transformed[-1,]
+temp_row_names <- row.names(norDESEQ_counts_transformed)
+norDESEQ_counts_transformed <- sapply(norDESEQ_counts_transformed, as.numeric) #This turns the lifestyle row into NA
+row.names(norDESEQ_counts_transformed) <- runinfo$ID[match(temp_row_names, runinfo$SRR)]
+#norDESEQ_counts_transformed <- norDESEQ_counts_transformed[1:24,1:5] #drop the NA row
+beta_dist <- vegdist(norDESEQ_counts_transformed, method = "bray")
+beta_dist_clust <- hclust(beta_dist, method = "average")
+plot(beta_dist_clust)
+
+adonis(beta_dist ~ week + Time_days, data= runinfo)
+
+PCOA <- pcoa(beta_dist)
+biplot.pcoa(PCOA)
+beta_dist
+
+beta_dist_matrix <- as.matrix(beta_dist)
+beta_dist_matrix[lower.tri(beta_dist_matrix, diag = FALSE)] <- NA
+beta_dist_matrix[beta_dist_matrix==0.0000000] <- NA
+
+beta_dist_matrix <- as.data.frame(beta_dist_matrix)
+beta_dist_matrix <- cbind(row.names(beta_dist_matrix), beta_dist_matrix)
+beta_dist_matrix_long <- melt(as.data.table(beta_dist_matrix), id.vars = colnames(beta_dist_matrix)[1])
+colnames(beta_dist_matrix_long) <- c('time1', 'time2', 'beta_dis')
+beta_dist_matrix_long$time1 <- runinfo$Time_days[match(beta_dist_matrix_long$time1, runinfo$ID)]
+beta_dist_matrix_long$time2 <- runinfo$Time_days[match(beta_dist_matrix_long$time2, runinfo$ID)]
+beta_dist_matrix_long$srt_time_lag <- sqrt(abs((beta_dist_matrix_long$time1 - beta_dist_matrix_long$time2)))
+#### BETADIST FIGURE ####
+plot(beta_dist_matrix_long$srt_time_lag, beta_dist_matrix_long$beta_dis)
+linearmodel_beta_all <- lm(beta_dist_matrix_long$beta_dis ~ beta_dist_matrix_long$srt_time_lag)
+abline(a=0.1849, b=0.0245)
+summary(linearmodel_beta_all)
+resid(linearmodel_beta_all)
+
+beta_dist_plot <- ggplot(beta_dist_matrix_long, aes(x=srt_time_lag, y=beta_dis)) +
+  geom_point(size = 0.75) +
+  geom_abline(intercept=0.1849, slope=0.0245, linetype='dashed', color='#F8766D') +
+  ylab(label = 'Bray Curtis Dissimilarity') +
+  xlab(label = bquote(sqrt(Time~(days)))) +
+  theme(axis.text.x = element_text(size = 7), axis.text.y = element_text(size = 7), axis.title = element_text(size = 7 ))
+
+beta_dist_plot
+
+### REVIEWERS COMMENT ####
+#Transform BC into log(1-BC)
+beta_dist_matrix_long$z <- log(1-beta_dist_matrix_long$beta_dis)
+plot(beta_dist_matrix_long$srt_time_lag, beta_dist_matrix_long$z)
+reviewer_lm_beta_all <- lm(beta_dist_matrix_long$z ~ beta_dist_matrix_long$srt_time_lag)
+summary(reviewer_lm_beta_all)
+resid(reviewer_lm_beta_all)
+x <- beta_dist_matrix_long$srt_time_lag
+newfunc2 <- function(x,a){x*a}
+newFunc <- function(x,a,b){b-exp(a*x)}
+curve(newfunc2(x,-0.075), add=TRUE)
+linearmodel_beta_all <- lm(beta_dist_matrix_long$z ~ beta_dist_matrix_long$srt_time_lag)
+abline(a=-0.0649, b=-0.075)
+
+
+beta_dist_plot <- ggplot(beta_dist_matrix_long, aes(x=srt_time_lag, y=z)) +
+  geom_point(size = 0.75) +
+  geom_abline(intercept=-0.0649, slope=-0.075, linetype='dashed', color='#F8766D') +
+  ylab(label = 'log(1 - Bray Curtis Dissimilarity)') +
+  xlab(label = bquote(sqrt(Time~(days)))) +
+  theme(axis.text.x = element_text(size = 7), axis.text.y = element_text(size = 7), axis.title = element_text(size = 7 ))
+
+
+#Missing points are simply because I made a triangle matrix by setting one side to NAs; there is probably a better way
+ggsave(file="beta_dist_plot3_reviewers.png", plot=beta_dist_plot, width=5, height=6, units = c("cm"), dpi = 600, device = 'png')
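
# --- Editor's sketch (not part of the original patch): why the reviewers'
# log(1 - BC) transform is used above. If community similarity decays
# exponentially with time lag, log(1 - BC) is linear in the lag, so lm() can
# fit it. Toy values only.
lag <- 0:10
bc  <- 1 - exp(-0.1 * lag)   # pretend dissimilarities rising with lag
z   <- log(1 - bc)           # back to a straight line: z = -0.1 * lag
coef(lm(z ~ lag))            # slope recovers the decay rate (-0.1)
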
+
+#Look at prophages
+
+prophage_norDESEQ_counts_transformed <- norDESEQ_counts_transformed[,1:52]
+
+prophage_beta_dist <- vegdist(prophage_norDESEQ_counts_transformed, method = "bray")
+prophage_beta_dist_clust <- hclust(prophage_beta_dist, method = "average")
+plot(prophage_beta_dist_clust)
+adonis(prophage_beta_dist ~ week + Time_days, data= runinfo)
+prophage_PCOA <- pcoa(prophage_beta_dist)
+biplot.pcoa(prophage_PCOA)
+
+prophage_beta_dist_matrix <- as.matrix(prophage_beta_dist)
+prophage_beta_dist_matrix[lower.tri(prophage_beta_dist_matrix, diag = FALSE)] <- NA
+prophage_beta_dist_matrix[prophage_beta_dist_matrix==0.0000000] <- NA
+
+prophage_beta_dist_matrix <- as.data.frame(prophage_beta_dist_matrix)
+prophage_beta_dist_matrix <- cbind(row.names(prophage_beta_dist_matrix), prophage_beta_dist_matrix)
+prophage_beta_dist_matrix_long <- melt(as.data.table(prophage_beta_dist_matrix), id.vars = colnames(prophage_beta_dist_matrix)[1])
+colnames(prophage_beta_dist_matrix_long) <- c('time1', 'time2', 'beta_dis')
+prophage_beta_dist_matrix_long$time1 <- runinfo$Time_days[match(prophage_beta_dist_matrix_long$time1, runinfo$ID)]
+prophage_beta_dist_matrix_long$time2 <- runinfo$Time_days[match(prophage_beta_dist_matrix_long$time2, runinfo$ID)]
+prophage_beta_dist_matrix_long$srt_time_lag <- sqrt(abs((prophage_beta_dist_matrix_long$time1 - prophage_beta_dist_matrix_long$time2)))
+
+plot(prophage_beta_dist_matrix_long$srt_time_lag, prophage_beta_dist_matrix_long$beta_dis)
+linearmodel_beta_prophages <- lm(prophage_beta_dist_matrix_long$beta_dis ~ prophage_beta_dist_matrix_long$srt_time_lag)
+abline(0.584793, 0.003089)
+summary(linearmodel_beta_prophages)
+prophage_beta_dist_plot <- ggplot(prophage_beta_dist_matrix_long, aes(x=srt_time_lag, y=beta_dis)) + geom_point(size = 0.75) +
+  geom_abline(intercept=0.58479, slope=0.003089, linetype='dashed', color='#F8766D') +
+  ylab(label = 'Bray Curtis Dissimilarity') +
+  xlab(label = bquote(sqrt(Time~(days)))) +
+  theme(axis.text.x = element_text(size = 7), axis.text.y = element_text(size = 7), axis.title = element_text(size = 7 ))
+prophage_beta_dist_plot
+ggsave(file="prophage_beta_dist_plot3.svg", plot=prophage_beta_dist_plot, width=5, height=6, units = c("cm"), dpi = 600)
+
+#### REVIEWERS COMMENT ####
+
+#Transform BC into log(1-BC)
+prophage_beta_dist_matrix_long$z <- log(1-prophage_beta_dist_matrix_long$beta_dis)
+plot(prophage_beta_dist_matrix_long$srt_time_lag, prophage_beta_dist_matrix_long$z)
+reviewer_lm_prophage_beta_all <- lm(prophage_beta_dist_matrix_long$z ~ prophage_beta_dist_matrix_long$srt_time_lag)
+
+summary(reviewer_lm_prophage_beta_all)
+resid(reviewer_lm_prophage_beta_all)
+x <- prophage_beta_dist_matrix_long$srt_time_lag
+newfunc2 <- function(x,a){x*a}
+newFunc <- function(x,a,b){b-exp(a*x)}
+curve(newfunc2(x,-0.0075), add=TRUE)
+abline(a=-1.037166, b=-0.008292)
+
+beta_dist_plot <- ggplot(prophage_beta_dist_matrix_long, aes(x=srt_time_lag, y=z)) +
+  geom_point(size = 0.75) +
+  geom_abline(intercept=-1.037166, slope=-0.008292, linetype='dashed', color='#F8766D') +
+  ylab(label = 'log(1 - Bray Curtis Dissimilarity)') +
+  xlab(label = bquote(sqrt(Time~(days)))) +
+  theme(axis.text.x = element_text(size = 7), axis.text.y = element_text(size = 7), axis.title = element_text(size = 7 ))
+
+
+#Missing points are simply because I made a triangle matrix by setting one side to NAs; there is probably a better way
+ggsave(file="prophage_beta_dist_plot3_reviewers.png", plot=beta_dist_plot, width=5, height=6, units = c("cm"), dpi = 600, device = 'png')
+
+
+#Now I want to compare beta_dist_matrix_long (all phages) to prophage_beta_dist_matrix_long (prophages)
+comparing_beta_dist_matrix_long <- beta_dist_matrix_long
+names(comparing_beta_dist_matrix_long)[names(comparing_beta_dist_matrix_long) == 'beta_dis'] <- 'beta_dis_all'
+comparing_beta_dist_matrix_long$beta_dis_prophage <- prophage_beta_dist_matrix_long$beta_dis
+compared_model <- lm(beta_dis_all ~ srt_time_lag + beta_dis_prophage, data = comparing_beta_dist_matrix_long)
+summary(compared_model)
+anova(compared_model)
+
+linearHypothesis(compared_model, "beta_dis_all = beta_dis_prophage")
+
+#Next I want to see if the high beta-diversity over time is due to the larger number of contigs
+random_contigs <- floor(runif(59, min=0, max=5890))
+random_norDESEQ_counts_transformed <- norDESEQ_counts_transformed[,random_contigs]
+random_beta_dist <- vegdist(random_norDESEQ_counts_transformed, method = "bray")
+
+random_beta_dist_matrix <- as.matrix(random_beta_dist)
+random_beta_dist_matrix[lower.tri(random_beta_dist_matrix, diag = FALSE)] <- NA
+random_beta_dist_matrix[random_beta_dist_matrix==0.0000000] <- NA
+
+random_beta_dist_matrix <- as.data.frame(random_beta_dist_matrix)
+random_beta_dist_matrix <- cbind(row.names(random_beta_dist_matrix), random_beta_dist_matrix)
+random_beta_dist_matrix_long <- melt(random_beta_dist_matrix, id.vars = colnames(random_beta_dist_matrix)[1])
+
+colnames(random_beta_dist_matrix_long) <- c('time1', 'time2', 'beta_dis')
+random_beta_dist_matrix_long$time1 <- runinfo$Time_days[match(random_beta_dist_matrix_long$time1, runinfo$ID)]
+random_beta_dist_matrix_long$time2 <- runinfo$Time_days[match(random_beta_dist_matrix_long$time2, runinfo$ID)]
+random_beta_dist_matrix_long$srt_time_lag <- sqrt(abs((random_beta_dist_matrix_long$time1 - random_beta_dist_matrix_long$time2)))
+
+plot(random_beta_dist_matrix_long$srt_time_lag, random_beta_dist_matrix_long$beta_dis)
+linearmodel_beta_subsample <- lm(random_beta_dist_matrix_long$beta_dis ~ random_beta_dist_matrix_long$srt_time_lag)
+#abline(a=0.1849, b=0.0245)
+summary(linearmodel_beta_subsample)
+
+
+##### DESEQ taxonomic abundance #####
+norDESEQ_counts <- read.table('../norDESEQ_counts_norbylength.tsv', sep = "\t", header = TRUE)
+
+#I want relative abundance, so divide each column by its total
+
+for (column in 2:ncol(norDESEQ_counts)){
+  total_normalized_counts_per_sequence_run <- sum(norDESEQ_counts[,column])
+  norDESEQ_counts[,column] <- norDESEQ_counts[,column]/total_normalized_counts_per_sequence_run
+}
+
+
+norDESEQ_melt <- melt(as.data.table(norDESEQ_counts), id.vars = c("contig"))
+
+#Add in the Demovir taxonomy for phages that have it
+
+norDESEQ_melt$Order <- demovir$Order[match(norDESEQ_melt$contig, demovir$Sequence_ID)]
+norDESEQ_melt$Family <- demovir$Family[match(norDESEQ_melt$contig, demovir$Sequence_ID)]
+#Make NAs "Unknown"
+norDESEQ_melt$Order <- as.character(norDESEQ_melt$Order)
+norDESEQ_melt$Order[is.na(norDESEQ_melt$Order)] <- "Unknown"
+
+norDESEQ_melt$Family <- as.character(norDESEQ_melt$Family)
+norDESEQ_melt$Family[is.na(norDESEQ_melt$Family)] <- "Unknown"
+
+#I still prefer to have samples ordered by sequence run ID
+
+norDESEQ_melt$day <- runinfo$ID[match(norDESEQ_melt$variable, runinfo$SRR)]
+norDESEQ_melt$week <- runinfo$week[match(norDESEQ_melt$variable, runinfo$SRR)]
+
+norDESEQ_melt$Family <- as.factor(norDESEQ_melt$Family)
+#Making a side-step so that I can later look at the number of unclassified phages
+contig_with_taxonomy <- norDESEQ_melt
+
+norDESEQ_family_melt <- norDESEQ_melt %>% group_by(day, Family, week) %>% summarise(value = sum(value))
+
+family_relaytive_abund <- ggplot(norDESEQ_family_melt, aes(fill=Family, y=value, x=day)) +
+  geom_bar(position = "stack", stat="identity", colour=NA) +
+  theme(axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust=1))
+family_relaytive_abund <- family_relaytive_abund +
+  facet_wrap(~ week, scales="free_x") +
+  ylab(label = 'Relative Abundance by Family') +
+  xlab(label = 'Day') +
+  theme(strip.text = element_text(size =20)) +
+  theme(axis.title = element_text(size = 30))
+
+family_relaytive_abund
+ggsave(file="family_relaytive_abund.svg", plot=family_relaytive_abund, width=10, height=10)
+
+
+norDESEQ_melt %>% filter(Family == "Unknown") -> unknown_phages
+norDESEQ_melt %>% filter(Family == "Unknown") %>% filter(value >= 1e-1) -> abundant_unknown_phages
+
+#Make a graph for %Known vs %Unknown
+w <- norDESEQ_melt
+w$Family <- as.character(w$Family)
+w$Family[w$Family != 'Unknown'] <- "Known"
+#Another side table to see how many phages are known vs unknown
+contig_with_taxonomy2 <- as_tibble(w)
+
+w <- as_tibble(w %>% group_by(variable, Family) %>% summarise((sum(value))))
+w$day <- runinfo$ID[match(w$variable, runinfo$SRR)]
+w$week <- runinfo$week[match(w$variable, runinfo$SRR)]
+names(w)[names(w) == colnames(w[3])] <- "value"
+
+perc_known <- ggplot(w, aes(fill=Family, y=value, x=day)) + geom_bar(position = "stack", stat="identity") #+ theme(axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust=1))
+perc_known <- perc_known +
+  #facet_wrap(~ week, scales="free_x") +
+  ylab(label = 'Relative Abundance') + xlab(label = 'Day') +
+  theme(strip.text = element_text(size =20)) + theme(axis.title = element_text(size = 30))
+
+#Remove the default ggplot background
+perc_known <- perc_known + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
+                                              panel.grid.minor = element_blank())
+
+
+#Relabel the x-axis with the days sampled, and reformat the look
+day_sampled <- c(runinfo$Time_days)
+perc_known2 <- perc_known + scale_x_discrete(labels= day_sampled) + xlab(label = 'Day') + theme(axis.text.x = element_text(size = 25), axis.text.y = element_text(size = 25)) + theme(strip.text = element_text(size =35)) + theme(axis.title = element_text(size = 35))
+perc_known2
+ggsave(file="perc_known2.svg", plot=perc_known2, width=15, height=10)
+
+#I will make a version that shows a line per contig
+contig_with_taxonomy2$day <- runinfo$ID[match(contig_with_taxonomy2$variable, runinfo$SRR)]
+contig_with_taxonomy2$week <- runinfo$week[match(contig_with_taxonomy2$variable, runinfo$SRR)]
+
+perc_known3 <- ggplot(contig_with_taxonomy2, aes(fill=Family, y=value, x=day)) + geom_bar(position = "stack", stat="identity") #+ theme(axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust=1))
+perc_known3 <- perc_known3 +
+  #facet_wrap(~ week, scales="free_x") +
+  ylab(label = 'Relative Abundance') + xlab(label = 'Day') +
+  theme(strip.text = element_text(size =20)) +
+  theme(axis.title = element_text(size = 30))
+
+#Remove the default ggplot background
+perc_known3 <- perc_known3 + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
+                                                panel.grid.minor = element_blank())
+
+perc_known3
+#Relabel the x-axis with the days sampled, and reformat the look
+day_sampled <- c(runinfo$Time_days)
+perc_known4 <- perc_known3 +
+  scale_x_discrete(labels= day_sampled) +
+  xlab(label = 'Day') +
+  theme(axis.text.x = element_text(size = 7, angle = 270), axis.text.y = element_text(size = 7)) +
+  theme(strip.text = element_text(size =7)) + theme(axis.title = element_text(size = 7)) +
+  theme(legend.key.size = unit(0.25, 'cm'), #change legend key size
+        legend.key.height = unit(0.25, 'cm'), #change legend key height
+        legend.key.width = unit(0.25, 'cm'), #change legend key width
+        legend.title = element_text(size=7), #change legend title font size
+        legend.text = element_text(size=7)) #change legend text font size
+perc_known4
+ggsave(file="perc_known_w_lines2.svg", plot=perc_known4, width=11, height=11, units = c("cm"), dpi = 600) + + + +#I will now try run the analysis without the Unknown Family/Order category to remove non-demovir hits bias in week 2 / week 3 + +norDESEQ_melt_known_phages <- norDESEQ_melt %>% filter(Family != "Unknown") +known_phages <- unique(norDESEQ_melt_known_phages$contig) +known_norDESEQ <- norDESEQ_counts[match(known_phages, norDESEQ_counts$contig), ] + + +for (column in 2:ncol(known_norDESEQ)){ + total_normalized_counts_per_sequence_run <- sum(known_norDESEQ[,column]) + known_norDESEQ[,column] <- known_norDESEQ[,column]/total_normalized_counts_per_sequence_run +} + +known_norDESEQ_melt <- melt(as.data.table(known_norDESEQ), id.vars = c("contig")) + +#Add in the Demovir taxonomy for phages that have it + +known_norDESEQ_melt$Order <- demovir$Order[match(known_norDESEQ_melt$contig, demovir$Sequence_ID)] +known_norDESEQ_melt$Family <- demovir$Family[match(known_norDESEQ_melt$contig, demovir$Sequence_ID)] +#Make NAs "Unknown" +known_norDESEQ_melt$Order <- as.character(known_norDESEQ_melt$Order) +known_norDESEQ_melt$Order[is.na(known_norDESEQ_melt$Order)] <- "Unknown" + +known_norDESEQ_melt$Family <- as.character(known_norDESEQ_melt$Family) +known_norDESEQ_melt$Family[is.na(known_norDESEQ_melt$Family)] <- "Unknown" + +#### Reviewers Comment #### + +known_norDESEQ_melt <- known_norDESEQ_melt %>% filter(Family == "Myoviridae" | Family == "Siphoviridae" | Family == "Unassigned" | Family == "Podoviridae" | Family =="Microviridae") +Family_Colours <- c("Myoviridae" = "#EEA236", "Siphoviridae" = "#46B8DA", "Unassigned" = "#B8B8B8" , "Podoviridae" = "#357EBD", Microviridae = "#5CB85C") +#I still prefer to have samples order by sequence run ID + +known_norDESEQ_melt$day <- runinfo$ID[match(known_norDESEQ_melt$variable, runinfo$SRR)] +known_norDESEQ_melt$week <- runinfo$week[match(known_norDESEQ_melt$variable, runinfo$SRR)] + + +#Relative abundance by family is fragmented into bars for each contig. 
+
+#### Reviewers Comment ####
+
+known_norDESEQ_melt <- known_norDESEQ_melt %>% filter(Family == "Myoviridae" | Family == "Siphoviridae" | Family == "Unassigned" | Family == "Podoviridae" | Family =="Microviridae")
+Family_Colours <- c("Myoviridae" = "#EEA236", "Siphoviridae" = "#46B8DA", "Unassigned" = "#B8B8B8" , "Podoviridae" = "#357EBD", Microviridae = "#5CB85C")
+#I still prefer to have samples ordered by sequence run ID
+
+known_norDESEQ_melt$day <- runinfo$ID[match(known_norDESEQ_melt$variable, runinfo$SRR)]
+known_norDESEQ_melt$week <- runinfo$week[match(known_norDESEQ_melt$variable, runinfo$SRR)]
+
+
+#Relative abundance by family is fragmented into bars for each contig, so sum them by family
+known_norDESEQ_melt2 <- as_tibble(known_norDESEQ_melt %>% group_by(variable, Family) %>% summarise((sum(value))))
+known_norDESEQ_melt2$day <- runinfo$ID[match(known_norDESEQ_melt2$variable, runinfo$SRR)]
+known_norDESEQ_melt2$week <- runinfo$week[match(known_norDESEQ_melt2$variable, runinfo$SRR)]
+known_norDESEQ_melt2$day_group <- runinfo$Time_days[match(known_norDESEQ_melt2$variable, runinfo$SRR)]
+names(known_norDESEQ_melt2)[names(known_norDESEQ_melt2) == colnames(known_norDESEQ_melt2[3])] <- "value"
+known_norDESEQ_melt2$day <- factor(known_norDESEQ_melt2$day, levels= runinfo$ID)
+
+
+day_sampled <- c(runinfo$Time_days)
+ID_sampled <- c(runinfo$ID)
+knownfamily_relative_abund <- ggplot(known_norDESEQ_melt2, aes(fill=Family, y=value, x=day)) +
+  geom_bar(position = "stack", stat="identity", show.legend=TRUE) + scale_x_discrete(breaks = ID_sampled, labels = day_sampled) + scale_fill_manual(values = Family_Colours) #Add in when generating the no-label figure
+knownfamily_relative_abund <- knownfamily_relative_abund +
+  #facet_wrap(~ week, scales="free_x") +
+  ylab(label = 'Relative Abundance by Family') +
+  xlab(label = 'Day') + theme(strip.text = element_text(size =20)) +
+  theme(axis.title = element_text(size = 30))
+
+#Remove the default ggplot background
+knownfamily_relative_abund <- knownfamily_relative_abund +
+  theme_bw() +
+  theme(panel.border = element_blank(),
+        panel.grid.major = element_blank(),
+        panel.grid.minor = element_blank())
+#Relabel the x-axis with the days sampled, and reformat the look
+
+knownfamily_relative_abund2 <- knownfamily_relative_abund +
+  xlab(label = 'Day') +
+  theme(axis.text.x = element_text(size = 25), axis.text.y = element_text(size = 25)) +
+  theme(strip.text = element_text(size =35)) +
+  theme(axis.title = element_text(size = 35))
+knownfamily_relative_abund2 <- knownfamily_relative_abund2 +
+  facet_grid(~ week, scales="free_x", space="free_x", shrink = TRUE) +
+  theme(axis.text.x = element_text(size = 7, angle = 270), axis.text.y = element_text(size = 7), strip.text = element_text(size=7)) +
+  theme(legend.title = element_text(size = 7)) +
+  theme(axis.text.x = element_text(size = 7), axis.title = element_text(size = 7)) +
+  theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
+  theme(legend.text = element_text(size = 7)) +
+  theme(legend.key.size = unit(0.25, 'cm'), #change legend key size
+        legend.key.height = unit(0.25, 'cm'), #change legend key height
+        legend.key.width = unit(0.25, 'cm'), #change legend key width
+        legend.title = element_text(size=7), #change legend title font size
+        legend.text = element_text(size=7)) #change legend text font size
+knownfamily_relative_abund2
+ggsave(file="knownfamily_relative_abund5.svg", plot=knownfamily_relative_abund2, width=17, height=13, units = c("cm"), dpi = 600)
+ggsave(file="knownfamily_relative_abund_nolegend5.svg", plot=knownfamily_relative_abund2, width=8, height=10, units = c("cm"), dpi = 600)
+
+#Make a figure for known vs unknown
+norDESEQ_family_melt
+
+# # Alejandro's request: a figure of how many contigs belong to known vs unknown, to see if it is one or two highly abundant ones
+#
+# contig_with_taxonomy2 <- as_tibble(w)
+# counts_per_category <- contig_with_taxonomy2 %>% group_by(variable, Family) %>% summarise(contig_number = n())
+#
+# w <- as_tibble(w %>% group_by(variable, Family) %>% summarise((sum(value))))
+# w$day <- runinfo$ID[match(w$variable, runinfo$SRR)]
+# w$week <- runinfo$week[match(w$variable, runinfo$SRR)]
+# names(w)[names(w) == colnames(w[3])] <- "value"
+#
+# perc_known <- ggplot(w, aes(fill=Family, y=value, x=day)) + geom_bar(position = "stack", stat="identity") #+ theme(axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust=1))
+# perc_known <- perc_known + facet_wrap(~ week, scales="free_x") + ylab(label = 'Relative Abundance') + xlab(label = 'Day') + theme(strip.text = element_text(size =20)) + theme(axis.title = element_text(size = 30))
+#
+# #Remove the default ggplot background
+# perc_known <- perc_known + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
+#                                               panel.grid.minor = element_blank())
+# #Relabel the x-axis with the days sampled, and reformat the look
+# day_sampled <- c(runinfo$Time_days)
+# perc_known2 <- perc_known + scale_x_discrete(labels= day_sampled) + xlab(label = 'Day') + theme(axis.text.x = element_text(size = 25), axis.text.y = element_text(size = 25)) + theme(strip.text = element_text(size =35)) + theme(axis.title = element_text(size = 35))
diff --git a/checkV_coverage_script.R b/checkV_coverage_script.R
new file mode 100644
index 0000000..a8e721c
--- /dev/null
+++ b/checkV_coverage_script.R
@@ -0,0 +1,80 @@
+#Open packages
+library(dplyr) #Version ‘1.0.9’
+library(ggplot2) #Version ‘3.3.6’
+library(tidyr) #Version ‘1.2.0’
+library(stringr) #Version ‘1.4.0’
+library(reshape2) #Version ‘1.4.4’
+
+
+setwd("/Users/Sutcliffe/Library/CloudStorage/OneDrive-McGillUniversity/PhD_OneDrive/Publication_3_Viral_Metagenomics_Collab/Writing/12_draft_revisions/Scripts")
+
+#Load in the data!
+
+SRR <- read.csv("Data_For_Scripts/sequence_runs.csv", header=TRUE) #File has the sequence run meta-data
+runinfo <- read.csv("Data_For_Scripts/sequence_runs.csv", header = TRUE)
+bacterial_bin <- read.csv(file = 'Data_For_Scripts/GTDB-Tk_DAS_check_bin_modified.csv', header = FALSE)
+bacphlip <- read.delim('Data_For_Scripts/CheckV_Coverage/checkv_contigs.fasta.bacphlip', header = TRUE, sep = '\t')
+setwd("Data_For_Scripts/CheckV_Coverage/")
+temp <- list.files(pattern="*_RPKM_coverage") #Make a temp variable for each of the coverage files per sequence run. The all-viral-reads file was not included unless I need it.
+myfiles <- lapply(temp, function(x) read.delim(x, header = TRUE)) #Opens up each file
+coverage_data <- do.call(rbind, myfiles) #Binds all the files together in one dataframe
+
+##### DESEQ normalized reads ####
+
+#I will run the analysis on DESEQ normalized reads
+#Load in the normalized reads
+DESEQ_normalized_abundance <- read.csv("../../Data_For_Scripts/norDESEQ_counts_norbylength.tsv", sep = "\t", header=TRUE) #DESEQ normalized read counts
+
+#Then select the 462 contigs we have a Bacphlip lifestyle prediction for
+bacphlip_contigs <- match(DESEQ_normalized_abundance$contig, bacphlip$X)
+bacphlip_contigs <- bacphlip_contigs[!is.na(bacphlip_contigs)]
+bacphlip_norm_abun <- DESEQ_normalized_abundance[bacphlip_contigs,]
+
+#Now add the 48 prophages not in that list, as we know they are temperate too
+bacphlip_norm_abun <- rbind(DESEQ_normalized_abundance[1:52,], bacphlip_norm_abun)
+#Now we have some duplicates, because I added the 52 prophages but four were already there
+bacphlip_norm_abun <- bacphlip_norm_abun[!duplicated(bacphlip_norm_abun$contig),]
+
+#Now add a column for lifestyle
+
+#Use the bacphlip data to determine the lifestyle
+temperate_phages <- bacphlip$X[(which(bacphlip$Temperate > 0.5))]
+bacphlip_norm_abun <- bacphlip_norm_abun %>% mutate(lifestyle = case_when((!(contig %in% temperate_phages) & !(str_detect(contig, "DAS"))) ~ 'lytic', ((contig %in% temperate_phages) | ( str_detect(contig, "DAS"))) ~ 'temperate' ) )
+row.names(bacphlip_norm_abun) <- bacphlip_norm_abun[,1]
+bacphlip_norm_abun <- bacphlip_norm_abun[,-1]
+
+temperate_contigs3 <- bacphlip_norm_abun %>% filter(lifestyle == "temperate")
+lytic_contigs3 <- bacphlip_norm_abun %>% filter(lifestyle == "lytic")
+
+relative_by_lifstyl <- c()
+for (column in 1:24){
+  temperate <- sum(temperate_contigs3[,column])/sum(bacphlip_norm_abun[,column])
+  lytic <- sum(lytic_contigs3[,column])/sum(bacphlip_norm_abun[,column])
+  relative_by_lifstyl <- rbind(relative_by_lifstyl, c(temperate, lytic))
+}
+
+relative_by_lifstyl <- as.data.frame(relative_by_lifstyl)
+relative_by_lifstyl <- cbind(colnames(bacphlip_norm_abun)[1:24], relative_by_lifstyl)
+colnames(relative_by_lifstyl) <- c('SRR','temperate', 'lytic')
+
+relative_by_lifstyl$ID <- runinfo$ID[match(relative_by_lifstyl$SRR,runinfo$SRR)]
+relative_by_lifstyl_melt <- reshape2::melt(relative_by_lifstyl, id.vars=c("SRR", 'ID'))
+
+DESEQ_lifestyle_abund <- ggplot(relative_by_lifstyl_melt, aes(fill=variable, y=value, x=ID)) + geom_bar(position="stack", stat="identity") + theme(axis.text.x = element_text(angle = 90)) + labs( y = 'Normalized Relative Abundance (%)')
+DESEQ_lifestyle_abund <- DESEQ_lifestyle_abund + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
+                                                                    panel.grid.minor = element_blank())
+
+#Change the day titles on the x-axis and reformat to make them easier to read
+day_sampled <- c(runinfo$Time_days)
+DESEQ_lifestyle_abund3 <- DESEQ_lifestyle_abund + scale_x_discrete(labels= day_sampled) + xlab(label = 'Day') + theme(axis.text.x = element_text(size = 25), axis.text.y = element_text(size = 25)) + theme(strip.text = element_text(size =35)) + theme(axis.title = element_text(size = 35)) + scale_fill_discrete(name = "")
+DESEQ_lifestyle_abund3
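
# --- Editor's sketch (not part of the original patch): the BACPHLIP-based
# lifestyle rule used above, as a small standalone example. `toy_bp` mimics a
# BACPHLIP output table (contig id plus a Temperate probability).
toy_bp <- data.frame(X = c("NODE_1", "NODE_2", "DAS_Check_Bin_01_p1"),
                     Temperate = c(0.10, 0.93, 0.80))
toy_bp$lifestyle <- ifelse(toy_bp$Temperate > 0.5 | grepl("DAS", toy_bp$X),
                           "temperate", "lytic")  # prophages (DAS...) are temperate by definition
toy_bp
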
+
+
+##### STATS #####
+#Note: couldn't get the stats to work in R, so I finished it in PRISM
+relative_by_lifstyl_melt <- as_tibble(relative_by_lifstyl_melt)
+relative_by_lifstyl_melt$day <- runinfo$ID[match(relative_by_lifstyl_melt$SRR, runinfo$SRR)]
+relative_by_lifstyl_melt$week <- runinfo$week[match(relative_by_lifstyl_melt$SRR, runinfo$SRR)]
+relative_by_lifstyl_melt$day_group <- runinfo$Time_days[match(relative_by_lifstyl_melt$SRR, runinfo$SRR)]
+stats <- relative_by_lifstyl_melt %>% select(value, day_group, variable) %>% filter(day_group %in% c(0, 182, 851, 852, 853,879, 880, 881))
+stats <- filter(stats, variable == 'temperate') #filter() needs the data as its first argument
diff --git a/prophage_coverage_script.R b/prophage_coverage_script.R
new file mode 100644
index 0000000..74640df
--- /dev/null
+++ b/prophage_coverage_script.R
@@ -0,0 +1,277 @@
+#Open packages
+library(dplyr) #Version ‘1.0.9’
+library(ggplot2) #Version ‘3.3.6’
+library(tidyr) #Version ‘1.2.0’
+library(stringr) #Version ‘1.4.0’
+library(reshape2) #Version ‘1.4.4’
+library(cartography) #Version ‘3.0.1’
+library(grid) #Version ‘4.2.1’
+library(ggrepel) #Version ‘0.9.1’
+
+#Each sequence run's information
+setwd('Data_For_Scripts/Prophage_Coverage/')
+SRR <- read.csv("../sequence_runs.csv", header=TRUE)
+contig_bed <- read.delim("merged_phage_contigs.bed", header=FALSE, blank.lines.skip = TRUE)
+contig_bed <- contig_bed[,1:3]
+colnames(contig_bed) <- c('contig', 'start', 'length')
+
+setwd('samtools_coverage_coverage')
+temp <- list.files(pattern="*_samtool_coverage") #Make a temp variable for each of the coverage files
+myfiles <- lapply(temp, function(x) read.delim(x, header = TRUE)) #Opens up each file
+coverage_data <- do.call(rbind, myfiles) #Binds all the files together in one dataframe
+setwd('..')
+runinfo <- read.csv("../sequence_runs.csv", header = TRUE)
+bacterial_bin <- read.csv(file = '../GTDB-Tk_DAS_check_bin_modified.csv', header = FALSE)
+DESEQ_size_factors <- read.csv("../DESEQ_sizefactors.csv", header = TRUE)
+#An important file: this matches each prophage contig name to a prophage ID based on the bacterial bin
+prophage_host_match <- read.csv(file = "../prophage_host_naming.csv", header = TRUE)
+
+#I need to make a column with the SRR it is from
+SRR_names <- substring(temp,1,9) #Make a vector of the SRR runs by name, in the order of the files in temp
+#E.g., replicate(5, SRR_names[1]) would replicate the first sequence run 5 times, but I need it 5890 times
+SRR_columns <- c()
+for (i in 1:24)
+{
+  SRR_columns <- c(SRR_columns,(replicate(5890,SRR_names[i])))
+}
+
+runinfo$permillionreads <- runinfo$totalNumReads/1000000
+
+#Now I can add the SRR run into my coverage data
+coverage_data <- cbind(coverage_data, SRR_columns)
+
+#Change the name of the contig column
+x <- colnames(coverage_data)
+x[1] <- "contig"
+colnames(coverage_data) <- x
+
+#Make a weak cut-off for presence (i.e., any reads mapping to the contig)
+coverage_data$minot_present <- (coverage_data$coverage > 0)
+
+#Make a column for prophage or not
+coverage_data <- coverage_data %>% mutate(lifestyle = case_when(str_detect(contig, "NODE") ~ 'lytic', str_detect(contig, "DAS") ~ 'temperate' ) )
+
+coverage_data$present <- (coverage_data$coverage >= 75) #For the first results I used 25, because I thought the value was the percentage of uncovered bases, not the other way around
+
+present_contigs <- coverage_data %>% filter(present == TRUE)
+present_contigs <- unique(present_contigs$contig)
+length(present_contigs)
+
+#Check prophages
+
+present_temperate <- coverage_data %>% filter(present == TRUE)
+present_temperate <- present_temperate %>% filter(lifestyle == 'temperate')
+length(unique(present_temperate$contig))
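
# --- Editor's sketch (not part of the original patch): the presence call above
# keeps a contig when its breadth of coverage is at least 75%. A toy version;
# `toy_cov` mimics a samtools coverage table (percent of bases covered).
toy_cov <- data.frame(contig   = c("NODE_1", "DAS_1", "NODE_2"),
                      coverage = c(99.2, 80.1, 12.5))
toy_cov$present <- toy_cov$coverage >= 75   # breadth-of-coverage presence cut-off
subset(toy_cov, present)$contig             # contigs called present
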
+
+#I want to get some metrics, like how many prophages were found over multiple weeks, at all time points, etc.
+
+present_temperate_sorted <- present_temperate[ , c("contig","SRR_columns")]
+present_temperate_sorted$week <- runinfo$week[match(present_temperate_sorted$SRR_columns, runinfo$SRR)]
+present_temperate_sorted$day <- runinfo$Time_days[match(present_temperate_sorted$SRR_columns, runinfo$SRR)]
+
+active_prophage_list <- unique(present_temperate_sorted$contig)
+metrics_present_temperate <- c()
+
+for (i in active_prophage_list){
+  temp_matrix <- present_temperate_sorted %>% filter(contig == i)
+  weeks <- length(unique(temp_matrix$week))
+  days <- length(unique(temp_matrix$day))
+  temp_variable <- c(i, weeks, days)
+  metrics_present_temperate <- rbind(metrics_present_temperate, temp_variable)
+}
+colnames(metrics_present_temperate) <- c('contig', 'Number_of_Weeks', 'Number_of_Days')
+metrics_present_temperate <- as.data.frame(metrics_present_temperate)
+metrics_present_temperate$Number_of_Weeks <- as.numeric(metrics_present_temperate$Number_of_Weeks)
+metrics_present_temperate$Number_of_Days <- as.numeric(metrics_present_temperate$Number_of_Days)
+summary(metrics_present_temperate)
+
+#Let's see how the non-prophages compare
+
+#Stopping point: output the 75% coverage table (reload it to resume from here)
+write.table(coverage_data, 'prophage_coverage_data.tsv', sep = "\t")
+coverage_data <- read.table('prophage_coverage_data.tsv', sep = "\t")
+
+#Replace the SRR with days
+coverage_data$days <- SRR$ID[(match(coverage_data$SRR_columns, SRR$SRR))]
+
+
+#To compare the coverages of each prophage between dates, to see if prophage induction has occurred
+
+prophage_induction <- present_prophages[,c('contig','meandepth','SRR_columns')]
+prophage_induction <- spread(prophage_induction, contig, meandepth)
+
+##### DESEQ SIZE FACTOR ####
+#Note: run at least the first 201 lines to get here
+
+runinfo$size_factors <- DESEQ_size_factors$sizeFactors[(match(runinfo$SRR, DESEQ_size_factors$SRR))]
+
+normalized_coverage2 <- c()
+for (row in 1:nrow(prophage_induction))
+{
+  x <- (prophage_induction[row,-1]/runinfo$size_factors[row])
+  normalized_coverage2 <- rbind(normalized_coverage2, x)
+}
+
+normalized_coverage2 <- cbind(runinfo$Time_days, runinfo$week, runinfo$ID, normalized_coverage2)
+x <- colnames(normalized_coverage2)
+x[1] <- 'day'
+x[2] <- 'week'
+x[3] <- 'day-replicate'
+colnames(normalized_coverage2) <- x
+normalized_coverage2$`day-replicate` <- factor(normalized_coverage2$`day-replicate`)
+normalized_coverage2$sample_num <- c(1:24)
+normalized_coverage2_melted <- melt(normalized_coverage2, id.vars = c('day', 'week', 'day-replicate', 'sample_num'))
+
+normalized_coverage3_melted <- normalized_coverage2_melted
+#normalized_coverage3_melted$variable <- prophage_host_match$prophage_names[match(normalized_coverage3_melted$variable, prophage_host_match$prophage_ID)]
+normalized_coverage3_melted$`day-replicate` <- as.factor(runinfo$ID)
+
+#Add the bacterial names of the prophages
+normalized_coverage3_melted$phage_name <- prophage_host_match$prophage_names[match(normalized_coverage3_melted$variable, prophage_host_match$prophage_ID)]
+
+#I will try to label all those above 100x coverage
+p <- ggplot(normalized_coverage3_melted, aes(x=day, y=value, col=variable)) + geom_point(size=1, position = position_dodge(0.7)) + theme(axis.text.x = element_text(size = 20), axis.text.y = element_text(size = 20)) + geom_text(aes(label=ifelse(value>100, as.character(variable), '')), hjust=0, vjust=0, size=4, show.legend=FALSE)
+p <- p + facet_grid(~ week, scales="free_x", space="free") + ylab(label = 'Normalized Coverage') + theme(strip.text = element_text(size = 20)) + theme(axis.title = element_text(size = 30)) + theme(legend.text = element_text(face = "italic"))
+p
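+
+#Sketch (illustration only) of how DESeq-style size factors normalize for sequencing depth:
+#each sample's factor is the median ratio of its counts to a per-feature geometric-mean
+#reference (the actual factors here come from the DESEQ_sizefactors.csv produced elsewhere).
+toy_counts <- matrix(c(10, 20, 40, 30, 60, 120), nrow = 3) #3 features x 2 samples; sample 2 has 3x depth
+geo_mean <- exp(rowMeans(log(toy_counts))) #geometric mean of each feature across samples
+toy_size_factors <- apply(toy_counts/geo_mean, 2, median) #1/sqrt(3) ~ 0.58 and sqrt(3) ~ 1.73
+sweep(toy_counts, 2, toy_size_factors, '/') #after division, the two columns agree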
+
+#Now I will try z-scores to flag significant coverage
+nozero_coverage <- subset(normalized_coverage2_melted, value > 0)
+nozero_coverage$logCov <- log(nozero_coverage$value)
+nozero_coverage$zscore <- scale(nozero_coverage$logCov)
+#nozero_coverage$variable <- prophage_host_match$prophage_names[match(nozero_coverage$variable, prophage_host_match$prophage_ID)]
+one_hundred_coverage <- subset(nozero_coverage, value > 100)
+significant_coverage <- subset(nozero_coverage, zscore > 1.96)
+
+normalized_coverage_w_zscore <- left_join(normalized_coverage3_melted, nozero_coverage[,c(1,2,3,4,5,7,8)], by = c('day', "week", "day-replicate", "sample_num", "variable"))
+q <- ggplot(normalized_coverage_w_zscore, aes(x=day, y=value, col=variable)) +
+  geom_point(size=1, position = position_dodge(0.1), show.legend=FALSE) +
+  theme(axis.text.x = element_text(size = 7), axis.text.y = element_text(size = 7)) +
+  geom_text(aes(label=ifelse(zscore>1.95, as.character(variable), '')), hjust=0, vjust=0, size=2, show.legend=FALSE)
+q <- q + facet_grid(~ week, scales="free_x", space="free_x", shrink = TRUE) + ylab(label = 'Normalized Coverage') + theme(strip.text = element_text(size = 7)) + theme(axis.title = element_text(size = 7)) + theme(legend.text = element_text(face = "italic"))
+q <- q + theme(legend.text = element_text(size = 7)) + theme(legend.title = element_text(size = 7)) + theme(axis.text.x = element_text(size = 7), axis.title = element_text(size = 7)) + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank())
+q
+qt <- ggplot_gtable(ggplot_build(q))
+qt$widths[5] = 12*qt$widths[5]
+grid.draw(qt)
+
+ggsave(file="prophage_coverage_DESEQ_norm6.svg", plot=q, width=30, height=13, units = c("cm"), dpi = 600)
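+
+#Sketch (illustration only) of the z-score threshold used above: on log-transformed coverage,
+#a z-score above 1.96 sits in the upper ~2.5% of a standard normal distribution.
+toy_cov <- c(2, 3, 4, 5, 3, 4, 400) #one sample with a large coverage spike
+toy_z <- scale(log(toy_cov)) #centre and scale the log coverage
+toy_cov[toy_z > 1.96] #only the 400x spike is flagged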
+
+#Note: I couldn't figure out how to make day 0, which has just one time point, render like the other
+#weeks with facet_grid or ggplot_gtable. So, after trying different things, I am making a fake dataset
+#in which week 'day0' also has days 1, 2, 3, 4; I can later remove those in Inkscape so the panels look the same.
+
+day1 <- filter(normalized_coverage_w_zscore, week == 'day0')
+day1$day <- 1
+
+day2 <- filter(normalized_coverage_w_zscore, week == 'day0')
+day2$day <- 2
+
+day3 <- filter(normalized_coverage_w_zscore, week == 'day0')
+day3$day <- 3
+
+day4 <- filter(normalized_coverage_w_zscore, week == 'day0')
+day4$day <- 4
+
+normalized_coverage_w_zscore2 <- rbind(normalized_coverage_w_zscore, day1, day2, day3, day4)
+
+qt <- ggplot(normalized_coverage_w_zscore2, aes(x=day, y=value, col=variable)) +
+  geom_point(size=1, position = position_dodge(0.1), show.legend=FALSE) +
+  theme(axis.text.x = element_text(size = 7, angle = 270), axis.text.y = element_text(size = 7)) +
+  geom_text_repel(aes(label=ifelse(zscore>1.95, as.character(variable), '')), hjust=0, vjust=0, size=2, show.legend=FALSE)
+qt <- qt + facet_grid(~ week, scales="free_x", space="free_x", shrink = TRUE) + ylab(label = 'Normalized Coverage') + theme(strip.text = element_text(size = 7)) + theme(axis.title = element_text(size = 7)) + theme(legend.text = element_text(face = "italic"))
+qt <- qt + theme(legend.text = element_text(size = 7)) + theme(legend.title = element_text(size = 7)) + theme(axis.text.x = element_text(size = 7), axis.title = element_text(size = 7)) + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank())
+qt
+ggsave(file="prophage_coverage_DESEQ_norm7.svg", plot=qt, width=8, height=11, units = c("cm"), dpi = 600)
+
+#Make a heatmap with normalized coverage
+normalized_coverage4 <- c()
+for (row in 1:nrow(present_prophages)){
+  if (present_prophages$coverage[row] < 75){
+    coverage = 0
+  } else {
+    x <- match(present_prophages$SRR_columns[row], runinfo$SRR)
+    coverage = present_prophages$meandepth[row]/runinfo$size_factors[x]
+  }
+  y <- c(present_prophages$contig[row], as.character(present_prophages$day[row]), as.numeric(coverage))
+  normalized_coverage4 <- rbind(normalized_coverage4, y)
+}
+
+normalized_coverage4 <- as.data.frame(normalized_coverage4)
+colnames(normalized_coverage4) <- c("contig", "day", "coverage")
+normalized_coverage4$day <- as.factor(present_prophages$day)
+normalized_coverage4$coverage <- as.numeric(normalized_coverage4$coverage)
+normalized_coverage4[normalized_coverage4 == 0] <- NA
+normalized_coverage4$week <- runinfo$week[(match(normalized_coverage4$day, runinfo$ID))]
+normalized_coverage4$number <- prophage_host_match$prophage_numbers[(match(normalized_coverage4$contig, prophage_host_match$prophage_names))]
+
+prophage_coverage.heatmap <- ggplot(data = normalized_coverage4, aes(x = day, y = contig, fill = coverage)) +
+  geom_tile() + scale_fill_gradientn(colours = carto.pal(pal1 = "orange.pal", n1 = 5), na.value = 'white') +
+  theme(axis.text.x = element_text(size = 7), axis.text.y = element_text(size = 7, face = "italic")) +
+  xlab(label = "Day") + ylab(label = 'Active Prophages') + theme(axis.text.x = element_text(size = 7), axis.title = element_text(size = 7))
+prophage_coverage.heatmap_grided <- prophage_coverage.heatmap +
+  facet_grid(~ week) + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank())
+prophage_coverage.heatmap_grided
+prophage_coverage.heatmap
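+
+#Sketch (illustration only): the row-wise loop above can also be written vectorised with dplyr,
+#assuming the same columns (coverage, meandepth, SRR_columns) and runinfo$size_factors.
+normalized_coverage4_alt <- present_prophages %>%
+  mutate(size_factor = runinfo$size_factors[match(SRR_columns, runinfo$SRR)]) %>%
+  mutate(norm_coverage = ifelse(coverage < 75, NA, meandepth/size_factor)) %>% #NA where breadth < 75%
+  select(contig, day, norm_coverage)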
+
+#Change the x-axis labels to the day they were sampled.
+day_sampled <- c(runinfo$Time_days)
+prophage_coverage.heatmap_2 <- prophage_coverage.heatmap + scale_x_discrete(labels = day_sampled) + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank())
+prophage_coverage.heatmap_2 <- prophage_coverage.heatmap_2 + theme(legend.text = element_text(size = 7)) + theme(legend.title = element_text(size = 7))
+prophage_coverage.heatmap_2 <- prophage_coverage.heatmap_2 + theme(legend.key.size = unit(2, 'mm'), #change legend key size
+                                                                   legend.key.height = unit(2, 'mm'), #change legend key height
+                                                                   legend.key.width = unit(2, 'mm'), #change legend key width
+                                                                   legend.title = element_text(size=7), #change legend title font size
+                                                                   legend.text = element_text(size=7)) #change legend text font size
+prophage_coverage.heatmap_2
+ggsave(file='prophage_coverage.heatmap_3.svg', plot=prophage_coverage.heatmap_2, width=21, height=13, units = c("cm"), dpi = 600)
+ggsave(file='prophage_coverage.heatmap_grided.svg', plot=prophage_coverage.heatmap_grided, width=13, height=8)
+
+##### Check Induced Prophages ####
+#Import the prophages that went through a significant fold increase in the DESEQ analysis
+#Method 1: use the significantly increased DESEQ prophages
+DESEQ_induced_prophages <- read.csv(file = '../DESEQ_induced_prophages.tsv', sep = "\t")
+DESEQ_induced_prophages$Phage <- prophage_host_match$prophage_names[match(DESEQ_induced_prophages$Phage, prophage_host_match$prophage_ID)]
+
+#Method 2: use the normalized coverage with a z-score over 1.96
+nozero_coverage <- subset(normalized_coverage2_melted, value > 0)
+nozero_coverage$logCov <- log(nozero_coverage$value)
+nozero_coverage$zscore <- scale(nozero_coverage$logCov)
+#nozero_coverage$variable <- prophage_host_match$prophage_names[match(nozero_coverage$variable, prophage_host_match$prophage_ID)]
+one_hundred_coverage <- subset(nozero_coverage, value > 100)
+significant_coverage <- subset(nozero_coverage, zscore > 1.96)
+
+# bins_with_induced <- unique((significant_coverage$variable))
+# bins_position <- match(bins_with_induced, bacterial_bin$V1)
+# bin_names <- bacterial_bin$V9[bins_position]
+# bin_names
+
+#Compare the two methods
+method1 <- DESEQ_induced_prophages$Phage
+method2 <- significant_coverage$variable
+intersect(method1, method2)
+
+##### Misclassified Prophage-Host #####
+
+coverage_data %>% filter(str_detect(contig, "DAS")) %>% filter(coverage > 95) %>% select(contig) -> high_covered_prophages
+unique(sort(high_covered_prophages[,]))
+
+#We see that 28 of 52 prophages could fall into this 'misclassified to host' area.
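+
+#Sketch (illustration only): besides the overlap, setdiff() shows the prophages each method
+#flags on its own, which is a quick way to gauge how much the two calls disagree.
+setdiff(method1, method2) #induced by DESEQ only
+setdiff(method2, method1) #z-score > 1.96 only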
+
diff --git a/prophage_merger_v2.py b/prophage_merger_v2.py
new file mode 100644
index 0000000..2fe307e
--- /dev/null
+++ b/prophage_merger_v2.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Apr 8 10:13:33 2021
+
+@author: steven
+"""
+
+import pandas as pd
+
+#Merges prophage coordinate intervals that overlap with each other
+
+#Load the coordinates: column 0 = contig, column 1 = start, column 2 = end
+prophage_coordinates = pd.read_csv("path/to/prophage_coordinates.txt", sep='\t', header=None)
+
+#Sort by contig, then by start position, so that overlapping intervals are adjacent
+double_sorted = prophage_coordinates.sort_values(by=[0, 1])
+double_sorted = double_sorted.reset_index(drop=True)
+
+#Iterate through the dataframe: for each row, extend its end coordinate over any later
+#interval on the same contig whose start falls inside it
+for i in range(0, len(double_sorted) - 1):
+    for x in range(i + 1, len(double_sorted)):
+        if str(double_sorted.loc[i, 0]) != str(double_sorted.loc[x, 0]):
+            break  #rows are sorted by contig, so no later row can overlap row i
+        if int(double_sorted.loc[i, 1]) <= int(double_sorted.loc[x, 1]) <= int(double_sorted.loc[i, 2]):
+            if int(double_sorted.loc[i, 2]) <= int(double_sorted.loc[x, 2]):
+                double_sorted.loc[i, 2] = double_sorted.loc[x, 2]
+
+
+def remove_duplicates(z):
+    #Iterate through the dataframe collecting the indices of intervals whose start
+    #falls inside the interval on the previous row (i.e. already merged above)
+    indices_to_delete = []
+    for i in range(1, len(z)):
+        if str(z.loc[i, 0]) == str(z.loc[i - 1, 0]):
+            if int(z.loc[i - 1, 1]) <= int(z.loc[i, 1]) <= int(z.loc[i - 1, 2]):
+                indices_to_delete.append(i)
+    #Create the final dataframe with the merged prophages only
+    z = z.drop(indices_to_delete)
+    return z
+
+
+#Check for remaining overlaps and keep removing them until none are left
+check = True
+while check:
+    check = False
+    double_sorted = remove_duplicates(double_sorted)
+    double_sorted = double_sorted.reset_index(drop=True)
+    for i in range(1, len(double_sorted)):
+        if str(double_sorted.loc[i, 0]) == str(double_sorted.loc[i - 1, 0]):
+            if int(double_sorted.loc[i - 1, 1]) <= int(double_sorted.loc[i, 1]) <= int(double_sorted.loc[i - 1, 2]):
+                check = True
+
+double_sorted.to_csv('final', sep='\t', header=False, index=False)
\ No newline at end of file