-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprophage_merger_v2.py
71 lines (48 loc) · 2.23 KB
/
prophage_merger_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 8 10:13:33 2021
@author: steven
"""
import pandas as pd
# Removes prophages that overlap with each other -- step 1: extend interval ends.
#
# The input is read as a tab-separated table with no header row. The code below
# uses column 0 as a grouping key (compared as text) and columns 1 and 2 as the
# integer start and end of an interval.
# NOTE(review): column 0 is presumably a contig/genome identifier -- confirm.
prophage_coordinates = pd.read_csv("path/to/prophage_coordinates.txt", sep='\t', header=None)

# Order rows by key, then by start, and renumber them 0..n-1 so the integer
# labels used below coincide with row positions.
double_sorted = prophage_coordinates.sort_values(by=[0, 1]).reset_index(drop=True)

# For every row, push its end (column 2) out to the furthest end reachable
# through later rows of the same key that start inside its current
# [start, end] span. Rows absorbed this way are left in place here; they are
# dropped further down in the script.
row_count = len(double_sorted)
for i in range(row_count - 1):
    key = str(double_sorted.at[i, 0])
    for x in range(i + 1, row_count):
        # Rows are sorted by key, so the first mismatch ends this key's group.
        if str(double_sorted.at[x, 0]) != key:
            break
        # Within a group rows are sorted by start: row x never starts before
        # row i, and once one row starts past row i's end all later ones do.
        if int(double_sorted.at[x, 1]) > int(double_sorted.at[i, 2]):
            break
        # Row x starts inside row i; take its end if it reaches at least as far.
        if int(double_sorted.at[i, 2]) <= int(double_sorted.at[x, 2]):
            double_sorted.at[i, 2] = double_sorted.at[x, 2]
def remove_duplicates(z):
    """Drop rows that start inside the interval of the row just before them.

    A row is dropped when it has the same column-0 value (compared as text) as
    the row immediately above it and its column-1 value lies within that row's
    closed ``[column 1, column 2]`` range (compared as integers). Only directly
    adjacent rows are compared, so a single call can leave rows that only
    become adjacent overlaps once their neighbours are gone; call repeatedly
    until nothing changes.

    Rows are paired by position, so ``z`` may carry any index. Surviving rows
    keep their original index labels.

    Args:
        z: DataFrame with columns labelled 0, 1 and 2, ordered so that
            overlapping rows sit next to each other.

    Returns:
        A new DataFrame without the dropped rows; ``z`` itself is not modified.
    """
    keys = z[0].astype(str)
    starts = z[1].astype(int)
    ends = z[2].astype(int)
    # shift() lines each row up with its predecessor. The first row is paired
    # with a missing value, every comparison against it is False, so the first
    # row is always kept.
    starts_inside_previous = (
        (keys == keys.shift())
        & (starts >= starts.shift())
        & (starts <= ends.shift())
    )
    # Select with a plain boolean array so the choice is positional and does
    # not depend on the index labels.
    return z.loc[~starts_inside_previous.to_numpy()]
# Drop the absorbed rows. One pass of remove_duplicates only compares
# neighbouring rows, so repeat until no row starts inside the row before it.
check = True
while check:
    double_sorted = remove_duplicates(double_sorted).reset_index(drop=True)
    # True when some row still shares its key with the previous row and starts
    # within that row's [start, end] range.
    check = any(
        str(double_sorted.loc[row, 0]) == str(double_sorted.loc[row - 1, 0])
        and int(double_sorted.loc[row - 1, 1])
        <= int(double_sorted.loc[row, 1])
        <= int(double_sorted.loc[row - 1, 2])
        for row in range(1, len(double_sorted))
    )

# Write the merged table: tab-separated, no header row, no index column.
double_sorted.to_csv('final', sep='\t', header=False, index=False)