forked from skingan/HomolContigsByAnnotation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFindSharedGenes_withLength.py
executable file
·99 lines (75 loc) · 3.11 KB
/
FindSharedGenes_withLength.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python
# Sarah B. Kingan
# 19 August 2016
#
# Pacific Biosciences
# Applications Lab
###########################################################################################################
# import libraries
from argparse import ArgumentParser
import csv
###########################################################################################################
# define command line arguments, program description, and help
desc ='''Identify pairs of contigs that share one or or more annotated genes.
These contigs may represent homologous genomic regions with divergent haplotypes.'''
parser = ArgumentParser(description=desc)
parser.add_argument("GeneBed", help="bed file with gene spans")
parser.add_argument("ContigLengths", help="tab-delimited text with contigName contigLength")
args = parser.parse_args()
###########################################################################################################
# get filenames
BED_file = args.GeneBed
Len_file = args.ContigLengths
###########################################################################################################
# define functions
def BED2Dict(bed_file):
# import BED data into dict
# key = contigID
# value = list of gene IDs
my_dict= {}
with open(bed_file, 'rb') as csvfile:
reader = csv.reader(csvfile, delimiter='\t')
for row in reader:
if (row[0] in my_dict):
my_dict[row[0]].append(row[3])
else:
my_dict[row[0]] = [row[3]]
return my_dict
def Len2Dict(length_file):
# key = contigID
# value = contigLength
my_dict= {}
with open(length_file, 'rb') as csvfile:
reader = csv.reader(csvfile, delimiter='\t')
for row in reader:
my_dict[row[0]] = row[1]
return my_dict
def intersection(list1, list2):
# find intersection between two gene lists
return set(list1).intersection(list2)
###########################################################################################################
# program body
# make dictionary of BED file data and contig length data
contigGene_dict = BED2Dict(BED_file)
contigLength_dict = Len2Dict(Len_file)
# print header
header = ['Contig1', 'Contig2', 'Length1', 'Length2', 'N_shared_genes', 'gene_IDs']
print '\t'.join(str(h) for h in header)
# pairwise comparison of all contigs
contigID_list = contigGene_dict.keys()
for i in range(len(contigID_list)):
for j in range(i):
sharedGenes = list(intersection(contigGene_dict[contigID_list[i]],contigGene_dict[contigID_list[j]]))
if len(sharedGenes) > 0:
length1 = 'NA'
length2 = 'NA'
if contigLength_dict[contigID_list[i]]:
length1 = contigLength_dict[contigID_list[i]]
if contigLength_dict[contigID_list[j]]:
length2 = contigLength_dict[contigID_list[j]]
if int(length1) > int(length2):
output = [contigID_list[i], contigID_list[j], length1, length2]
if int(length2) > int(length1):
output = [contigID_list[j], contigID_list[i], length2, length1]
output.extend([len(sharedGenes), ",".join(str(g) for g in sharedGenes)])
print '\t'.join(str(o) for o in output)