-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsam-to-R.py
executable file
·137 lines (125 loc) · 4.47 KB
/
sam-to-R.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env python
import sys
import argparse
def cigar_to_aln_len(cigar):
current_number = ""
total_length = 0
for c in cigar:
if c in "0123456789":
current_number += c
elif c in "MD=X":
total_length += int(current_number)
current_number = ""
else:
current_number = ""
return total_length
def parse_opts(opt):
opts = {}
for text_opt in opt.split():
[name,typ,value] = text_opt.split(":")
if typ == "A":
opts[name] = value
elif typ == "i":
opts[name] = int(value)
elif typ == "f":
opts[name] = float(value)
elif typ == "Z":
opts[name] = value
elif typ == "H":
opts[name] = bytearray(value.decode("hex"))
elif typ == "B":
arr = value.split(",")
arrtyp = arr[0]
arr = arr[1:]
if arrtyp == "f":
arr = [float(f) for f in arr]
else:
arr = [int(f) for f in arr]
opts[name] = arr
return opts
# ----- command line parsing -----
parser = argparse.ArgumentParser(
prog="sam-to-svg",
description="Turns SAM file into SVG showing alignments.")
parser.add_argument("names", type=str,
help="Name of alignments, comma separated.")
parser.add_argument("sorted_sam_file", type=str, nargs="+",
help="Sorted SAM files.")
parser.add_argument("-l", "--ref_length", type=int,
help="Length of reference "
"(taken from SAM file if header is present.")
parser.add_argument("-m", "--min_length", type=int,
help="Minimum length of alignment to draw.")
parser.add_argument("-i", "--min_identity", type=float,
help="Minimum percent identity of match.")
args = parser.parse_args()
# ----- end command line parsing -----
names = args.names.split(",")
length = args.ref_length
lanes = []
lanesy = []
lanelens = []
if len(names) < len(args.sorted_sam_file):
sys.exit("Error: fewer names than SAM files given.")
n = 0
minlane = 0
nexty = 0
for sam_file in args.sorted_sam_file:
samf = open(sam_file)
reflengths = {}
refname = ""
for line in samf:
if line[0] == "@":
if line[1:3] == "SQ":
split = line[3:].split()
for bit in split:
if bit[0:2] == "SN":
refname = bit[3:]
if bit[0:2] == "LN":
reflengths[refname] = int(bit[3:])
elif line[1:3] == "PG":
split = line[3:].split()
else:
[qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual,opt] \
= line.split(None,11)
aln_len = cigar_to_aln_len(cigar)
opts = parse_opts(opt)
if rname == "*":
continue
if args.min_identity is not None and "NM" not in opts:
sys.exit("No NM value in SAM file, "
"can't calculate percent identity.")
if ((args.min_length is None or aln_len >= args.min_length)
and (args.min_identity is None
or ((float(aln_len - opts["NM"])/aln_len)*100
>= args.min_identity))):
inserted = False
for lane in lanes[minlane:]:
if len(lane) == 0:
lane.append((int(pos),int(pos)+aln_len,n))
inserted = True
break
elif lane[-1][1] < int(pos):
lane.append((int(pos),int(pos)+aln_len,n))
inserted = True
break
if not inserted:
lanes.append([(int(pos),int(pos)+aln_len,n)])
lanelens.append(reflengths[rname])
lanesy.append(nexty)
nexty += 1
minlane = len(lanes)
samf.close()
n += 1
nexty += 0.5
sys.stdout.write("{:s}\t{:s}\t{:s}\t{:s}\n"
.format("track","file","beg","end"))
n = 0
for lane,lanelen in zip(lanes,lanelens):
sys.stdout.write("{:.1f}\t{:s}\t{:d}\t{:d}\n"
.format(lanesy[n], "space", 1, lanelen))
for aln in lane:
sys.stdout.write("{:.1f}\t{:s}\t{:d}\t{:d}\n"
.format(lanesy[n], names[aln[2]],
aln[0], aln[1]))
n += 1