-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsplit_to_subfiles.py
53 lines (46 loc) · 1.36 KB
/
split_to_subfiles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import os,sys,csv
import subprocess
from multiprocessing import Pool, Process, Manager
from glob import glob
import Bio
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
def batch_iterator(iterator, batch_size):
    """Yield lists of up to *batch_size* consecutive items from *iterator*.

    Usable on any iterator, for example SeqRecord objects from
    Bio.SeqIO.parse(...) or plain lines from a file handle. Every
    yielded list has exactly batch_size entries except possibly the
    final one, which may be shorter. An empty iterator yields nothing.

    Parameters
    ----------
    iterator : iterator
        Source of items; consumed as batches are yielded.
    batch_size : int
        Positive maximum length of each yielded list.

    Yields
    ------
    list
        At most batch_size consecutive items from the iterator.
    """
    batch = []
    for entry in iterator:
        batch.append(entry)
        if len(batch) == batch_size:
            yield batch
            batch = []
    # Flush the final, possibly short, batch (nothing if input was empty).
    if batch:
        yield batch
def split_to_subfiles(fasta_iteror):
    """Split a FASTA record iterator into numbered sub-files.

    Consumes *fasta_iteror* in batches of 25,000 records and writes
    each batch to ``<outdir>/merge_<i>.fasta`` with 1-based numbering,
    then prints "Done".

    NOTE(review): reads the module-level global ``outdir`` (assigned in
    the ``__main__`` block) — confirm before reusing this as a library
    function.

    Parameters
    ----------
    fasta_iteror : iterator of Bio.SeqRecord.SeqRecord
        Records to distribute across the sub-files.
    """
    for i, batch in enumerate(batch_iterator(fasta_iteror, 25000), start=1):
        filename = os.path.join(outdir, "merge_%i.fasta" % i)
        # Context manager guarantees the handle is closed even if
        # SeqIO.write raises (the original leaked the handle on error).
        with open(filename, "w") as handle:
            SeqIO.write(batch, handle, "fasta")
    print("Done")
def process_sample(infile):
    """Parse *infile* as FASTA and split its records into sub-files.

    The original opened the file and never closed it; the handle is now
    managed by a ``with`` block. This is safe because split_to_subfiles
    fully exhausts the lazy SeqIO parser before the block exits, so the
    parser never reads from a closed handle.

    Parameters
    ----------
    infile : str
        Path to the input FASTA file.
    """
    with open(infile) as handle:
        split_to_subfiles(SeqIO.parse(handle, "fasta"))
def main(infile):
    """Entry point: hand the input FASTA path off to process_sample."""
    process_sample(infile)
if __name__ == '__main__':
    # Fail with a readable usage message instead of an IndexError when
    # arguments are missing.
    if len(sys.argv) != 3:
        sys.exit("Usage: %s <input.fasta> <output_dir>" % sys.argv[0])
    infile = sys.argv[1]
    # Module-level global read by split_to_subfiles.
    outdir = sys.argv[2]
    # Create the output directory up front so the per-batch writes in
    # split_to_subfiles cannot fail on a missing directory.
    os.makedirs(outdir, exist_ok=True)
    main(infile)