draft finish

billzt · Jul 19, 2019 · bfed0a5 · bfed0a5
1 parent 6ac1485
commit bfed0a5
Show file tree

Hide file tree

Showing 7 changed files with 186 additions and 72 deletions.
diff --git a/setup.py b/setup.py
@@ -1,3 +1,3 @@
-# pysam
+
 # primer3-py
 # flask
diff --git a/src/make_primers.py b/src/make_primers.py
@@ -6,37 +6,40 @@ def calculate_GC(seq):
     gc = len([x for x in seq.upper() if x=='G' or x=='C'])
     return gc/len(seq)*100
 
-def make_primers(query_file):
+def make_primers(query):
     '''
         In case of user's input, make list of primer dicts similar as the result of design_primer module
+        Input:
+            query: a string in multi-lines
     '''
     primers = {}
-    with open(query_file) as f:
-        for line in f:
-            line_data = re.split(r'\s+', line.strip())
-            if len(line_data)==4:
-                (id, rank, seq_F, seq_R) = line_data
-                rank = int(rank)
-            else:
-                (id, seq_F, seq_R) = line_data
-                rank = 0
-            
-            if id not in primers:
-                primers[id] = {}
-            primers[id]['PRIMER_PAIR_NUM_RETURNED'] = rank+1
-            primers[id][f'PRIMER_PAIR_{rank}_PENALTY'] = 0
-            primers[id][f'PRIMER_LEFT_{rank}_SEQUENCE'] = seq_F
-            primers[id][f'PRIMER_RIGHT_{rank}_SEQUENCE'] =  seq_R
-            primers[id][f'PRIMER_LEFT_{rank}'] = [-1, len(seq_F)]
-            primers[id][f'PRIMER_RIGHT_{rank}'] = [-1, len(seq_R)]
-            primers[id][f'PRIMER_LEFT_{rank}_TM'] = primer3.calcTm(seq_F)
-            primers[id][f'PRIMER_RIGHT_{rank}_TM'] = primer3.calcTm(seq_R)
-            primers[id][f'PRIMER_LEFT_{rank}_GC_PERCENT'] = calculate_GC(seq_F)
-            primers[id][f'PRIMER_RIGHT_{rank}_GC_PERCENT'] = calculate_GC(seq_R)
-            primers[id][f'PRIMER_PAIR_{rank}_PRODUCT_SIZE'] = -1
+
+    for line in query.splitlines():
+        line_data = re.split(r'\s+', line.strip())
+        if len(line_data)==4:
+            (id, rank, seq_F, seq_R) = line_data
+            rank = int(rank)
+        else:
+            (id, seq_F, seq_R) = line_data
+            rank = 0
+
+        if id not in primers:
+            primers[id] = {}
+        primers[id]['PRIMER_PAIR_NUM_RETURNED'] = rank+1
+        primers[id][f'PRIMER_PAIR_{rank}_PENALTY'] = 0
+        primers[id][f'PRIMER_LEFT_{rank}_SEQUENCE'] = seq_F
+        primers[id][f'PRIMER_RIGHT_{rank}_SEQUENCE'] =  seq_R
+        primers[id][f'PRIMER_LEFT_{rank}'] = [-1, len(seq_F)]
+        primers[id][f'PRIMER_RIGHT_{rank}'] = [-1, len(seq_R)]
+        primers[id][f'PRIMER_LEFT_{rank}_TM'] = primer3.calcTm(seq_F)
+        primers[id][f'PRIMER_RIGHT_{rank}_TM'] = primer3.calcTm(seq_R)
+        primers[id][f'PRIMER_LEFT_{rank}_GC_PERCENT'] = calculate_GC(seq_F)
+        primers[id][f'PRIMER_RIGHT_{rank}_GC_PERCENT'] = calculate_GC(seq_R)
+        primers[id][f'PRIMER_PAIR_{rank}_PRODUCT_SIZE'] = -1
 
     return primers
 
 if __name__ == "__main__":
-    primers = make_primers('tests/query_check')
-    print(primers)
+    with open('tests/query_check') as f:
+        primers = make_primers(f.read())
+        print(primers)
diff --git a/src/make_sites.py b/src/make_sites.py
@@ -25,37 +25,39 @@ def faidx(template_file, region_string):
         result_seqs[seq_split[0]] = ''.join(seq_split[1:])
     return result_seqs
 
-def build(query_file, template_file, primer_type):
+def build(query, template_file, primer_type):
     '''
-        return a list of primer sites dict that can be passed to the design_primer module
-        [{
-            'id': , 
-            'template': ,
-            'type': ,
-            'pos': ,
-            'length':,
-            'size_min':,
-            'size_max':
-        }]
+        Input:
+            query: a string in multi-lines
+        Return:
+            a list of primer sites dict that can be passed to the design_primer module
+            [{
+                'id': , 
+                'template': ,
+                'type': ,
+                'pos': ,
+                'length':,
+                'size_min':,
+                'size_max':
+            }]
     '''
     primer_sites = []
     retrieve_region2raw_region = {}
-    with open(query_file) as f:
-        for line in f:
-            query_data = re.split(r'\s+', line.strip())
-            (chr, pos, length, size_min, size_max) = (query_data[0], 1, 1, 70, 1000)
-            if len(query_data)>1 :     # seq ID and pos
-                pos = int(query_data[1].replace(',', ''))
-            if len(query_data)>2 :
-                length = int(query_data[2])
-            if len(query_data)>3 :
-                size_min = int(query_data[3])
-                size_max = int(query_data[4])
+    for line in query.splitlines():
+        query_data = re.split(r'\s+', line.strip())
+        (chr, pos, length, size_min, size_max) = (query_data[0], 1, 1, 70, 1000)
+        if len(query_data)>1 :     # seq ID and pos
+            pos = int(query_data[1].replace(',', ''))
+        if len(query_data)>2 :
+            length = int(query_data[2])
+        if len(query_data)>3 :
+            size_min = int(query_data[3])
+            size_max = int(query_data[4])
 
-            # retrieve_start and retrieve_end are used by samtools to extract template sequences
-            retrieve_start = max(pos-size_max, 1)
-            retrieve_end = pos+length+size_max
-            retrieve_region2raw_region[f'{chr}:{retrieve_start}-{retrieve_end}'] = [chr, pos, length, size_min, size_max, retrieve_start]
+        # retrieve_start and retrieve_end are used by samtools to extract template sequences
+        retrieve_start = max(pos-size_max, 1)
+        retrieve_end = pos+length+size_max
+        retrieve_region2raw_region[f'{chr}:{retrieve_start}-{retrieve_end}'] = [chr, pos, length, size_min, size_max, retrieve_start]
 
     retrieve_region_string = '\n'.join(retrieve_region2raw_region.keys())
 
@@ -75,5 +77,6 @@ def build(query_file, template_file, primer_type):
     return primer_sites
 
 if __name__ == "__main__":
-    primer_sites = build(query_file='tests/query_design', template_file='tests/example.fa', primer_type='SEQUENCE_TARGET')
-    print(json.dumps(primer_sites))
+    with open('tests/query_design') as f:
+        primer_sites = build(query=f.read(), template_file='tests/example.fa', primer_type='SEQUENCE_TARGET')
+        print(json.dumps(primer_sites, indent=4))
diff --git a/src/output.py b/src/output.py
@@ -1,22 +1,26 @@
 import json
+import sys
+import os
 
-def text(primers, dbs):
+def after_check(primers, dbs, file):
     print('#Site_ID', 'Primer_Rank', 'Primer_Seq_Left', 'Primer_Seq_Right', 'Target_Amplicon_Size', \
         'Primer_Pair_Penalty_Score', 'Database', 'Possible_Amplicon_Number', \
-            'Primer_Rank_in_Primer3_output', sep='\t')
+            'Primer_Rank_in_Primer3_output', 'Tm_left', 'Tm_Right', sep='\t', file=file)
     for db in dbs:
         for (id, primers) in primers.items():
-            if 'PRIMER_PAIR_NUM_RETURNED_FINAL' not in primers:
-                print(id, 'No_Primer\t', sep='\t', end='')
+            # No primers
+            if primers['PRIMER_PAIR_NUM_RETURNED']==0:
+                print(id, 'No_Primer\t', sep='\t', end='', file=file)
                 if 'PRIMER_LEFT_EXPLAIN' in primers:
                     print(primers['PRIMER_LEFT_EXPLAIN'].replace(' ', '_'), \
                         primers['PRIMER_RIGHT_EXPLAIN'].replace(' ', '_'), \
                             primers['PRIMER_PAIR_EXPLAIN'].replace(' ', '_'), \
-                                sep='\t')
-                print('###')
+                                sep='\t', file=file)
+                print('###', file=file)
                 continue
 
             for amplicon_rank in range(0, primers['PRIMER_PAIR_NUM_RETURNED_FINAL']):
+                db_base = os.path.basename(db)
                 print_data = []
                 print_data.append(id)
                 print_data.append(str(amplicon_rank))
@@ -25,13 +29,16 @@ def text(primers, dbs):
                 print_data.append(primers[f'PRIMER_RIGHT_{raw_rank}_SEQUENCE'])
                 print_data.append(str(primers[f'PRIMER_PAIR_{raw_rank}_PRODUCT_SIZE']))
                 print_data.append(str(primers[f'PRIMER_PAIR_{raw_rank}_PENALTY']))
-                print_data.append(db)
-                print_data.append(str(len(primers[db][f'PRIMER_PAIR_{raw_rank}_AMPLICONS'])))
+                print_data.append(db_base)
+                print_data.append(str(len(primers[db_base][f'PRIMER_PAIR_{raw_rank}_AMPLICONS'])))
                 print_data.append(str(raw_rank))
-                print('\t'.join(print_data))
-            print('###')
+                print_data.append(str(round(primers[f'PRIMER_LEFT_{raw_rank}_TM'], 2)))
+                print_data.append(str(round(primers[f'PRIMER_RIGHT_{raw_rank}_TM'], 2)))
+                print('\t'.join(print_data), file=file)
+            print('###', file=file)
+
 
 if __name__ == "__main__":
     primers = json.load(open('tests/sort_primers.json'))
     dbs = ['example.fa']
-    text(primers, dbs)
+    after_check(primers, dbs, file=sys.stdout)
diff --git a/src/primertool b/src/primertool
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+
+'''Here is the title
+Here is the document
+'''
+
+__author__ = 'Tao Zhu'
+__copyright__ = 'Copyright 2019'
+__license__ = 'GPL'
+__version__ = '0.1'
+__email__ = '[email protected]'
+__status__ = 'Development'
+
+import argparse
+import re
+import os
+import json
+import shutil
+
+from distutils.version import LooseVersion
+
+import make_sites, make_primers, design_primer, run_blast, sort_primers, output
+
+parser = argparse.ArgumentParser(description='PrimerServer2: a high-throughput primer \
+    design and specificity-checking platform', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('query', help='query file. (STDIN is acceptable)', type=argparse.FileType('r'))
+parser.add_argument('template', help='template file in FASTA format')
+
+group = parser.add_mutually_exclusive_group()
+group.add_argument('--no-specificity', help="Don't check specificity; Only design primers", action='store_true')
+group.add_argument('--only-specificity', help="Only check specificity and skip primer designs", action='store_true')
+parser.add_argument('--type', choices=['SEQUENCE_TARGET', 'SEQUENCE_INCLUDED_REGION', 'FORCE_END'],\
+     help='primer types', default='SEQUENCE_TARGET')
+parser.add_argument('-p', '--cpu', type=int, help='Used CPU number.', default=2)
+parser.add_argument('-l', '--checking-size-min', type=int, help='Lower limit of the checking amplicon size range (bp).', \
+    default=70)
+parser.add_argument('-u', '--checking-size-max', type=int, help='Upper limit of the checking amplicon size range (bp).', \
+    default=1000)
+parser.add_argument('-r', '--primer-num-retain', type=int, help='The maximum number of primers for each site to return.', \
+    default=10)
+parser.add_argument('-a', '--report-amplicon-seqs', help="Get amplicon seqs (might be slow)", action='store_true')
+parser.add_argument('--json-debug', help="Output debug information in JSON mode", action='store_true')
+parser.add_argument('-o', '--out', help="Output primers in JSON format", type=argparse.FileType('w'))
+parser.add_argument('-t', '--tsv', help="Output primers in TSV format", type=argparse.FileType('w'))
+args = parser.parse_args()
+
+def error(msg):
+    if args.json_debug is True:
+        print(json.dumps({'ERROR': msg}))
+    raise Exception(msg)
+
+############### Check Environments ###############
+if shutil.which('samtools') is None:
+    error('No samtools detected in your system')
+
+samtools_version = os.popen('samtools --version').readlines()[0].strip().split(' ')[1]
+if LooseVersion(samtools_version) < LooseVersion('1.9'):
+    error(f'Your samtools version is v{samtools_version}, but >=v1.9 is required')
+
+if shutil.which('blastn') is None:
+    error('No NCBI-BLAST+ (blastn) detected in your system')
+
+if shutil.which('makeblastdb') is None:
+    error('No NCBI-BLAST+ (makeblastdb) detected in your system')
+
+############### Check Template File (FASTA) #######
+if os.path.isfile(args.template) is False:
+    error(f'File not found: {args.template}')
+if os.path.isfile(args.template+'.fai') is False:
+    code = os.system(f'samtools faidx {args.template} 2>/dev/null')
+    if code != 0:
+        error(f'File {args.template} cannot be indexed by samtools faidx. Perhaps it is not in FASTA format')
+
+############### Run ###############################
+#### primers ####
+query_string = args.query.read()
+if args.only_specificity is True:
+    primers = make_primers.make_primers(query=query_string)
+else:
+    sites = make_sites.build(query=query_string, template_file=args.template, primer_type=args.type)
+    primers = design_primer.multiple(sites, cpu=args.cpu)
+
+#### specificity ####
+dbs = [args.template]
+if args.no_specificity is False:
+    primers = run_blast.run_blast_parallel(primers=primers, dbs=dbs, cpu=args.cpu,\
+        checking_size_max=args.checking_size_max, checking_size_min=args.checking_size_min, \
+            report_amplicon_seq=args.report_amplicon_seqs)
+    primers = sort_primers.sort_rank(primers=primers, dbs=dbs, max_num_return=args.primer_num_retain)
+
+#### output #########
+if args.out is not None:
+    print(json.dumps(primers, indent=4), file=args.out)
+else:
+    print(json.dumps(primers, indent=4))
+
+if args.tsv is not None:
+    if args.no_specificity is False:
+        output.after_check(primers, dbs, file=args.tsv)
diff --git a/src/run_blast.py b/src/run_blast.py
@@ -41,7 +41,7 @@ def run_blast_parallel(primers, dbs, cpu=2, checking_size_min=70, checking_size_
                 p3_input = {
                     'id': id,
                     'rank': rank,
-                    'db': 'tests/'+ db,
+                    'db': db,
                     'seq_L': primer[f'PRIMER_LEFT_{rank}_SEQUENCE'],
                     'seq_R': primer[f'PRIMER_RIGHT_{rank}_SEQUENCE'],
                     'checking_size_min': checking_size_min,
@@ -63,7 +63,8 @@ def run_blast_parallel(primers, dbs, cpu=2, checking_size_min=70, checking_size_
     return primers
 
 if __name__ == "__main__":
-    primers = make_primers('tests/query_check_multiple')
-    dbs = ['example.fa']
-    report_primers = run_blast_parallel(primers, dbs, cpu=10, report_amplicon_seq=True)
-    print(json.dumps(report_primers))
+    with open('tests/query_check_multiple') as f:
+        primers = make_primers(query=f.read())
+        dbs = ['tests/example.fa']
+        report_primers = run_blast_parallel(primers, dbs, cpu=10, report_amplicon_seq=True)
+        print(json.dumps(report_primers, indent=4))
diff --git a/src/sort_primers.py b/src/sort_primers.py
@@ -1,4 +1,5 @@
 import json
+import os
 
 def sort_rank(primers, dbs, max_num_return=10):
     '''
@@ -14,7 +15,7 @@ def sort_rank(primers, dbs, max_num_return=10):
     '''
     for (id, primer) in primers.items():
         rank_to_amplicon_valid_num = {}
-        main_db = dbs[0]
+        main_db = os.path.basename(dbs[0])
         for rank in range(0, primer['PRIMER_PAIR_NUM_RETURNED']):
             if f'PRIMER_PAIR_{rank}_AMPLICONS' not in primer[main_db]:
                 continue