updates

feldman4 · Feb 21, 2024 · 84386fe · 84386fe
1 parent 2d9a6d2
commit 84386fe
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 15 deletions.
diff --git a/src/ngs_analysis/app.py b/src/ngs_analysis/app.py
@@ -209,7 +209,7 @@ def parse_reads(sample, simulate=False):
         parse_sequences(config, reads).to_parquet(filenames['parsed'])
 
 
-def map_parsed_reads(sample, simulate=False, mmseqs_max_seqs=10):
+def map_parsed_reads(sample, simulate=False, mmseqs_max_seqs=10, drop_unmapped=True):
     """For each read, find nearest match for the requested fields. 
     
     Candidate matches from the design table are found via mmseqs search, 
@@ -220,6 +220,9 @@ def map_parsed_reads(sample, simulate=False, mmseqs_max_seqs=10):
 
     :param sample: sample name
     :param simulate: if True, use ./*/simulate/ subdirectories
+    :param drop_unmapped: whether to drop reads that fail to map to all
+        of the requested fields, if not dropping then the mapped table
+        may contain null entries
     """
     filenames = get_filenames(sample, simulate)
 
@@ -245,9 +248,10 @@ def map_parsed_reads(sample, simulate=False, mmseqs_max_seqs=10):
             df_match = match_mapped(df_mapped, field)
         arr += [df_match]
 
+    join = 'inner' if drop_unmapped else 'outer'
     df_match = arr[0]
     for df in arr[1:]:
-        df_match = df_match.merge(df, on='read_index')
+        df_match = df_match.merge(df, on='read_index', how=join)
 
     if secondary:
         df_designs = load_designs()
@@ -325,22 +329,25 @@ def parse_sequences(config, sequences):
     Finally, the protein is matched using the template string protein.pattern,
     and named matches are added to the result dictionary. Only DNA is stored here.
     """
-    re_insert = re.compile('{left_adapter}(.*){right_adapter}'.format(**config))
+    re_insert = re.compile('{left_adapter}(.*?){right_adapter}'.format(**config))
     capture = config['protein'].get('capture', {})
     optional = config['protein'].get('optional', [])
     if 'protein' in config:
-
         re_proteins = [re.compile(format_string_to_capture_regex(
             x, optional, **capture)) for x in config['protein']['patterns']]
 
     arr = []
     for i, dna in enumerate(sequences):
         dna = dna.upper()
-        entry = {}
-        try:
-            insert = re_insert.findall(dna)[0]
-        except IndexError:
+        entry = {'sense': 1}
+        matches = re_insert.findall(dna)
+        if len(matches) == 0:
+            matches = re_insert.findall(reverse_complement(dna))
+            entry['sense'] = -1
+        if len(matches) == 0:
             continue
+        insert = matches[0]
+
         entry['read_index'] = i
         entry['read_length'] = len(dna)
         entry['insert_length'] = len(insert)

diff --git a/src/ngs_analysis/nanopore.py b/src/ngs_analysis/nanopore.py
@@ -11,7 +11,7 @@
 
 
 def setup_from_nanopore_fastqs(path_to_fastq_pass, min_reads=10, 
-        exclude='unclassified', write_sample_table=True):
+        exclude='unclassified', include=None, write_sample_table=True):
     """Set up analysis from fastq.gz files produced by dorado
     basecalling and barcode demultiplexing.
 
@@ -27,8 +27,14 @@ def setup_from_nanopore_fastqs(path_to_fastq_pass, min_reads=10,
     :param write_sample_table: write "samples.csv"
     """
     search = os.path.join(path_to_fastq_pass, '*')
-    folders = [x for x in nglob(search) 
-               if os.path.isdir(x) and not re.findall(exclude, x)]
+    folders = []
+    for x in nglob(search):
+        if not os.path.isdir(x) or re.findall(exclude, x):
+            continue
+        if include is not None and not re.findall(include, x):
+            continue
+        folders += [x]
+
     samples = []
     # combine nanopore output fastq files    
     for folder in folders:
@@ -60,7 +66,7 @@ def setup_from_nanopore_fastqs(path_to_fastq_pass, min_reads=10,
          .to_csv('samples.csv', index=None))
 
 
-def demux_reads():
+def demux_reads(flye='flye'):
     shutil.rmtree('4_demux', ignore_errors=True)
     os.makedirs('4_demux')
     print('Cleared files from 4_demux/')
@@ -95,10 +101,10 @@ def demux_reads():
 
     print(f'Wrote {num_files} files to 4_demux/{{sample}}/{{name}}.fastq')
 
-    write_flye_commands()
+    write_flye_commands(flye=flye)
 
 
-def write_flye_commands(search='4_demux/*/*.fastq', flags='--nano-hq'):
+def write_flye_commands(search='4_demux/*/*.fastq', flags='--nano-hq', flye='flye'):
     """Write commands that can be run with bash, parallel, job submission etc.
     """
     f_out = '4_demux/flye.sh'
@@ -107,7 +113,7 @@ def write_flye_commands(search='4_demux/*/*.fastq', flags='--nano-hq'):
     arr = []
     for fastq in nglob(search):
         out_dir = fastq.replace(".fastq", "")
-        arr += [f'flye {flags} {fastq} --out-dir {out_dir}']
+        arr += [f'{flye} {flags} {fastq} --out-dir {out_dir}']
     with open(f_out, 'w') as fh:
         fh.write('\n'.join(arr))
     print(f'Wrote {len(arr)} commands to {f_out}')