Skip to content

Commit

Permalink
updates
Browse files Browse the repository at this point in the history
  • Loading branch information
David Feldman committed Feb 21, 2024
1 parent 2d9a6d2 commit 84386fe
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 15 deletions.
23 changes: 15 additions & 8 deletions src/ngs_analysis/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def parse_reads(sample, simulate=False):
parse_sequences(config, reads).to_parquet(filenames['parsed'])


def map_parsed_reads(sample, simulate=False, mmseqs_max_seqs=10):
def map_parsed_reads(sample, simulate=False, mmseqs_max_seqs=10, drop_unmapped=True):
"""For each read, find nearest match for the requested fields.
Candidate matches from the design table are found via mmseqs search,
Expand All @@ -220,6 +220,9 @@ def map_parsed_reads(sample, simulate=False, mmseqs_max_seqs=10):
:param sample: sample name
:param simulate: if True, use ./*/simulate/ subdirectories
:param drop_unmapped: whether to drop reads that fail to map to all
of the requested fields, if not dropping then the mapped table
may contain null entries
"""
filenames = get_filenames(sample, simulate)

Expand All @@ -245,9 +248,10 @@ def map_parsed_reads(sample, simulate=False, mmseqs_max_seqs=10):
df_match = match_mapped(df_mapped, field)
arr += [df_match]

join = 'inner' if drop_unmapped else 'outer'
df_match = arr[0]
for df in arr[1:]:
df_match = df_match.merge(df, on='read_index')
df_match = df_match.merge(df, on='read_index', how=join)

if secondary:
df_designs = load_designs()
Expand Down Expand Up @@ -325,22 +329,25 @@ def parse_sequences(config, sequences):
Finally, the protein is matched using the template string protein.pattern,
and named matches are added to the result dictionary. Only DNA is stored here.
"""
re_insert = re.compile('{left_adapter}(.*){right_adapter}'.format(**config))
re_insert = re.compile('{left_adapter}(.*?){right_adapter}'.format(**config))
capture = config['protein'].get('capture', {})
optional = config['protein'].get('optional', [])
if 'protein' in config:

re_proteins = [re.compile(format_string_to_capture_regex(
x, optional, **capture)) for x in config['protein']['patterns']]

arr = []
for i, dna in enumerate(sequences):
dna = dna.upper()
entry = {}
try:
insert = re_insert.findall(dna)[0]
except IndexError:
entry = {'sense': 1}
matches = re_insert.findall(dna)
if len(matches) == 0:
matches = re_insert.findall(reverse_complement(dna))
entry['sense'] = -1
if len(matches) == 0:
continue
insert = matches[0]

entry['read_index'] = i
entry['read_length'] = len(dna)
entry['insert_length'] = len(insert)
Expand Down
20 changes: 13 additions & 7 deletions src/ngs_analysis/nanopore.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@


def setup_from_nanopore_fastqs(path_to_fastq_pass, min_reads=10,
exclude='unclassified', write_sample_table=True):
exclude='unclassified', include=None, write_sample_table=True):
"""Set up analysis from fastq.gz files produced by dorado
basecalling and barcode demultiplexing.
Expand All @@ -27,8 +27,14 @@ def setup_from_nanopore_fastqs(path_to_fastq_pass, min_reads=10,
:param write_sample_table: write "samples.csv"
"""
search = os.path.join(path_to_fastq_pass, '*')
folders = [x for x in nglob(search)
if os.path.isdir(x) and not re.findall(exclude, x)]
folders = []
for x in nglob(search):
if not os.path.isdir(x) or re.findall(exclude, x):
continue
if include is not None and not re.findall(include, x):
continue
folders += [x]

samples = []
# combine nanopore output fastq files
for folder in folders:
Expand Down Expand Up @@ -60,7 +66,7 @@ def setup_from_nanopore_fastqs(path_to_fastq_pass, min_reads=10,
.to_csv('samples.csv', index=None))


def demux_reads():
def demux_reads(flye='flye'):
shutil.rmtree('4_demux', ignore_errors=True)
os.makedirs('4_demux')
print('Cleared files from 4_demux/')
Expand Down Expand Up @@ -95,10 +101,10 @@ def demux_reads():

print(f'Wrote {num_files} files to 4_demux/{{sample}}/{{name}}.fastq')

write_flye_commands()
write_flye_commands(flye=flye)


def write_flye_commands(search='4_demux/*/*.fastq', flags='--nano-hq'):
def write_flye_commands(search='4_demux/*/*.fastq', flags='--nano-hq', flye='flye'):
"""Write commands that can be run with bash, parallel, job submission etc.
"""
f_out = '4_demux/flye.sh'
Expand All @@ -107,7 +113,7 @@ def write_flye_commands(search='4_demux/*/*.fastq', flags='--nano-hq'):
arr = []
for fastq in nglob(search):
out_dir = fastq.replace(".fastq", "")
arr += [f'flye {flags} {fastq} --out-dir {out_dir}']
arr += [f'{flye} {flags} {fastq} --out-dir {out_dir}']
with open(f_out, 'w') as fh:
fh.write('\n'.join(arr))
print(f'Wrote {len(arr)} commands to {f_out}')
Expand Down

0 comments on commit 84386fe

Please sign in to comment.