Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not report a match for ambiguous sequences #827

Merged
merged 1 commit into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ Changelog
development version
-------------------

This release increases the major version number because results when
demultiplexing may change with this release.

* :pr:`827`: When matching multiple adapters (typically when demultiplexing
using barcodes), Cutadapt now no longer assigns ambiguous matches to one
of the adapters/barcodes.
* :issue:`808`: Made gzip compression level 1 the default, which improves
runtime significantly in many cases. (Compressing the output is often a
bottleneck when using multiple threads.) Output files will be larger, but because
Expand Down
19 changes: 19 additions & 0 deletions doc/guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1915,6 +1915,7 @@ To demultiplex this type of data, the


.. _speed-up-demultiplexing:
.. _adapter-indexing:

Speeding up demultiplexing/adapter indexing
-------------------------------------------
Expand Down Expand Up @@ -1961,6 +1962,24 @@ Cutadapt’s output::
.. versionadded:: 5.0
An index is created up to three (instead of two) allowed errors.

Ambiguous sequences
~~~~~~~~~~~~~~~~~~~

When :ref:`an index is created for multiple anchored adapters <adapter-indexing>`, Cutadapt checks
whether there are any possible input sequences that lead to ambiguous matches, that is, which
would match two or more adapter sequences equally well.
If there are ambiguous sequences, Cutadapt prints a warning like this::

WARNING: The adapters are too similar. When creating the index, 31 ambiguous sequences were found that cannot be assigned uniquely.
WARNING: For example, 'TAGTGCTTGA', when found in a read, would result in 10 matches for both bc3 'TAGTGCTTGA' and bc11 'TAGTGCTTGA'
WARNING: Reads with ambiguous sequence will *not* be trimmed.

If you use ``-no-index``, Cutadapt will not print this warning and instead assign ambiguous reads
to the first of the adapters that match equally well.

.. versionadded:: 5.0
Ambiguous sequences were not handled specially in earlier versions.

Demultiplexing paired-end reads in mixed orientation
----------------------------------------------------

Expand Down
62 changes: 35 additions & 27 deletions src/cutadapt/adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -1255,10 +1255,11 @@ def __init__(self, adapters, prefix: bool):
for adapter in adapters:
self._accept(adapter, prefix)
self._adapters = adapters
self._lengths, self._index = self._make_index()
self._lengths, self._index, self._ambiguous = self._make_index()
logger.debug(
"String lengths in the index: %s", sorted(self._lengths, reverse=True)
)

if len(self._lengths) == 1:
self._length = self._lengths[0]
self.match_to = self._match_to_one_length
Expand Down Expand Up @@ -1337,7 +1338,7 @@ def is_acceptable(cls, adapter: SingleAdapter, prefix: bool):
return False
return True

def _make_index(self) -> Tuple[List[int], "AdapterIndexDict"]:
def _make_index(self) -> Tuple[List[int], "AdapterIndexDict", int]:
start_time = time.time()
max_k = max(
(
Expand All @@ -1356,7 +1357,7 @@ def _make_index(self) -> Tuple[List[int], "AdapterIndexDict"]:
)
index: Dict[str, Tuple[SingleAdapter, int, int]] = dict()
lengths = set()
has_warned = False
ambiguous = []
for adapter in self._adapters:
sequence = adapter.sequence
k = int(adapter.max_error_rate * len(sequence))
Expand All @@ -1367,47 +1368,54 @@ def _make_index(self) -> Tuple[List[int], "AdapterIndexDict"]:
other_adapter, other_errors, other_matches = index[s]
if matches < other_matches:
continue
if other_matches == matches and not has_warned:
self._warn_similar(adapter, other_adapter, k, s, matches)
has_warned = True
if other_matches == matches:
ambiguous.append((s, adapter, other_adapter, k, matches))
index[s] = (adapter, errors, matches)
lengths.add(len(s))
else:
n = len(sequence)
for errors in range(k + 1):
matches = n - errors
for s in hamming_sphere(sequence, errors):
matches = n - errors
if s in index:
other_adapter, other_errors, other_matches = index[s]
if matches < other_matches:
continue
if other_matches == matches and not has_warned:
self._warn_similar(
adapter, other_adapter, k, s, matches
if other_matches == matches:
ambiguous.append(
(s, adapter, other_adapter, k, matches)
)
has_warned = True
index[s] = (adapter, errors, matches)
lengths.add(n)

if ambiguous:
logger.warning(
"WARNING: The adapters are too similar. When creating the index, "
"%d ambiguous sequences were found that cannot be assigned uniquely.",
len(ambiguous),
)
s, adapter, other_adapter, k, matches = ambiguous[0]
logger.warning(
"WARNING: For example, %r, when found in a read, would result in "
"%s matches for both %s %r and %s %r",
s,
matches,
other_adapter.name,
other_adapter.sequence,
adapter.name,
adapter.sequence,
)
logger.warning(
"WARNING: Reads with ambiguous sequence will *not* be trimmed."
)
for s, adapter, other_adapter, k, matches in ambiguous:
del index[s]

elapsed = time.time() - start_time
logger.info("Built an index containing %s strings.", len(index))
logger.debug("Building the index took %.1f s", elapsed)

return sorted(lengths, reverse=True), index

@staticmethod
def _warn_similar(adapter, other_adapter, k, s, matches):
logger.warning(
"Adapters %s %r and %s %r are very similar. At %s allowed errors, "
"the sequence %r cannot be assigned uniquely because the number of "
"matches is %s compared to both adapters.",
other_adapter.name,
other_adapter.sequence,
adapter.name,
adapter.sequence,
k,
s,
matches,
)
return sorted(lengths, reverse=True), index, len(ambiguous)

def _match_to_one_length(self, sequence: str):
"""
Expand Down
22 changes: 21 additions & 1 deletion tests/test_adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,7 @@ def test_indexed_very_similar(caplog):
PrefixAdapter("GAAG", max_errors=1, indels=False),
]
)
assert "cannot be assigned uniquely" in caplog.text
assert "ambiguous sequences" in caplog.text


def test_indexed_too_high_k():
Expand Down Expand Up @@ -582,6 +582,26 @@ def test_indexed_prefix_adapters_with_n_collision(sequence):
assert result.adapter is a2


def test_indexed_prefix_adapters_ignore_ambiguous_matches():
a1 = PrefixAdapter("AAAAA", max_errors=1, indels=False)
a2 = PrefixAdapter("TTAAA", max_errors=1, indels=False)
ipa = IndexedPrefixAdapters([a1, a2])

result = ipa.match_to("ATAAA")

assert result is None


def test_indexed_prefix_adapters_ignore_ambiguous_matches_with_indels():
a1 = PrefixAdapter("AGTACGT", max_errors=1, indels=True)
a2 = PrefixAdapter("ACGTAGT", max_errors=1, indels=True)
ipa = IndexedPrefixAdapters([a1, a2])

result = ipa.match_to("ACGTACGT")

assert result is None


def test_inosine_wildcard():
adapter = BackAdapter("CTGIAIT", max_errors=0, min_overlap=3)
match = adapter.match_to("GGCTGAATTGGG")
Expand Down
Loading