diff --git a/CHANGES.rst b/CHANGES.rst index 51d3df8a..6c3ce4b6 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -25,6 +25,10 @@ development version empty read. By @rhpvorderman. * :issue:`824`: Fix: discard_(un)trimmed values were always ``none`` in JSON report. +* An index (that speeds up matching of many anchored adapters such as barcodes) + is now created for up to three allowed errors (instead of two). Since this can + be slow and memory intensive, particularly when indels are enabled, + a message is printed that suggests using ``--no-indels`` or ``--no-index``. * Dropped support for Python 3.8. * Added support for Python 3.13. diff --git a/doc/guide.rst b/doc/guide.rst index b64bb594..0fda2049 100644 --- a/doc/guide.rst +++ b/doc/guide.rst @@ -1916,32 +1916,37 @@ To demultiplex this type of data, the .. _speed-up-demultiplexing: -Speeding up demultiplexing --------------------------- +Speeding up demultiplexing/adapter indexing +------------------------------------------- Finding many adapters/barcodes simultaneously (which is what demultiplexing in Cutadapt is about), can be sped up tremendously by using the right options since Cutadapt will then be able to create an -index of the barcode sequences instead of checking for each barcode separately. Currently, the -following conditions need to be met in order for index creation to be enabled: +index of the barcode sequences instead of checking for each barcode separately. +The following conditions need to be met for index creation to be enabled: * The barcodes/adapters must be anchored: For 5’ adapters, use ``-g ^ADAPTER`` or ``-g ^file:adapters.fasta``. For 3’ adapters, use ``-a ADAPTER$`` or ``-a file$:adapters.fasta``. -* The maximum error rate (``-e``) must be set such that at most 2 errors are allowed: - Use ``-e 0``, ``-e 1`` or ``-e 2``. +* The maximum error rate (``-e``) must be set such that at most 3 errors are allowed: + Use ``-e 0`` up to ``-e 3``. * No IUPAC wildcards must be used in the barcode/adapter. Also, you cannot use the option ``--match-read-wildcards``. +Index creation is significantly faster and uses less memory if ``--no-indels`` is provided. + +Index creation can be disabled with ``--no-index``. + An index will be built for all the adapters that fulfill these criteria if there are at least two of them. You can provide additional adapters/barcodes, and they will just not be included in the -index. Whether an index is created or not should not affect the results, only how fast you get them. +index. Instead, they will be searched for one by one. +Whether an index is created or not should not affect the results, only how fast you get them. To see whether an index is created, look for a message like this in the first few lines of Cutadapt’s output:: Building index of 23 adapters ... -Hopefully some of the above restrictions will be lifted in the future. + .. versionadded:: 1.15 Demultiplexing of paired-end data. @@ -1953,6 +1958,8 @@ Hopefully some of the above restrictions will be lifted in the future. An index can be built even when indels are allowed (that is, ``--no-indels`` is no longer required). +.. versionadded:: 5.0 + An index is created up to three (instead of two) allowed errors. Demultiplexing paired-end reads in mixed orientation ---------------------------------------------------- diff --git a/src/cutadapt/adapters.py b/src/cutadapt/adapters.py index b51aef84..f3d1415c 100644 --- a/src/cutadapt/adapters.py +++ b/src/cutadapt/adapters.py @@ -4,6 +4,7 @@ The ...Adapter classes are responsible for finding adapters. The ...Match classes trim the reads. """ + import logging from enum import IntFlag from collections import defaultdict @@ -1319,7 +1320,7 @@ def _accept(cls, adapter: SingleAdapter, prefix: bool): if adapter.adapter_wildcards: raise ValueError("Wildcards in the adapter not supported") k = int(len(adapter) * adapter.max_error_rate) - if k > 2: + if k > 3: raise ValueError("Error rate too high") @classmethod @@ -1338,7 +1339,21 @@ def is_acceptable(cls, adapter: SingleAdapter, prefix: bool): def _make_index(self) -> Tuple[List[int], "AdapterIndexDict"]: start_time = time.time() + max_k = max( + ( + int(adapter.max_error_rate * len(adapter.sequence)) + for adapter in self._adapters + if adapter.indels + ), + default=0, + ) logger.info("Building index of %s adapters ...", len(self._adapters)) + if max_k == 3: + logger.info( + "Three errors and indels allowed for at least one of the adapter sequences: " + "Indexing could take long and use a lot of memory. " + "If this becomes a problem, try --no-indels and/or --no-index." + ) index: Dict[str, Tuple[SingleAdapter, int, int]] = dict() lengths = set() has_warned = False diff --git a/tests/test_adapters.py b/tests/test_adapters.py index 3976bfa4..f6fb8dc7 100644 --- a/tests/test_adapters.py +++ b/tests/test_adapters.py @@ -516,8 +516,8 @@ def test_indexed_too_high_k(): with pytest.raises(ValueError) as e: IndexedPrefixAdapters( [ - PrefixAdapter("ACGTACGT", max_errors=3, indels=False), - PrefixAdapter("AAGGTTCC", max_errors=2, indels=False), + PrefixAdapter("ACGTACGT", max_errors=4, indels=False), + PrefixAdapter("AAGGTTCC", max_errors=3, indels=False), ] ) assert "Error rate too high" in e.value.args[0]