diff --git a/scripts/cutsite_pairs.ipynb b/scripts/cutsite_pairs.ipynb index 9b2705c8..ac26e9b4 100644 --- a/scripts/cutsite_pairs.ipynb +++ b/scripts/cutsite_pairs.ipynb @@ -4,46 +4,271 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# New cut implementation\n" + "# New cut implementation\n", + "\n", + "The most important thing is that cuts are now represented as `((cut_watson, ovhg), enz)`:\n", + "\n", + "- `cut_watson` is a positive integer contained in `[0,len(seq))`, where `seq` is the sequence that will be cut. It represents the position of the cut on the watson strand, using the full sequence as a reference. By \"full sequence\" I mean the one you would get from `str(Dseq)`. See example below.\n", + "- `ovhg` is the overhang left after the cut. It has the same meaning as `ovhg` in the `Bio.Restriction` enzyme objects, or pydna's `Dseq` property.\n", + "- `enz` is the enzyme object. It's not necessary to perform the cut, but can be used to keep track of which enzyme was used.\n", + "\n", + "The new implementation of `Dseq.cut` now looks like this:\n", + "\n", + "```python\n", + "cutsites = self.get_cutsites(*enzymes)\n", + "cutsite_pairs = self.get_cutsite_pairs(cutsites)\n", + "return tuple(self.apply_cut(*cs) for cs in cutsite_pairs)\n", + "```\n", + "\n", + "Let's go through it step by step" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "get_cutsites output: [((3, -4), EcoRI), ((11, -4), EcoRI)]\n", + "EcoRI.search output: [4, 12] < (positions are 1-based)\n", + "EcoRI.ovhg: -4\n", + "\n", + "get_cutsites output: [((6, -4), EcoRI)]\n", + "EcoRI.search output: [7] < (positions are 1-based)\n", + "EcoRI.ovhg: -4\n", + "\n", + "get_cutsites output: [((1, 2), PacI)]\n", + "PacI.search output: [2] < (positions are 1-based)\n", + "PacI.ovhg: 2\n", + "\n" + ] + } + ], "source": [ - "# Coming soon\n", - "# seq = Dseq('aaGAATTCaa', circular=True)\n", + "from pydna.dseq import Dseq\n", + "from Bio.Restriction import EcoRI, PacI\n", "\n", - "# print('EcORI', EcoRI.ovhg, len(seq))\n", - "# for shift in range(len(seq)):\n", - "# seq_shifted = seq.shifted(shift)\n", - "# cut_site = seq_shifted.get_cutsites(EcoRI)[0][0]\n", - "# print(shift, seq_shifted, cut_site, cut_site[0] - cut_site[1])\n", + "dseq = Dseq('aaGAATTCaaGAATTCaa')\n", "\n", - "# seq = Dseq('ccTTAATTAAcc', circular=True)\n", - "# print('PacI', PacI.ovhg, len(seq))\n", - "# for shift in range(len(seq)):\n", - "# seq_shifted = seq.shifted(shift)\n", - "# cut_site = seq_shifted.get_cutsites(PacI)[0][0]\n", - "# print(shift, seq_shifted, cut_site, cut_site[0] - cut_site[1])\n", + "# what this function does is basically handle the format of the enzymes, and return the cut positions\n", + "# that are returned by enz.search, along with the enzyme name and overhang. Positions are made zero-base\n", + "# instead of one-based\n", "\n", + "print('get_cutsites output:', dseq.get_cutsites([EcoRI]))\n", + "print('EcoRI.search output:', EcoRI.search(dseq), '< (positions are 1-based)')\n", + "print('EcoRI.ovhg:', EcoRI.ovhg)\n", + "print()\n", "\n", - "# seq = Dseq('TTAAccccTTAA', circular=True)\n", - "# custom_cut = ((1, 11), type('DynamicClass', (), {'ovhg': 2})())\n", - "# print(seq.apply_cut(custom_cut, custom_cut).__repr__())\n", + "# Below are two examples of circular sequences with a cutsite that spans the origin.\n", + "dseq = Dseq('TTCaaGAA', circular=True)\n", + "print('get_cutsites output:', dseq.get_cutsites([EcoRI]))\n", + "print('EcoRI.search output:', EcoRI.search(dseq, linear=False), '< (positions are 1-based)')\n", + "print('EcoRI.ovhg:', EcoRI.ovhg)\n", + "print()\n", "\n", - "# print()\n", + "dseq = Dseq('TTAAaaTTAA', circular=True)\n", + "print('get_cutsites output:', dseq.get_cutsites([PacI]))\n", + "print('PacI.search output:', PacI.search(dseq, linear=False), '< (positions are 1-based)')\n", + "print('PacI.ovhg:', PacI.ovhg)\n", + "print()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note in the above printed output how if the ovhg is negative, for an origin spanning cutsite, the position lies on the left side of the origin, and viceversa.\n", "\n", - "# custom_cut = ((1, 11), type('DynamicClass', (), {'ovhg': -10})())\n", - "# print(seq.apply_cut(custom_cut, custom_cut).__repr__())" + "Below, you can see that the `cut_watson` is defined with respect to the \"full sequence\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dseq(-10)\n", + "aaGAATTCaa\n", + " tCTTAAGtt\n", + "ovhg: -1 >> [((3, -4), EcoRI)]\n", + "\n", + "Dseq(-10)\n", + "aaGAATTCaa\n", + "ttCTTAAGtt\n", + "ovhg: 0 >> [((3, -4), EcoRI)]\n", + "\n", + "Dseq(-10)\n", + " aGAATTCaa\n", + "ttCTTAAGtt\n", + "ovhg: 1 >> [((3, -4), EcoRI)]\n", + "\n" + ] + } + ], + "source": [ + "# `cut_watson` is defined with respect to the \"full sequence\"\n", + "for ovhg in [-1, 0, 1]:\n", + " dseq = Dseq.from_full_sequence_and_overhangs('aaGAATTCaa', ovhg, 0)\n", + " print(dseq.__repr__())\n", + " print('ovhg:', ovhg, '>>', dseq.get_cutsites([EcoRI]))\n", + " print()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Cuts are only returned if the recognition site and overhang are on the double-strand part of the sequence." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[((1, -4), EcoRI)]\n", + "[]\n" + ] + } + ], + "source": [ + "\n", + "seq = Dseq('GAATTC')\n", + "print(seq.get_cutsites([EcoRI]))\n", + "\n", + "seq = Dseq.from_full_sequence_and_overhangs('GAATTC', -1, 0)\n", + "print(seq.get_cutsites([EcoRI]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Pairing cutsites\n", + "\n", + "A fragment produced by restriction is represented by a tuple of length 2 that may contain cutsites or `None`:\n", + "\n", + "- Two cutsites: represents the extraction of a fragment between those two cutsites, in that orientation. To represent the opening of a circular molecule with a single cutsite, we put the same cutsite twice. See below.\n", + "- `None`, cutsite: represents the extraction of a fragment between the left edge of linear sequence and the cutsite.\n", + "- cutsite, `None`: represents the extraction of a fragment between the cutsite and the right edge of a linear sequence.\n", + "\n", + "## Generating the sequence\n", + "\n", + "To get the fragment, we use the function `dseq.apply_cut`, passing the two elements of the tuple as arguments." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> None, cutsite : (None, ((3, -4), EcoRI))\n", + "Dseq(-7)\n", + "aaG\n", + "ttCTTAA\n", + "\n", + "> cutsite, cutsite : (((3, -4), EcoRI), ((11, -4), EcoRI))\n", + "Dseq(-12)\n", + "AATTCaaG\n", + " GttCTTAA\n", + "\n", + "> cutsite, None : (((11, -4), EcoRI), None)\n", + "Dseq(-7)\n", + "AATTCaa\n", + " Gtt\n", + "\n", + "Circular molecule\n", + "> cutsite, cutsite : (((6, -4), EcoRI), ((6, -4), EcoRI))\n", + "Dseq(-12)\n", + "AATTCaaG\n", + " GttCTTAA\n" + ] + } + ], + "source": [ + "dseq = Dseq('aaGAATTCaaGAATTCaa')\n", + "cutsites = dseq.get_cutsites([EcoRI])\n", + "\n", + "cutsite_pairs = dseq.get_cutsite_pairs(cutsites)\n", + "pair_types = ['None, cutsite', 'cutsite, cutsite', 'cutsite, None']\n", + "\n", + "for pair, pair_type in zip(cutsite_pairs, pair_types):\n", + " print('>', pair_type, ':',pair)\n", + " print(dseq.apply_cut(*pair).__repr__())\n", + " print()\n", + "\n", + "# Opening a circular sequence\n", + "print('Circular molecule')\n", + "dseq = Dseq('TTCaaGAA', circular=True)\n", + "cutsites = dseq.get_cutsites([EcoRI])\n", + "cutsite_pairs = dseq.get_cutsite_pairs(cutsites)\n", + "print('> cutsite, cutsite :', cutsite_pairs[0])\n", + "print(dseq.apply_cut(*cutsite_pairs[0]).__repr__())" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dseq(-7)\n", + " aG\n", + "ttCTTAA\n", + "\n", + "Dseq(-7)\n", + "AATTCaa\n", + " Gt\n" + ] + } + ], + "source": [ + "# Note that the cutsite respects the ovhg of the parent sequence:\n", + "dseq = Dseq.from_full_sequence_and_overhangs('aaGAATTCaaGAATTCaa', 1, 1)\n", + "f1, f2, f3 = dseq.cut([EcoRI])\n", + "print(f1.__repr__())\n", + "print()\n", + "print(f3.__repr__())\n" ] } ], "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" } }, "nbformat": 4, diff --git a/src/pydna/dseq.py b/src/pydna/dseq.py index b7a9a5ae..93a29e97 100644 --- a/src/pydna/dseq.py +++ b/src/pydna/dseq.py @@ -1446,8 +1446,67 @@ def cut(self, *enzymes): cutsite_pairs = self.get_cutsite_pairs(cutsites) return tuple(self.apply_cut(*cs) for cs in cutsite_pairs) + def cutsite_is_valid(self, cutsite): + """Returns False if: + - Cut positions fall outside the sequence (could be moved to Biopython) + - Overhang is not double stranded + - Recognition site is not double stranded or is outside the sequence + - For enzymes that cut twice, it checks that at least one possibility is valid + """ + + assert cutsite != None, "cutsite is None" + + enz = cutsite[1] + watson, crick, ovhg = self.get_cut_parameters(cutsite, True) + + # The cut positions fall within the sequence + # This could go into Biopython + if not self.circular and crick < 0 or crick > len(self): + return False + + # The overhang is double stranded + overhang_dseq = self[watson:crick] if ovhg < 0 else self[crick:watson] + if overhang_dseq.ovhg != 0 or overhang_dseq.watson_ovhg() != 0: + return False + + # The recognition site is double stranded and within the sequence + start_of_recognition_site = watson - enz.fst5 + if start_of_recognition_site < 0: + start_of_recognition_site += len(self) + end_of_recognition_site = start_of_recognition_site + enz.size + if self.circular: + end_of_recognition_site %= len(self) + recognition_site = self[start_of_recognition_site:end_of_recognition_site] + if len(recognition_site) == 0 or recognition_site.ovhg != 0 or recognition_site.watson_ovhg() != 0: + if enz.scd5 is None: + return False + else: + # For enzymes that cut twice, this might be referring to the second one + start_of_recognition_site = watson - enz.scd5 + if start_of_recognition_site < 0: + start_of_recognition_site += len(self) + end_of_recognition_site = start_of_recognition_site + enz.size + if self.circular: + end_of_recognition_site %= len(self) + recognition_site = self[start_of_recognition_site:end_of_recognition_site] + if (len(recognition_site) == 0 or recognition_site.ovhg != 0 or recognition_site.watson_ovhg() != 0): + return False + + return True + def get_cutsites(self, *enzymes): - """Returns a list of cutsites, represented by tuples ((cut_watson, cut_crick), enzyme), sorted by where they cut on the 5' strand. + """Returns a list of cutsites, represented represented as `((cut_watson, ovhg), enz)`: + + - `cut_watson` is a positive integer contained in `[0,len(seq))`, where `seq` is the sequence + that will be cut. It represents the position of the cut on the watson strand, using the full + sequence as a reference. By "full sequence" I mean the one you would get from `str(Dseq)`. + - `ovhg` is the overhang left after the cut. It has the same meaning as `ovhg` in + the `Bio.Restriction` enzyme objects, or pydna's `Dseq` property. + - `enz` is the enzyme object. It's not necessary to perform the cut, but can be + used to keep track of which enzyme was used. + + Cuts are only returned if the recognition site and overhang are on the double-strand + part of the sequence. Parameters ---------- @@ -1465,9 +1524,27 @@ def get_cutsites(self, *enzymes): >>> from pydna.dseq import Dseq >>> seq = Dseq('AAGAATTCAAGAATTC') >>> seq.get_cutsites(EcoRI) - [((3, 7), EcoRI), ((11, 15), EcoRI)] + [((3, -4), EcoRI), ((11, -4), EcoRI)] + + `cut_watson` is defined with respect to the "full sequence", not the + watson strand: + + >>> dseq = Dseq.from_full_sequence_and_overhangs('aaGAATTCaa', 1, 0) + >>> dseq + Dseq(-10) + aGAATTCaa + ttCTTAAGtt + >>> dseq.get_cutsites([EcoRI]) + [((3, -4), EcoRI)] + + Cuts are only returned if the recognition site and overhang are on the double-strand + part of the sequence. + + >>> Dseq('GAATTC').get_cutsites([EcoRI]) + [((1, -4), EcoRI)] + >>> Dseq.from_full_sequence_and_overhangs('GAATTC', -1, 0).get_cutsites([EcoRI]) + [] - TODO: check that the cutsite does not fall on the ovhg and that cutsites don't crash """ if len(enzymes) == 1 and isinstance(enzymes[0], _RestrictionBatch): @@ -1477,18 +1554,18 @@ def get_cutsites(self, *enzymes): enzymes = _flatten(enzymes) out = list() for e in enzymes: - # Positions are 1-based, so we subtract 1 to get 0-based positions + # Positions of the cut on the watson strand. They are 1-based, so we subtract + # 1 to get 0-based positions cuts_watson = [c - 1 for c in e.search(self, linear=(not self.circular))] - cuts_crick = [(c - e.ovhg) % len(self) for c in cuts_watson] - out += [((w, c), e) for w, c in zip(cuts_watson, cuts_crick)] + out += [((w, e.ovhg), e) for w in cuts_watson] - return sorted(out) + return sorted([cutsite for cutsite in out if self.cutsite_is_valid(cutsite)]) def left_end_position(self) -> tuple[int, int]: - """The index in the global sequence of the watson and crick start positions. + """The index in the full sequence of the watson and crick start positions. - Global sequence (str(self)) for all three cases is AAA + full sequence (str(self)) for all three cases is AAA ``` AAA AA AAT @@ -1502,9 +1579,9 @@ def left_end_position(self) -> tuple[int, int]: return 0, -self.ovhg def right_end_position(self) -> tuple[int, int]: - """The index in the global sequence of the watson and crick end positions. + """The index in the full sequence of the watson and crick end positions. - Global sequence (str(self)) for all three cases is AAA + full sequence (str(self)) for all three cases is AAA ``` AAA AA AAA @@ -1517,28 +1594,113 @@ def right_end_position(self) -> tuple[int, int]: return len(self) + self.watson_ovhg(), len(self) return len(self), len(self) - self.watson_ovhg() + def get_cut_parameters(self, cut: tuple, is_left: bool): + """For a given cut expressed as ((cut_watson, ovhg), enz), returns + a tuple (cut_watson, cut_crick, ovhg). + + - cut_watson: see get_cutsites docs + - cut_crick: equivalent of cut_watson in the crick strand + - ovhg: see get_cutsites docs + + The cut can be None if it represents the left or right end of the sequence. + Then it will return the position of the watson and crick ends with respect + to the "full sequence". The `is_left` parameter is only used in this case. + + """ + if cut is not None: + watson, ovhg = cut[0] + crick = (watson - ovhg) + if self.circular: + crick %= len(self) + return watson, crick, ovhg + + assert not self.circular, 'Circular sequences should not have None cuts' + + if is_left: + return *self.left_end_position(), self.ovhg + # In the right end, the overhang does not matter + return *self.right_end_position(), self.watson_ovhg() + def apply_cut(self, left_cut, right_cut): + """Extracts a subfragment of the sequence between two cuts. + + For more detail see the documentation of get_cutsite_pairs. + + Parameters + ---------- + left_cut : Union[tuple[tuple[int,int], _RestrictionType], None] + right_cut: Union[tuple[tuple[int,int], _RestrictionType], None] + + Returns + ------- + Dseq + + Examples + -------- + >>> from Bio.Restriction import EcoRI + >>> from pydna.dseq import Dseq + >>> dseq = Dseq('aaGAATTCaaGAATTCaa') + >>> cutsites = dseq.get_cutsites([EcoRI]) + >>> cutsites + [((3, -4), EcoRI), ((11, -4), EcoRI)] + >>> p1, p2, p3 = dseq.get_cutsite_pairs(cutsites) + >>> p1 + (None, ((3, -4), EcoRI)) + >>> dseq.apply_cut(*p1) + Dseq(-7) + aaG + ttCTTAA + >>> p2 + (((3, -4), EcoRI), ((11, -4), EcoRI)) + >>> dseq.apply_cut(*p2) + Dseq(-12) + AATTCaaG + GttCTTAA + >>> p3 + (((11, -4), EcoRI), None) + >>> dseq.apply_cut(*p3) + Dseq(-7) + AATTCaa + Gtt + + >>> dseq = Dseq('TTCaaGAA', circular=True) + >>> cutsites = dseq.get_cutsites([EcoRI]) + >>> cutsites + [((6, -4), EcoRI)] + >>> pair = dseq.get_cutsite_pairs(cutsites)[0] + >>> pair + (((6, -4), EcoRI), ((6, -4), EcoRI)) + >>> dseq.apply_cut(*pair) + Dseq(-12) + AATTCaaG + GttCTTAA + + """ if _cuts_overlap(left_cut, right_cut, len(self)): raise ValueError("Cuts overlap") - left_watson, left_crick = left_cut[0] if left_cut is not None else self.left_end_position() - ovhg = left_cut[1].ovhg if left_cut is not None else self.ovhg - right_watson, right_crick = right_cut[0] if right_cut is not None else self.right_end_position() + + left_watson, left_crick, ovhg_left = self.get_cut_parameters(left_cut, True) + right_watson, right_crick, _ = self.get_cut_parameters(right_cut, False) return Dseq( str(self[left_watson:right_watson]), # The line below could be easier to understand as _rc(str(self[left_crick:right_crick])), but it does not preserve the case str(self.reverse_complement()[len(self) - right_crick:len(self) - left_crick]), - ovhg=ovhg, + ovhg=ovhg_left, ) def get_cutsite_pairs(self, cutsites): - """ Pairs the cutsites 2 by 2 to render the edges of the resulting fragments. + """ Returns pairs of cutsites that render the edges of the resulting fragments. - Special cases: - - Single cutsite on circular sequence: returns a pair where both cutsites are the same - - Linear sequence: - - creates a new left_cut on the first pair equal to `None` to represent the left edge of the sequence as it is. - - creates a new right_cut on the last pair equal to `None` to represent the right edge of the sequence as it is. - - In both new cuts, the enzyme is set to None to indicate that the cut is not made by an enzyme. + A fragment produced by restriction is represented by a tuple of length 2 that + may contain cutsites or `None`: + + - Two cutsites: represents the extraction of a fragment between those two + cutsites, in that orientation. To represent the opening of a circular + molecule with a single cutsite, we put the same cutsite twice. + - `None`, cutsite: represents the extraction of a fragment between the left + edge of linear sequence and the cutsite. + - cutsite, `None`: represents the extraction of a fragment between the cutsite + and the right edge of a linear sequence. Parameters ---------- @@ -1553,15 +1715,19 @@ def get_cutsite_pairs(self, cutsites): >>> from Bio.Restriction import EcoRI >>> from pydna.dseq import Dseq - >>> seq = Dseq('AAGAATTCAAGAATTC') - >>> seq.get_cutsite_pairs(seq.get_cutsites(EcoRI)) - [(None, ((3, 7), EcoRI)), (((3, 7), EcoRI), ((11, 15), EcoRI)), (((11, 15), EcoRI), None)] - >>> seq = Dseq('AAGAATTCAAGAATTC', circular=True) - >>> seq.get_cutsite_pairs(seq.get_cutsites(EcoRI)) - [(((3, 7), EcoRI), ((11, 15), EcoRI)), (((11, 15), EcoRI), ((3, 7), EcoRI))] - >>> seq = Dseq('AAGAATTCAA', circular=True) - >>> seq.get_cutsite_pairs(seq.get_cutsites(EcoRI)) - [(((3, 7), EcoRI), ((3, 7), EcoRI))] + >>> dseq = Dseq('aaGAATTCaaGAATTCaa') + >>> cutsites = dseq.get_cutsites([EcoRI]) + >>> cutsites + [((3, -4), EcoRI), ((11, -4), EcoRI)] + >>> dseq.get_cutsite_pairs(cutsites) + [(None, ((3, -4), EcoRI)), (((3, -4), EcoRI), ((11, -4), EcoRI)), (((11, -4), EcoRI), None)] + + >>> dseq = Dseq('TTCaaGAA', circular=True) + >>> cutsites = dseq.get_cutsites([EcoRI]) + >>> cutsites + [((6, -4), EcoRI)] + >>> dseq.get_cutsite_pairs(cutsites) + [(((6, -4), EcoRI), ((6, -4), EcoRI))] """ if len(cutsites) == 0: return [] diff --git a/src/pydna/dseqrecord.py b/src/pydna/dseqrecord.py index 79b988cf..05981dc5 100644 --- a/src/pydna/dseqrecord.py +++ b/src/pydna/dseqrecord.py @@ -1361,7 +1361,8 @@ def apply_cut(self, left_cut, right_cut): # 000 # 2222 # - features = self.shifted(min(left_cut[0])).features + left_watson, left_crick, left_ovhg = self.seq.get_cut_parameters(left_cut, True) + features = self.shifted(min(left_watson, left_crick)).features # for f in features: # print(f.id, f.location, _location_boundaries(f.location)) # Here, we have done what's shown below (* indicates the origin). @@ -1374,8 +1375,8 @@ def apply_cut(self, left_cut, right_cut): # 000 # 2222 - features_need_transfer = [f for f in features if (_location_boundaries(f.location)[1] <= abs(left_cut[1].ovhg))] - features_need_transfer = [_shift_feature(f, -abs(left_cut[1].ovhg), len(self)) for f in features_need_transfer] + features_need_transfer = [f for f in features if (_location_boundaries(f.location)[1] <= abs(left_ovhg))] + features_need_transfer = [_shift_feature(f, -abs(left_ovhg), len(self)) for f in features_need_transfer] # ^ ^^^^^^^^^ # Now we have shifted the features that end before the cut (0 and 1, but not 3), as if # they referred to the below sequence (* indicates the origin): @@ -1388,7 +1389,7 @@ def apply_cut(self, left_cut, right_cut): # The features 0 and 1 would have the right location if the final sequence had the same length # as the original one. However, the final product is longer because of the overhang. - features += [_shift_feature(f, abs(left_cut[1].ovhg), len(dseq)) for f in features_need_transfer] + features += [_shift_feature(f, abs(left_ovhg), len(dseq)) for f in features_need_transfer] # ^ ^^^^^^^^^ # So we shift back by the same amount in the opposite direction, but this time we pass the # length of the final product. @@ -1398,11 +1399,11 @@ def apply_cut(self, left_cut, right_cut): _location_boundaries(f.location)[1] <= len(dseq) and _location_boundaries(f.location)[0] <= _location_boundaries(f.location)[1])] else: - left_watson, left_crick = left_cut[0] if left_cut is not None else (0, 0) - right_watson, right_crick = right_cut[0] if right_cut is not None else (None, None) + left_watson, left_crick, left_ovhg = self.seq.get_cut_parameters(left_cut, True) + right_watson, right_crick, right_ovhg = self.seq.get_cut_parameters(right_cut, False) - left_edge = left_crick if dseq.ovhg > 0 else left_watson - right_edge = right_watson if dseq.watson_ovhg() > 0 else right_crick + left_edge = left_crick if left_ovhg > 0 else left_watson + right_edge = right_watson if right_ovhg > 0 else right_crick features = self[left_edge:right_edge].features return Dseqrecord(dseq, features=features) diff --git a/src/pydna/utils.py b/src/pydna/utils.py index a569c923..378abb6d 100644 --- a/src/pydna/utils.py +++ b/src/pydna/utils.py @@ -21,6 +21,7 @@ import collections as _collections import itertools as _itertools from copy import deepcopy as _deepcopy +from typing import Union as _Union import sys as _sys import re @@ -816,10 +817,11 @@ def cuts_overlap(left_cut, right_cut, seq_len): # This block of code would not be necessary if the cuts were # initially represented like this - (left_watson, _), enz1 = left_cut - (right_watson, _), enz2 = right_cut - left_crick = left_watson - enz1.ovhg - right_crick = right_watson - enz2.ovhg + (left_watson, left_ovhg), _ = left_cut + (right_watson, right_ovhg), _ = right_cut + # Position of the cut on the crick strands on the left and right + left_crick = left_watson - left_ovhg + right_crick = right_watson - right_ovhg if left_crick >= seq_len: left_crick -= seq_len left_watson -= seq_len @@ -832,7 +834,7 @@ def cuts_overlap(left_cut, right_cut, seq_len): y = sorted([right_watson, right_crick]) return (x[1] > y[0]) != (y[1] < x[0]) -def location_boundaries(loc: _sl|_cl): +def location_boundaries(loc: _Union[_sl,_cl]): #TODO: pending on https://github.com/BjornFJohansson/pydna/pull/179 if loc.strand != 1: diff --git a/tests/test_module_dseq.py b/tests/test_module_dseq.py index a3602e8e..4c672bc9 100644 --- a/tests/test_module_dseq.py +++ b/tests/test_module_dseq.py @@ -825,7 +825,7 @@ def test_apply_cut(): assert seq.apply_cut(None, None) == seq # A cut where one side is None leaves that side intact - EcoRI_cut = ((3, 7), type('DynamicClass', (), {'ovhg': -4})()) + EcoRI_cut = ((3, -4), None) assert seq.apply_cut(None, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs('aaGAATT', watson_ovhg=-4, crick_ovhg=0) assert seq.apply_cut(EcoRI_cut, None) == Dseq.from_full_sequence_and_overhangs('AATTCaa', watson_ovhg=0, crick_ovhg=-4) @@ -844,28 +844,28 @@ def test_apply_cut(): # Two cuts extract a subsequence seq = Dseq('aaGAATTCaaGAATTCaa', circular=True) - EcoRI_cut_2 = ((11, 15), type('DynamicClass', (), {'ovhg': -4})()) + EcoRI_cut_2 = ((11, -4), None) assert seq.apply_cut(EcoRI_cut, EcoRI_cut_2) == Dseq.from_full_sequence_and_overhangs('AATTCaaGAATT', watson_ovhg=-4, crick_ovhg=-4) # Overlapping cuts should return an error seq = Dseq('aaGAATTCaa', circular=True) first_cuts = [ - ((3, 7), type('DynamicClass', (), {'ovhg': -4})()), - ((7, 3), type('DynamicClass', (), {'ovhg': 4})()), + ((3, -4), None), + ((7, 4), None), # Spanning the origin - ((9, 8), type('DynamicClass', (), {'ovhg': -8})()), - ((8, 9), type('DynamicClass', (), {'ovhg': 8})()), + ((9, -8), None), + ((8, 8), None), ] overlapping_cuts = [ - ((4, 8), type('DynamicClass', (), {'ovhg': -4})()), - ((2, 6), type('DynamicClass', (), {'ovhg': -4})()), - ((2, 8), type('DynamicClass', (), {'ovhg': -4})()), - ((8, 4), type('DynamicClass', (), {'ovhg': 4})()), - ((6, 2), type('DynamicClass', (), {'ovhg': 4})()), - ((8, 2), type('DynamicClass', (), {'ovhg': 4})()), + ((4, -4), None), + ((2, -4), None), + ((2, -6), None), + ((8, 4), None), + ((6, 4), None), + ((8, 6), None), # Spanning the origin - ((7, 6), type('DynamicClass', (), {'ovhg': -8})()), - ((6, 7), type('DynamicClass', (), {'ovhg': 8})()), + ((7, -8), None), + ((6, 8), None), ] for first_cut in first_cuts: @@ -878,6 +878,140 @@ def test_apply_cut(): print(first_cut, second_cut) assert False, 'Expected ValueError' +def test_cutsite_is_valid(): + + from pydna.dseq import Dseq + from Bio.Restriction import EcoRI, BsaI, PacI, NmeDI, Acc65I, NotI, BamHI, EcoRV + + # Works for circular case + seqs = ["GAATTC", "TTAATTAAC", "GATATC"] + enzs = [EcoRI, PacI, EcoRV] + for seq, enz in zip(seqs, enzs): + dseq = Dseq(seq, circular=True) + for shift in range(len(seq)): + dseq_shifted = dseq.shifted(shift) + cutsite, = dseq_shifted.get_cutsites([enz]) + assert dseq_shifted.cutsite_is_valid(cutsite) + + # Works for overhangs + seqs = ["GAATTC", "TTAATTAA", "GATATC"] + for seq, enz in zip(seqs, enzs): + for ovhg in [-1, 0, 1]: + dseq = Dseq.from_full_sequence_and_overhangs(seq, ovhg, 0) + if ovhg != 0: + assert len(dseq.get_cutsites([enz])) == 0 + else: + assert len(dseq.get_cutsites([enz])) == 1 + + dseq = Dseq.from_full_sequence_and_overhangs(seq, 0, ovhg) + if ovhg != 0: + assert len(dseq.get_cutsites([enz])) == 0 + else: + assert len(dseq.get_cutsites([enz])) == 1 + + # Special cases: + dseq = Dseq.from_full_sequence_and_overhangs('AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA', 0, 0) + assert len(dseq.get_cutsites([NmeDI])) == 2 + # Remove left cutting place + assert len(dseq[2:].get_cutsites([NmeDI])) == 1 + # Remove right cutting place + assert len(dseq[:-2].get_cutsites([NmeDI])) == 1 + # Remove both cutting places + assert len(dseq[2:-2].get_cutsites([NmeDI])) == 0 + + # overhang left side + dseq = Dseq.from_full_sequence_and_overhangs('AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA', -2, 0) + assert len(dseq.get_cutsites([NmeDI])) == 1 + dseq = Dseq.from_full_sequence_and_overhangs('AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA', 2, 0) + assert len(dseq.get_cutsites([NmeDI])) == 1 + + # overhang right side + dseq = Dseq.from_full_sequence_and_overhangs('AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA', 0, 2) + assert len(dseq.get_cutsites([NmeDI])) == 1 + dseq = Dseq.from_full_sequence_and_overhangs('AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA', 0, -2) + assert len(dseq.get_cutsites([NmeDI])) == 1 + + # overhang both sides + dseq = Dseq.from_full_sequence_and_overhangs('AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA', 2, 2) + assert len(dseq.get_cutsites([NmeDI])) == 0 + dseq = Dseq.from_full_sequence_and_overhangs('AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA', -2, -2) + assert len(dseq.get_cutsites([NmeDI])) == 0 + + # overhang on recognition site removes both cutting places + dseq = Dseq.from_full_sequence_and_overhangs('AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA', 16, 0) + assert len(dseq.get_cutsites([NmeDI])) == 0 + dseq = Dseq.from_full_sequence_and_overhangs('AAAAAAAAAAAAAGCCGGCAAAAAAAAAAAA', 0, 16) + assert len(dseq.get_cutsites([NmeDI])) == 0 + +def test_get_cutsite_pairs(): + from pydna.dseq import Dseq + + # in the test, we replace cuts by integers for clarity. + + dseq = Dseq('A') + + # Empty returns empty list + assert dseq.get_cutsite_pairs([]) == [] + + # Single cut on linear seq returns two fragments + assert dseq.get_cutsite_pairs([1]) == [(None, 1), (1, None)] + + # Two cuts on linear seq return three fragments + assert dseq.get_cutsite_pairs([1, 2]) == [(None, 1), (1, 2), (2, None)] + + dseq = Dseq('A', circular=True) + + # Empty returns empty list + assert dseq.get_cutsite_pairs([]) == [] + + # Single cut on circular seq returns opened molecule + assert dseq.get_cutsite_pairs([1]) == [(1, 1)] + + # Two cuts on circular seq return 2 fragments + assert dseq.get_cutsite_pairs([1, 2]) == [(1, 2), (2, 1)] + +def test_get_cut_parameters(): + + from pydna.dseq import Dseq + + dseq = Dseq.from_full_sequence_and_overhangs('aaaACGTaaa', 3, 3) + assert dseq.get_cut_parameters(None, True) == (*dseq.left_end_position(), dseq.ovhg) + assert dseq.get_cut_parameters(None, False) == (*dseq.right_end_position(), dseq.watson_ovhg()) + + assert dseq.get_cut_parameters(((4, -2), None), True) == (4, 6, -2) + assert dseq.get_cut_parameters(((4, -2), None), False) == (4, 6, -2) + assert dseq.get_cut_parameters(((6, 2), None), True) == (6, 4, 2) + assert dseq.get_cut_parameters(((6, 2), None), False) == (6, 4, 2) + + dseq = Dseq('aaaACGTaaa', circular=True) + + # None cannot be used on circular molecules + try: + assert dseq.get_cut_parameters(None, True) == (*dseq.left_end_position(), dseq.ovhg) + except AssertionError as e: + assert e.args[0] == 'Circular sequences should not have None cuts' + else: + assert False, 'Expected AssertionError' + + try: + assert dseq.get_cut_parameters(None, False) == (*dseq.right_end_position(), dseq.watson_ovhg()) + except AssertionError as e: + assert e.args[0] == 'Circular sequences should not have None cuts' + else: + assert False, 'Expected AssertionError' + + # "Normal" cuts + assert dseq.get_cut_parameters(((4, -2), None), True) == (4, 6, -2) + assert dseq.get_cut_parameters(((4, -2), None), False) == (4, 6, -2) + assert dseq.get_cut_parameters(((6, 2), None), True) == (6, 4, 2) + assert dseq.get_cut_parameters(((6, 2), None), False) == (6, 4, 2) + + # Origin-spannign cuts + assert dseq.get_cut_parameters(((9, -2), None), True) == (9, 1, -2) + assert dseq.get_cut_parameters(((9, -2), None), False) == (9, 1, -2) + assert dseq.get_cut_parameters(((1, 2), None), True) == (1, 9, 2) + assert dseq.get_cut_parameters(((1, 2), None), False) == (1, 9, 2) + if __name__ == "__main__": pytest.main([__file__, "-vv", "-s"]) diff --git a/tests/test_module_dseqrecord.py b/tests/test_module_dseqrecord.py index fc058d5e..543f36e7 100644 --- a/tests/test_module_dseqrecord.py +++ b/tests/test_module_dseqrecord.py @@ -2243,8 +2243,7 @@ def test_apply_cut(): # Single cut case for strand in [1, -1, None]: - for cut_coords, cut_ovhg in (((4, 7), -3), ((7, 4), 3)): - dummy_cut = (cut_coords, type('DynamicClass', (), {'ovhg': cut_ovhg})()) + for dummy_cut in (((4, -3), None), ((7, 3), None)): seq = Dseqrecord("acgtATGaatt", circular=True) seq.features.append(SeqFeature(SimpleLocation(4, 7, strand), id='full_overlap')) seq.features.append(SeqFeature(SimpleLocation(3, 7, strand), id='left_side'))