Skip to content

Commit

Permalink
Cutsite pairs bugs (#184)
Browse files Browse the repository at this point in the history
* fix Dseq.apply_cut and minor error in Dseqrecord.apply_cut

* WIP

* return error on overlapping cuts, add tests for it

* #180 solved for features on + strand, but not all

* fixes #180
  • Loading branch information
manulera authored Jan 19, 2024
1 parent a2adeef commit 3ff817c
Show file tree
Hide file tree
Showing 7 changed files with 305 additions and 7 deletions.
51 changes: 51 additions & 0 deletions scripts/cutsite_pairs.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# New cut implementation\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Coming soon\n",
"# seq = Dseq('aaGAATTCaa', circular=True)\n",
"\n",
"# print('EcORI', EcoRI.ovhg, len(seq))\n",
"# for shift in range(len(seq)):\n",
"# seq_shifted = seq.shifted(shift)\n",
"# cut_site = seq_shifted.get_cutsites(EcoRI)[0][0]\n",
"# print(shift, seq_shifted, cut_site, cut_site[0] - cut_site[1])\n",
"\n",
"# seq = Dseq('ccTTAATTAAcc', circular=True)\n",
"# print('PacI', PacI.ovhg, len(seq))\n",
"# for shift in range(len(seq)):\n",
"# seq_shifted = seq.shifted(shift)\n",
"# cut_site = seq_shifted.get_cutsites(PacI)[0][0]\n",
"# print(shift, seq_shifted, cut_site, cut_site[0] - cut_site[1])\n",
"\n",
"\n",
"# seq = Dseq('TTAAccccTTAA', circular=True)\n",
"# custom_cut = ((1, 11), type('DynamicClass', (), {'ovhg': 2})())\n",
"# print(seq.apply_cut(custom_cut, custom_cut).__repr__())\n",
"\n",
"# print()\n",
"\n",
"# custom_cut = ((1, 11), type('DynamicClass', (), {'ovhg': -10})())\n",
"# print(seq.apply_cut(custom_cut, custom_cut).__repr__())"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
44 changes: 39 additions & 5 deletions src/pydna/dseq.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@
from pydna.utils import cseguid as _cseg
from pydna.utils import rc as _rc
from pydna.utils import flatten as _flatten
from pydna.common_sub_strings import common_sub_strings as _common_sub_strings
from pydna.utils import cuts_overlap as _cuts_overlap

from pydna.common_sub_strings import common_sub_strings as _common_sub_strings
from Bio.Restriction import RestrictionBatch as _RestrictionBatch
from Bio.Restriction import CommOnly

Expand Down Expand Up @@ -1484,11 +1485,44 @@ def get_cutsites(self, *enzymes):

return sorted(out)

def apply_cut(self, left_cut, right_cut):
def left_end_position(self) -> tuple[int, int]:
"""The index in the global sequence of the watson and crick start positions.
Global sequence (str(self)) for all three cases is AAA
```
AAA AA AAT
TT TTT TTT
Returns (0, 1) Returns (1, 0) Returns (0, 0)
```
"""
if self.ovhg > 0:
return self.ovhg, 0
return 0, -self.ovhg

left_watson, left_crick = left_cut[0] if left_cut is not None else ((self.ovhg, 0) if self.ovhg > 0 else (0, -self.ovhg))
ovhg = self.ovhg if left_cut is None else left_cut[1].ovhg
right_watson, right_crick = right_cut[0] if right_cut is not None else (len(self.watson), len(self.crick))
def right_end_position(self) -> tuple[int, int]:
"""The index in the global sequence of the watson and crick end positions.
Global sequence (str(self)) for all three cases is AAA
```
AAA AA AAA
TT TTT TTT
Returns (3, 2) Returns (2, 3) Returns (3, 3)
```
"""
if self.watson_ovhg() < 0:
return len(self) + self.watson_ovhg(), len(self)
return len(self), len(self) - self.watson_ovhg()

def apply_cut(self, left_cut, right_cut):
if _cuts_overlap(left_cut, right_cut, len(self)):
raise ValueError("Cuts overlap")
left_watson, left_crick = left_cut[0] if left_cut is not None else self.left_end_position()
ovhg = left_cut[1].ovhg if left_cut is not None else self.ovhg
right_watson, right_crick = right_cut[0] if right_cut is not None else self.right_end_position()
return Dseq(
str(self[left_watson:right_watson]),
# The line below could be easier to understand as _rc(str(self[left_crick:right_crick])), but it does not preserve the case
Expand Down
57 changes: 55 additions & 2 deletions src/pydna/dseqrecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,12 @@
from Bio.Restriction import CommOnly
from pydna.dseq import Dseq as _Dseq
from pydna._pretty import pretty_str as _pretty_str
from pydna.utils import flatten as _flatten
from pydna.utils import flatten as _flatten, location_boundaries as _location_boundaries

# from pydna.utils import memorize as _memorize
from pydna.utils import rc as _rc
from pydna.utils import shift_location as _shift_location
from pydna.utils import shift_feature as _shift_feature
from pydna.common_sub_strings import common_sub_strings as _common_sub_strings
from Bio.SeqFeature import SeqFeature as _SeqFeature
from Bio import SeqIO
Expand Down Expand Up @@ -900,6 +901,7 @@ def __getitem__(self, sl):
# origin-spanning features should only be included after shifting
# in cases where the slice comprises the entire sequence, but then
# sl_start == sl_stop and the second condition is not met
# TODO: _location_boundaries
answer.features = [f for f in answer.features if (
f.location.parts[-1].end <= answer.seq.length and
f.location.parts[0].start <= f.location.parts[-1].end)]
Expand Down Expand Up @@ -1343,7 +1345,58 @@ def apply_cut(self, left_cut, right_cut):
# Not really a cut, but to handle the general case
if left_cut is None:
features = self.features
features = self.shifted(min(left_cut[0])).features
else:
# The features that span the origin if shifting with left_cut, but that do not cross
# the cut site should be included, and if there is a feature within the cut site, it should
# be duplicated. See https://github.com/BjornFJohansson/pydna/issues/180 for a practical example.
#
# Let's say we are going to open a circular plasmid like below (| inidicate cuts, numbers indicate
# features)
#
# 3333|3
# 1111
# 000
# XXXXatg|YYY
# XXX|tacYYYY
# 000
# 2222
#
features = self.shifted(min(left_cut[0])).features
# for f in features:
# print(f.id, f.location, _location_boundaries(f.location))
# Here, we have done what's shown below (* indicates the origin).
# The features 0 and 2 have the right location for the final product:
#
# 3*3333
# 1*111
# XXXX*atgYYY
# XXXX*tacYYY
# 000
# 2222

features_need_transfer = [f for f in features if (_location_boundaries(f.location)[1] <= abs(left_cut[1].ovhg))]
features_need_transfer = [_shift_feature(f, -abs(left_cut[1].ovhg), len(self)) for f in features_need_transfer]
# ^ ^^^^^^^^^
# Now we have shifted the features that end before the cut (0 and 1, but not 3), as if
# they referred to the below sequence (* indicates the origin):
#
# 1111
# 000
# XXXXatg*YYY
# XXXXtac*YYY
#
# The features 0 and 1 would have the right location if the final sequence had the same length
# as the original one. However, the final product is longer because of the overhang.

features += [_shift_feature(f, abs(left_cut[1].ovhg), len(dseq)) for f in features_need_transfer]
# ^ ^^^^^^^^^
# So we shift back by the same amount in the opposite direction, but this time we pass the
# length of the final product.
# print(*features, sep='\n')
# Features like 3 are removed here
features = [f for f in features if (
_location_boundaries(f.location)[1] <= len(dseq) and
_location_boundaries(f.location)[0] <= _location_boundaries(f.location)[1])]
else:
left_watson, left_crick = left_cut[0] if left_cut is not None else (0, 0)
right_watson, right_crick = right_cut[0] if right_cut is not None else (None, None)
Expand Down
40 changes: 40 additions & 0 deletions src/pydna/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import keyword as _keyword
import collections as _collections
import itertools as _itertools
from copy import deepcopy as _deepcopy

import sys as _sys
import re
Expand Down Expand Up @@ -71,6 +72,14 @@ def shift_location(original_location, shift, lim):
assert len(newloc) == len(original_location)
return newloc

def shift_feature(feature, shift, lim):
"""Return a new feature with shifted location."""
# TODO: Missing tests
new_location = shift_location(feature.location, shift, lim)
new_feature = _deepcopy(feature)
new_feature.location = new_location
return new_feature


# def smallest_rotation(s):
# """Smallest rotation of a string.
Expand Down Expand Up @@ -800,6 +809,37 @@ def eq(*args, **kwargs):
same = False
return same

def cuts_overlap(left_cut, right_cut, seq_len):
# Special cases:
if left_cut is None or right_cut is None or left_cut == right_cut:
return False

# This block of code would not be necessary if the cuts were
# initially represented like this
(left_watson, _), enz1 = left_cut
(right_watson, _), enz2 = right_cut
left_crick = left_watson - enz1.ovhg
right_crick = right_watson - enz2.ovhg
if left_crick >= seq_len:
left_crick -= seq_len
left_watson -= seq_len
if right_crick >= seq_len:
right_crick -= seq_len
right_watson -= seq_len

# Convert into ranges x and y and see if ranges overlap
x = sorted([left_watson, left_crick])
y = sorted([right_watson, right_crick])
return (x[1] > y[0]) != (y[1] < x[0])

def location_boundaries(loc: _sl|_cl):

#TODO: pending on https://github.com/BjornFJohansson/pydna/pull/179
if loc.strand != 1:
return loc.parts[-1].start, loc.parts[0].end
else:
return loc.parts[0].start, loc.parts[-1].end


if __name__ == "__main__":
cached = _os.getenv("pydna_cached_funcs", "")
Expand Down
90 changes: 90 additions & 0 deletions tests/test_module_dseq.py
Original file line number Diff line number Diff line change
Expand Up @@ -789,5 +789,95 @@ def test_from_full_sequence_and_overhangs():
assert dseq_2.watson_ovhg() == watson_ovhg


def test_right_end_position():

from pydna.dseq import Dseq

test_cases = [
("AAA", "TT", (3, 2)),
("AA", "TTT", (2, 3)),
("AAA", "TTT", (3, 3)),
]
for watson, crick, expected in test_cases:
dseq = Dseq(watson, crick, ovhg=0, circular=False)
assert dseq.right_end_position() == expected

def test_left_end_position():

from pydna.dseq import Dseq

test_cases = [
("AAA", "TT", (0, 1), -1),
("AA", "TTT", (1, 0), 1),
("AAT", "TTT", (0, 0), 0),
]
for watson, crick, expected, ovhg in test_cases:
dseq = Dseq(watson, crick, ovhg=ovhg, circular=False)
assert dseq.left_end_position() == expected


def test_apply_cut():
from pydna.dseq import Dseq

seq = Dseq('aaGAATTCaa', circular=False)

# A cut where both sides are None returns the same sequence
assert seq.apply_cut(None, None) == seq

# A cut where one side is None leaves that side intact
EcoRI_cut = ((3, 7), type('DynamicClass', (), {'ovhg': -4})())
assert seq.apply_cut(None, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs('aaGAATT', watson_ovhg=-4, crick_ovhg=0)
assert seq.apply_cut(EcoRI_cut, None) == Dseq.from_full_sequence_and_overhangs('AATTCaa', watson_ovhg=0, crick_ovhg=-4)

# It respects the original overhang
seq = Dseq.from_full_sequence_and_overhangs('aaGAATTCaa', watson_ovhg=1, crick_ovhg=1)
assert seq.apply_cut(None, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs('aaGAATT', watson_ovhg=-4, crick_ovhg=1)
assert seq.apply_cut(EcoRI_cut, None) == Dseq.from_full_sequence_and_overhangs('AATTCaa', watson_ovhg=1, crick_ovhg=-4)

seq = Dseq.from_full_sequence_and_overhangs('aaGAATTCaa', watson_ovhg=-1, crick_ovhg=-1)
assert seq.apply_cut(None, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs('aaGAATT', watson_ovhg=-4, crick_ovhg=-1)
assert seq.apply_cut(EcoRI_cut, None) == Dseq.from_full_sequence_and_overhangs('AATTCaa', watson_ovhg=-1, crick_ovhg=-4)

# A repeated cut in a circular molecule opens it up
seq = Dseq('aaGAATTCaa', circular=True)
assert seq.apply_cut(EcoRI_cut, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs('AATTCaaaaGAATT', watson_ovhg=-4, crick_ovhg=-4)

# Two cuts extract a subsequence
seq = Dseq('aaGAATTCaaGAATTCaa', circular=True)
EcoRI_cut_2 = ((11, 15), type('DynamicClass', (), {'ovhg': -4})())
assert seq.apply_cut(EcoRI_cut, EcoRI_cut_2) == Dseq.from_full_sequence_and_overhangs('AATTCaaGAATT', watson_ovhg=-4, crick_ovhg=-4)

# Overlapping cuts should return an error
seq = Dseq('aaGAATTCaa', circular=True)
first_cuts = [
((3, 7), type('DynamicClass', (), {'ovhg': -4})()),
((7, 3), type('DynamicClass', (), {'ovhg': 4})()),
# Spanning the origin
((9, 8), type('DynamicClass', (), {'ovhg': -8})()),
((8, 9), type('DynamicClass', (), {'ovhg': 8})()),
]
overlapping_cuts = [
((4, 8), type('DynamicClass', (), {'ovhg': -4})()),
((2, 6), type('DynamicClass', (), {'ovhg': -4})()),
((2, 8), type('DynamicClass', (), {'ovhg': -4})()),
((8, 4), type('DynamicClass', (), {'ovhg': 4})()),
((6, 2), type('DynamicClass', (), {'ovhg': 4})()),
((8, 2), type('DynamicClass', (), {'ovhg': 4})()),
# Spanning the origin
((7, 6), type('DynamicClass', (), {'ovhg': -8})()),
((6, 7), type('DynamicClass', (), {'ovhg': 8})()),
]

for first_cut in first_cuts:
for second_cut in overlapping_cuts:
try:
seq.apply_cut(first_cut, second_cut)
except ValueError as e:
assert e.args[0] == 'Cuts overlap'
else:
print(first_cut, second_cut)
assert False, 'Expected ValueError'


if __name__ == "__main__":
pytest.main([__file__, "-vv", "-s"])
24 changes: 24 additions & 0 deletions tests/test_module_dseqrecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -2237,6 +2237,30 @@ def test_assemble_YEp24PGK_XK():
assert YEp24PGK_XK_correct.cseguid() == "t9fs_9UvEuD-Ankyy8XEr1hD5DQ"
assert eq(YEp24PGK_XK, YEp24PGK_XK_correct)

def test_apply_cut():
from pydna.dseqrecord import Dseqrecord
from Bio.SeqFeature import SeqFeature, SimpleLocation

# Single cut case
for strand in [1, -1, None]:
for cut_coords, cut_ovhg in (((4, 7), -3), ((7, 4), 3)):
dummy_cut = (cut_coords, type('DynamicClass', (), {'ovhg': cut_ovhg})())
seq = Dseqrecord("acgtATGaatt", circular=True)
seq.features.append(SeqFeature(SimpleLocation(4, 7, strand), id='full_overlap'))
seq.features.append(SeqFeature(SimpleLocation(3, 7, strand), id='left_side'))
seq.features.append(SeqFeature(SimpleLocation(4, 8, strand), id='right_side'))
seq.features.append(SeqFeature(SimpleLocation(3, 10, strand), id='throughout'))
open_seq = seq.apply_cut(dummy_cut, dummy_cut)
assert len(open_seq.features) == 4
new_locs = [str(f.location) for f in open_seq.features]
if strand == 1:
assert new_locs == ['[0:3](+)', '[0:4](+)', '[11:14](+)', '[10:14](+)']
elif strand == -1:
# TODO: change the join{[11:14](-), [10:11](-)} case?
assert new_locs == ['[0:3](-)', '[0:4](-)', '[11:14](-)', 'join{[11:14](-), [10:11](-)}']
if strand == None:
# TODO: pending on https://github.com/BjornFJohansson/pydna/pull/179
assert new_locs == ['[0:3]', '[0:4]', '[11:14]', 'join{[11:14], [10:11]}']

if __name__ == "__main__":
args = [
Expand Down
6 changes: 6 additions & 0 deletions tests/test_module_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,5 +457,11 @@ def close(self):
assert mf(1, kw=1) == ((1,), {"kw": 1})


def test_shift_location():
from pydna.utils import shift_location

# Shifting of locations should be reversible


if __name__ == "__main__":
pytest.main([__file__, "-vv", "-s"])

0 comments on commit 3ff817c

Please sign in to comment.