added cutsite_is_valid to check that the cutting site and recognition…

… site are double stranded
pydna-group · Jan 24, 2024 · c4acf48 · c4acf48
1 parent f21effd
commit c4acf48
Show file tree

Hide file tree

Showing 3 changed files with 331 additions and 26 deletions.
diff --git a/scripts/cutsite_pairs.ipynb b/scripts/cutsite_pairs.ipynb
@@ -4,40 +4,235 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# New cut implementation\n"
+    "# New cut implementation\n",
+    "\n",
+    "The most important thing is that cuts are now represented as `((cut_watson, ovhg), enz)`:\n",
+    "\n",
+    "- `cut_watson` is a positive integer contained in `[0,len(seq))`, where `seq` is the sequence that will be cut. It represents the position of the cut on the watson strand, using the full sequence as a reference. By \"full sequence\" I mean the one you would get from `str(Dseq)`. See example below.\n",
+    "- `ovhg` is the overhang left after the cut. It has the same meaning as `ovhg` in the `Bio.Restriction` enzyme objects, or pydna's `Dseq` property.\n",
+    "- `enz` is the enzyme object. It's not necessary to perform the cut, but can be used to keep track of which enzyme was used.\n",
+    "\n",
+    "The new implementation of `Dseq.cut` now looks like this:\n",
+    "\n",
+    "```python\n",
+    "cutsites = self.get_cutsites(*enzymes)\n",
+    "cutsite_pairs = self.get_cutsite_pairs(cutsites)\n",
+    "return tuple(self.apply_cut(*cs) for cs in cutsite_pairs)\n",
+    "```\n",
+    "\n",
+    "Let's go through it step by step"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "get_cutsites output: [((3, -4), EcoRI), ((11, -4), EcoRI)]\n",
+      "EcoRI.search output: [4, 12] < (positions are 1-based)\n",
+      "EcoRI.ovhg: -4\n",
+      "\n",
+      "get_cutsites output: [((6, -4), EcoRI)]\n",
+      "EcoRI.search output: [7] < (positions are 1-based)\n",
+      "EcoRI.ovhg: -4\n",
+      "\n",
+      "get_cutsites output: [((1, 2), PacI)]\n",
+      "PacI.search output: [2] < (positions are 1-based)\n",
+      "PacI.ovhg: 2\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pydna.dseq import Dseq\n",
+    "from Bio.Restriction import EcoRI, PacI\n",
+    "\n",
+    "dseq = Dseq('aaGAATTCaaGAATTCaa')\n",
+    "\n",
+    "# what this function does is basically handle the format of the enzymes, and return the cut positions\n",
+    "# that are returned by enz.search, along with the enzyme name and overhang. Positions are made zero-base\n",
+    "# instead of one-based\n",
+    "\n",
+    "print('get_cutsites output:', dseq.get_cutsites([EcoRI]))\n",
+    "print('EcoRI.search output:', EcoRI.search(dseq), '< (positions are 1-based)')\n",
+    "print('EcoRI.ovhg:', EcoRI.ovhg)\n",
+    "print()\n",
+    "\n",
+    "# Below are two examples of circular sequences with a cutsite that spans the origin.\n",
+    "dseq = Dseq('TTCaaGAA', circular=True)\n",
+    "print('get_cutsites output:', dseq.get_cutsites([EcoRI]))\n",
+    "print('EcoRI.search output:', EcoRI.search(dseq, linear=False), '< (positions are 1-based)')\n",
+    "print('EcoRI.ovhg:', EcoRI.ovhg)\n",
+    "print()\n",
+    "\n",
+    "dseq = Dseq('TTAAaaTTAA', circular=True)\n",
+    "print('get_cutsites output:', dseq.get_cutsites([PacI]))\n",
+    "print('PacI.search output:', PacI.search(dseq, linear=False), '< (positions are 1-based)')\n",
+    "print('PacI.ovhg:', PacI.ovhg)\n",
+    "print()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note how if the ovhg is negative, for an origin spanning cutsite, the position lies on the left side of the origin, and viceversa."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dseq(-10)\n",
+      "aaGAATTCaa\n",
+      " tCTTAAGtt\n",
+      "ovhg: -1 >> [((3, -4), EcoRI)]\n",
+      "\n",
+      "Dseq(-10)\n",
+      "aaGAATTCaa\n",
+      "ttCTTAAGtt\n",
+      "ovhg: 0 >> [((3, -4), EcoRI)]\n",
+      "\n",
+      "Dseq(-10)\n",
+      " aGAATTCaa\n",
+      "ttCTTAAGtt\n",
+      "ovhg: 1 >> [((3, -4), EcoRI)]\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# `cut_watson` is defined with respect to the \"full sequence\"\n",
+    "for ovhg in [-1, 0, 1]:\n",
+    "    dseq = Dseq.from_full_sequence_and_overhangs('aaGAATTCaa', ovhg, 0)\n",
+    "    print(dseq.__repr__())\n",
+    "    print('ovhg:', ovhg, '>>', dseq.get_cutsites([EcoRI]))\n",
+    "    print()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Cuts are only returned if the recognition site and overhang are on the double-strand part\n",
+    "of the sequence. Not 100% sure if that's the \"right\" way to do it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Coming soon\n",
-    "# seq = Dseq('aaGAATTCaa', circular=True)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "\n",
-    "# print('EcORI', EcoRI.ovhg, len(seq))\n",
-    "# for shift in range(len(seq)):\n",
-    "#     seq_shifted = seq.shifted(shift)\n",
-    "#     cut_site = seq_shifted.get_cutsites(EcoRI)[0][0]\n",
-    "#     print(shift, seq_shifted, cut_site, cut_site[0] - cut_site[1])\n",
     "\n",
-    "# seq = Dseq('ccTTAATTAAcc', circular=True)\n",
-    "# print('PacI', PacI.ovhg, len(seq))\n",
-    "# for shift in range(len(seq)):\n",
-    "#     seq_shifted = seq.shifted(shift)\n",
-    "#     cut_site = seq_shifted.get_cutsites(PacI)[0][0]\n",
-    "#     print(shift, seq_shifted, cut_site, cut_site[0] - cut_site[1])\n",
+    "## Pairing cutsites\n",
     "\n",
+    "A fragment produced by restriction is represented by a tuple of length two that may contain cutsites or `None`:\n",
     "\n",
-    "# seq = Dseq('TTAAccccTTAA', circular=True)\n",
-    "# custom_cut = ((1, 11), type('DynamicClass', (), {'ovhg': 2})())\n",
-    "# print(seq.apply_cut(custom_cut, custom_cut).__repr__())\n",
+    "- Two cutsites: represents the extraction of a fragment between those two cutsites, in that orientation. To represent the opening of a circular molecule with a single cutsite, we put the same cutsite twice. See below.\n",
+    "- `None`, cutsite: represents the extraction of a fragment between the left edge of linear sequence and the cutsite.\n",
+    "- cutsite, `None`: represents the extraction of a fragment between the cutsite and the right edge of a linear sequence.\n",
     "\n",
-    "# print()\n",
+    "## Generating the sequence\n",
     "\n",
-    "# custom_cut = ((1, 11), type('DynamicClass', (), {'ovhg': -10})())\n",
-    "# print(seq.apply_cut(custom_cut, custom_cut).__repr__())"
+    "To get the fragment, we use the function `dseq.apply_cut`, passing the two elements of the tuple as arguments."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "> None, cutsite : (None, ((3, -4), EcoRI))\n",
+      "Dseq(-7)\n",
+      "aaG\n",
+      "ttCTTAA\n",
+      "\n",
+      "> cutsite, cutsite : (((3, -4), EcoRI), ((11, -4), EcoRI))\n",
+      "Dseq(-12)\n",
+      "AATTCaaG\n",
+      "    GttCTTAA\n",
+      "\n",
+      "> cutsite, None : (((11, -4), EcoRI), None)\n",
+      "Dseq(-7)\n",
+      "AATTCaa\n",
+      "    Gtt\n",
+      "\n",
+      "Circular molecule\n",
+      "> cutsite, cutsite : (((6, -4), EcoRI), ((6, -4), EcoRI))\n",
+      "Dseq(-12)\n",
+      "AATTCaaG\n",
+      "    GttCTTAA\n"
+     ]
+    }
+   ],
+   "source": [
+    "dseq = Dseq('aaGAATTCaaGAATTCaa')\n",
+    "cutsites = dseq.get_cutsites([EcoRI])\n",
+    "cutsite_pairs = dseq.get_cutsite_pairs(cutsites)\n",
+    "\n",
+    "pair_types = ['None, cutsite', 'cutsite, cutsite', 'cutsite, None']\n",
+    "\n",
+    "for pair, pair_type in zip(cutsite_pairs, pair_types):\n",
+    "    print('>', pair_type, ':',pair)\n",
+    "    print(dseq.apply_cut(*pair).__repr__())\n",
+    "    print()\n",
+    "\n",
+    "# Opening a circular sequence\n",
+    "print('Circular molecule')\n",
+    "dseq = Dseq('TTCaaGAA', circular=True)\n",
+    "cutsites = dseq.get_cutsites([EcoRI])\n",
+    "cutsite_pairs = dseq.get_cutsite_pairs(cutsites)\n",
+    "print('> cutsite, cutsite :', cutsite_pairs[0])\n",
+    "print(dseq.apply_cut(*cutsite_pairs[0]).__repr__())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dseq(-7)\n",
+      " aG\n",
+      "ttCTTAA\n",
+      "\n",
+      "Dseq(-7)\n",
+      "AATTCaa\n",
+      "    Gt\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Note that the cutsite respects the ovhg of the sequence:\n",
+    "dseq = Dseq.from_full_sequence_and_overhangs('aaGAATTCaaGAATTCaa', 1, 1)\n",
+    "print(dseq.apply_cut(*cutsite_pairs[0]).__repr__())\n",
+    "print()\n",
+    "print(dseq.apply_cut(*cutsite_pairs[-1]).__repr__())\n"
    ]
   }
  ],
@@ -57,7 +252,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.7"
+   "version": "3.11.6"
   }
  },
  "nbformat": 4,

diff --git a/src/pydna/dseq.py b/src/pydna/dseq.py
@@ -1446,6 +1446,51 @@ def cut(self, *enzymes):
         cutsite_pairs = self.get_cutsite_pairs(cutsites)
         return tuple(self.apply_cut(*cs) for cs in cutsite_pairs)
 
+    def cutsite_is_valid(self, cutsite):
+        """Returns False if:
+        - Cut positions fall outside the sequence (could be moved to Biopython)
+        - Overhang is not double stranded
+        - Recognition site is not double stranded or is outside the sequence
+        - For enzymes that cut twice, it checks that at least one possibility is valid
+        """
+        enz = cutsite[1]
+        watson, crick, ovhg = self.get_cut_parameters(cutsite, True)
+
+        # The cut positions fall within the sequence
+        # This could go into Biopython
+        if not self.circular and crick < 0 or crick > len(self):
+            return False
+
+        # The overhang is double stranded
+        overhang_dseq = self[watson:crick] if ovhg < 0 else self[crick:watson]
+        if overhang_dseq.ovhg != 0 or overhang_dseq.watson_ovhg() != 0:
+            return False
+
+        # The recognition site is double stranded and within the sequence
+        start_of_recognition_site = watson - enz.fst5
+        if start_of_recognition_site < 0:
+            start_of_recognition_site += len(self)
+        end_of_recognition_site = start_of_recognition_site + enz.size
+        if self.circular:
+            end_of_recognition_site %= len(self)
+        recognition_site = self[start_of_recognition_site:end_of_recognition_site]
+        if len(recognition_site) == 0 or recognition_site.ovhg != 0 or recognition_site.watson_ovhg() != 0:
+            if enz.scd5 is None:
+                return False
+            else:
+                # For enzymes that cut twice, this might be referring to the second one
+                start_of_recognition_site = watson - enz.scd5
+                if start_of_recognition_site < 0:
+                    start_of_recognition_site += len(self)
+                end_of_recognition_site = start_of_recognition_site + enz.size
+                if self.circular:
+                    end_of_recognition_site %= len(self)
+                recognition_site = self[start_of_recognition_site:end_of_recognition_site]
+                if (len(recognition_site) == 0 or recognition_site.ovhg != 0 or recognition_site.watson_ovhg() != 0):
+                    return False
+
+        return True
+
     def get_cutsites(self, *enzymes):
         """Returns a list of cutsites, represented by tuples ((cut_watson, cut_crick), enzyme), sorted by where they cut on the 5' strand.
 
@@ -1483,7 +1528,7 @@ def get_cutsites(self, *enzymes):
 
             out += [((w, e.ovhg), e) for w in cuts_watson]
 
-        return sorted(out)
+        return sorted([cutsite for cutsite in out if self.cutsite_is_valid(cutsite)])
 
     def left_end_position(self) -> tuple[int, int]:
         """The index in the global sequence of the watson and crick start positions.
@@ -1524,7 +1569,9 @@ def get_cut_parameters(self, cut: tuple, is_left: bool):
         is_left parameter is for."""
         if cut is not None:
             watson, ovhg = cut[0]
-            crick = (watson - ovhg) % len(self)
+            crick = (watson - ovhg)
+            if self.circular:
+                crick %= len(self)
             return watson, crick, ovhg
         if is_left:
             return *self.left_end_position(), self.ovhg
@@ -1536,8 +1583,6 @@ def apply_cut(self, left_cut, right_cut):
 
         left_watson, left_crick, ovhg_left = self.get_cut_parameters(left_cut, True)
         right_watson, right_crick, _ = self.get_cut_parameters(right_cut, False)
-        print(left_watson, left_crick)
-        print(right_watson, right_crick)
         return Dseq(
                     str(self[left_watson:right_watson]),
                     # The line below could be easier to understand as _rc(str(self[left_crick:right_crick])), but it does not preserve the case