diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 24ceefe..437617b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,12 @@ All notable changes to this project will be documented in this file. The format is based on `Keep a Changelog `_. +1.6.0 +----- +- Added ability to parse second upstream / downstream region in ``IlluminaBarcodeParser`` by adding ``upstream2`` and ``downstream2`` parameters. Also modified ``IlluminaBarcodeParser`` so that reads will only be parsed if they are long enough to fully cover the region containing the barcodes and specified upstream / downstream sequences. Based on docs, this is how it was supposed to function before but did not. Additionally, this adds another row ("read too short") to the fates from the barcode parser, as well as the ``outer_flank_fates`` option to report just failing the additional upstream and downstream regions. +- Change default color of heatmaps made by ``CodonVariantTable`` due to current one being obsolete. +- Remove obsolete ``guide=False`` from some ``plotnine`` plots in examples / tests (this was removed in ``plotnine`` version 0.13). + 1.5.0 ----- - Remove use of deprecated ``scipy`` functions like ``flip`` to use ``numpy`` alternatives instead (fixes [this issue](https://github.com/jbloomlab/dms_variants/issues/86)). 
diff --git a/dms_variants/__init__.py b/dms_variants/__init__.py index db24f4b..1538f11 100644 --- a/dms_variants/__init__.py +++ b/dms_variants/__init__.py @@ -10,5 +10,5 @@ __author__ = "`the Bloom lab `_" __email__ = "jbloom@fredhutch.org" -__version__ = "1.5.0" +__version__ = "1.6.0" __url__ = "https://github.com/jbloomlab/dms_variants" diff --git a/dms_variants/codonvarianttable.py b/dms_variants/codonvarianttable.py index 1db8132..7665566 100644 --- a/dms_variants/codonvarianttable.py +++ b/dms_variants/codonvarianttable.py @@ -1934,7 +1934,6 @@ def plotMutHeatmap( expand=(0, 0), ) + p9.ylab(mut_desc) - + p9.scale_fill_cmap("gnuplot") ) if samples is None: @@ -1999,7 +1998,7 @@ def plotMutFreqs( assert "target" not in set(df.columns).union(set(n_variants.columns)) df = ( - df.groupby(["library", "sample", "mutation_type", "site"]) + df.groupby(["library", "sample", "mutation_type", "site"], observed=False) .aggregate({"count": "sum"}) .reset_index() .merge(n_variants, on=["library", "sample"]) diff --git a/dms_variants/illuminabarcodeparser.py b/dms_variants/illuminabarcodeparser.py index 3eef2f7..26e7da8 100644 --- a/dms_variants/illuminabarcodeparser.py +++ b/dms_variants/illuminabarcodeparser.py @@ -30,7 +30,7 @@ class IlluminaBarcodeParser: ---- Barcodes should be read by R1 and optionally R2. Expected arrangement is - 5'-[R2_start]-upstream-barcode-downstream-[R1_start]-3' + 5'-[R2_start]-upstream2-upstream-barcode-downstream-downstream2-[R1_start]-3' R1 anneals downstream of barcode and reads backwards. If R2 is used, it anneals upstream of barcode and reads forward. There can be sequences @@ -38,7 +38,11 @@ class IlluminaBarcodeParser: must fully cover region between R1 start and barcode, and if using R2 then `upstream` must fully cover region between R2 start and barcode. However, it is fine if R1 reads backwards past `upstream`, and if `R2` - reads forward past `downstream`. + reads forward past `downstream`. 
The `upstream2` and `downstream2` + can be used to require additional flanking sequences. Normally these + would just be rolled into `upstream` and `downstream`, but you might + specify separately if you are actually using these to parse additional + indices that you might want to set different mismatch criteria for. Parameters ---------- @@ -72,12 +76,20 @@ class IlluminaBarcodeParser: Length of barcodes. upstream : str Sequence upstream of barcode. + upstream2 : str + Second sequence upstream of barcode. downstream : str Sequence downstream of barcode. + downstream2 : str + Second sequence downstream of barcode upstream_mismatch : int Max number of mismatches allowed in `upstream`. + upstream2_mismatch : int + Max number of mismatches allowed in `upstream2`. downstream_mismatch : int Max number of mismatches allowed in `downstream`. + downstream2_mismatch : int + Max number of mismatches allowed in `downstream2`. valid_barcodes : None or set If not `None`, set of barcodes to retain. bc_orientation : {'R1', 'R2'} @@ -101,9 +113,13 @@ def __init__( *, bclen=None, upstream="", + upstream2="", downstream="", + downstream2="", upstream_mismatch=0, + upstream2_mismatch=0, downstream_mismatch=0, + downstream2_mismatch=0, valid_barcodes=None, bc_orientation="R1", minq=20, @@ -112,16 +128,20 @@ def __init__( ): """See main class doc string.""" self.bclen = bclen - if regex.match(f"^[{self.VALID_NTS}]*$", upstream): - self.upstream = upstream - else: - raise ValueError(f"invalid chars in upstream {upstream}") - if regex.match(f"^[{self.VALID_NTS}]*$", downstream): - self.downstream = downstream - else: - raise ValueError(f"invalid chars in downstream {downstream}") + for param_name, param_val in [ + ("upstream", upstream), + ("downstream", downstream), + ("upstream2", upstream2), + ("downstream2", downstream2), + ]: + if regex.match(f"^[{self.VALID_NTS}]*$", param_val): + setattr(self, param_name, param_val) + else: + raise ValueError(f"invalid chars in {param_name} 
{param_val}") self.upstream_mismatch = upstream_mismatch self.downstream_mismatch = downstream_mismatch + self.upstream2_mismatch = upstream2_mismatch + self.downstream2_mismatch = downstream2_mismatch self.valid_barcodes = valid_barcodes if self.valid_barcodes is not None: self.valid_barcodes = set(self.valid_barcodes) @@ -142,15 +162,61 @@ def __init__( self.list_all_valid_barcodes = list_all_valid_barcodes # specify information about R1 / R2 matches - self._bcend = { - "R1": self.bclen + len(self.downstream), - "R2": self.bclen + len(self.upstream), - } self._rcdownstream = reverse_complement(self.downstream) self._rcupstream = reverse_complement(self.upstream) - self._matches = {"R1": {}, "R2": {}} # match objects by read length + self._rcdownstream2 = reverse_complement(self.downstream2) + self._rcupstream2 = reverse_complement(self.upstream2) + + # build the regex read matches + self._matchers = { + "R1": regex.compile( + f"({self._rcdownstream2})" + + f"{{s<={self.downstream2_mismatch}}}" + + f"({self._rcdownstream})" + + f"{{s<={self.downstream_mismatch}}}" + + f"(?P[ACTG]{{{self.bclen}}})" + + f"({self._rcupstream})" + + f"{{s<={self.upstream_mismatch}}}" + + f"({self._rcupstream2})" + + f"{{s<={self.upstream2_mismatch}}}" + ), + "R2": regex.compile( + f"({self.upstream2})" + + f"{{s<={self.upstream2_mismatch}}}" + + f"({self.upstream})" + + f"{{s<={self.upstream_mismatch}}}" + + f"(?P[ACTG]{{{self.bclen}}})" + + f"({self.downstream})" + + f"{{s<={self.downstream_mismatch}}}" + + f"({self.downstream2})" + + f"{{s<={self.downstream2_mismatch}}}" + ), + } + + # build matchers that do not have upstream2 or downstream2 if needed + self._has_flank2 = (len(self.upstream2) > 0) or (len(self.downstream2) > 0) + self._matchers_no_flank2 = { + "R1": regex.compile( + f"[{self.VALID_NTS}]{{{len(self.downstream2)}}}" + + f"({self._rcdownstream})" + + f"{{s<={self.downstream_mismatch}}}" + + f"(?P[ACTG]{{{self.bclen}}})" + + f"({self._rcupstream})" + + 
f"{{s<={self.upstream_mismatch}}}" + + f"[{self.VALID_NTS}]{{{len(self.upstream2)}}}" + ), + "R2": regex.compile( + f"[{self.VALID_NTS}]{{{len(self.upstream2)}}}" + + f"^({self.upstream})" + + f"{{s<={self.upstream_mismatch}}}" + + f"(?P[ACTG]{{{self.bclen}}})" + + f"({self.downstream})" + + f"{{s<={self.downstream_mismatch}}}" + + f"[{self.VALID_NTS}]{{{len(self.downstream2)}}}" + ), + } - def parse(self, r1files, *, r2files=None, add_cols=None): + def parse(self, r1files, *, r2files=None, add_cols=None, outer_flank_fates=False): """Parse barcodes from files. Parameters @@ -162,6 +228,11 @@ def parse(self, r1files, *, r2files=None, add_cols=None): add_cols : None or dict If dict, specify names and values (i.e., sample or library names) to be aded to returned data frames. + outer_flank_fates : bool + If `True`, if using outer flanking regions then in the output fates + specify reads that fail just the outer flanking regions (`upstream2` or + `downstream2`). Otherwise, such failures will be grouped with the + "unparseable barcode" fate. Returns ------- @@ -177,6 +248,9 @@ def parse(self, r1files, *, r2files=None, add_cols=None): - "R1 / R2 disagree" (if using `r2files`) - "low quality barcode": sequencing quality low - "unparseable barcode": invalid flank sequence, N in barcode + - "read too short": read is too short to cover specified region + - "invalid outer flank" : if using `outer_flank_fates` and + `upstream2` or `downstream2` fails. Note that these data frames also include any columns specified by `add_cols`. 
@@ -210,21 +284,30 @@ def parse(self, r1files, *, r2files=None, add_cols=None): "low quality barcode": 0, "invalid barcode": 0, "valid barcode": 0, + "read too short": 0, } if not r1only: fates["R1 / R2 disagree"] = 0 - - # max length of interest for reads - max_len = self.bclen + len(self.upstream) + len(self.downstream) + if outer_flank_fates and self._has_flank2: + fates["invalid outer flank"] = 0 + + # min length of interest for reads + minlen = ( + self.bclen + + len(self.upstream) + + len(self.downstream) + + len(self.upstream2) + + len(self.downstream2) + ) for filetup in zip(*fileslist): if r1only: assert len(filetup) == 1 - iterator = iterate_fastq(filetup[0], check_pair=1, trim=max_len) + iterator = iterate_fastq(filetup[0], check_pair=1, trim=minlen) else: assert len(filetup) == 2, f"{filetup}\n{fileslist}" iterator = iterate_fastq_pair( - filetup[0], filetup[1], r1trim=max_len, r2trim=max_len + filetup[0], filetup[1], r1trim=minlen, r2trim=minlen ) for entry in iterator: @@ -242,44 +325,18 @@ def parse(self, r1files, *, r2files=None, add_cols=None): fates["failed chastity filter"] += 1 continue - matches = {} - for read, r in zip(reads, readlist): - rlen = len(r) + if any(len(r) < minlen for r in readlist): + fates["read too short"] += 1 + continue - # get or build matcher for read of this length - len_past_bc = rlen - self._bcend[read] - if len_past_bc < 0: - raise ValueError(f"{read} too short: {rlen}") - elif rlen in self._matches[read]: - matcher = self._matches[read][rlen] - else: - if read == "R1": - match_str = ( - f"^({self._rcdownstream})" - f"{{s<={self.downstream_mismatch}}}" - f"(?P[ACTG]{{{self.bclen}}})" - f"({self._rcupstream[: len_past_bc]})" - f"{{s<={self.upstream_mismatch}}}" - ) - else: - assert read == "R2" - match_str = ( - f"^({self.upstream})" - f"{{s<={self.upstream_mismatch}}}" - f"(?P[ACTG]{{{self.bclen}}})" - f"({self.downstream[: len_past_bc]})" - f"{{s<={self.downstream_mismatch}}}" - ) - matcher = regex.compile(match_str, 
flags=regex.BESTMATCH) - self._matches[read][rlen] = matcher - - m = matcher.match(r) - if m: - matches[read] = m - else: - break + assert all(len(r) == minlen for r in readlist) + + matches = { + read: self._matchers[read].fullmatch(r) + for (read, r) in zip(reads, readlist) + } - if len(matches) == len(reads): + if all(m is not None for m in matches.values()): bc = {} bc_q = {} for read, q in zip(reads, qlist): @@ -321,6 +378,15 @@ def parse(self, r1files, *, r2files=None, add_cols=None): fates["low quality barcode"] += 1 else: fates["R1 / R2 disagree"] += 1 + elif ( + outer_flank_fates + and self._has_flank2 + and all( + self._matchers_no_flank2[read].fullmatch(r) is not None + for (read, r) in zip(reads, readlist) + ) + ): + fates["invalid outer flank"] += 1 else: # invalid flanking sequence or N in barcode fates["unparseable barcode"] += 1 diff --git a/notebooks/codonvariant_sim_data.ipynb b/notebooks/codonvariant_sim_data.ipynb index a2ac6af..51aef91 100644 --- a/notebooks/codonvariant_sim_data.ipynb +++ b/notebooks/codonvariant_sim_data.ipynb @@ -3584,7 +3584,7 @@ " axis_text_x=element_text(angle=90),\n", " panel_grid_major_x=element_blank(), # no vertical grid lines\n", " )\n", - " + scale_fill_manual(values=CBPALETTE[1:], guide=False)\n", + " + scale_fill_manual(values=CBPALETTE[1:])\n", ")\n", "_ = p.draw(show=True)" ] diff --git a/notebooks/codonvariant_sim_data_multi_targets.ipynb b/notebooks/codonvariant_sim_data_multi_targets.ipynb index 8b95145..226b3cc 100644 --- a/notebooks/codonvariant_sim_data_multi_targets.ipynb +++ b/notebooks/codonvariant_sim_data_multi_targets.ipynb @@ -5430,7 +5430,7 @@ " axis_text_x=element_text(angle=90),\n", " panel_grid_major_x=element_blank(), # no vertical grid lines\n", " )\n", - " + scale_fill_manual(values=CBPALETTE[1:], guide=False)\n", + " + scale_fill_manual(values=CBPALETTE[1:])\n", ")\n", "\n", "_ = p.draw(show=True)" diff --git a/notebooks/multi_latent_phenos.ipynb b/notebooks/multi_latent_phenos.ipynb index 
e1e5b0e..bde24eb 100644 --- a/notebooks/multi_latent_phenos.ipynb +++ b/notebooks/multi_latent_phenos.ipynb @@ -768,7 +768,7 @@ " axis_text_x=element_text(angle=90),\n", " panel_grid_major_x=element_blank(), # no vertical grid lines\n", " )\n", - " + scale_fill_manual(values=CBPALETTE[1:], guide=False)\n", + " + scale_fill_manual(values=CBPALETTE[1:])\n", ")\n", "_ = p.draw(show=True)" ] diff --git a/notebooks/narrow_bottleneck.ipynb b/notebooks/narrow_bottleneck.ipynb index ab08e57..b56f35c 100644 --- a/notebooks/narrow_bottleneck.ipynb +++ b/notebooks/narrow_bottleneck.ipynb @@ -835,7 +835,7 @@ " axis_text_x=element_text(angle=90),\n", " panel_grid_major_x=element_blank(), # no vertical grid lines\n", " )\n", - " + scale_fill_manual(values=CBPALETTE[1:], guide=False)\n", + " + scale_fill_manual(values=CBPALETTE[1:])\n", ")\n", "_ = p.draw(show=True)" ] diff --git a/notebooks/parsebarcodes_sim_data.ipynb b/notebooks/parsebarcodes_sim_data.ipynb index e7cc757..9f67cd7 100644 --- a/notebooks/parsebarcodes_sim_data.ipynb +++ b/notebooks/parsebarcodes_sim_data.ipynb @@ -26,6 +26,13 @@ "classes": [], "id": "", "n": "1" + }, + "execution": { + "iopub.execute_input": "2024-04-10T20:05:40.815432Z", + "iopub.status.busy": "2024-04-10T20:05:40.814366Z", + "iopub.status.idle": "2024-04-10T20:05:44.954972Z", + "shell.execute_reply": "2024-04-10T20:05:44.954174Z", + "shell.execute_reply.started": "2024-04-10T20:05:40.815379Z" } }, "outputs": [], @@ -61,6 +68,13 @@ "classes": [], "id": "", "n": "3" + }, + "execution": { + "iopub.execute_input": "2024-04-10T20:05:44.958831Z", + "iopub.status.busy": "2024-04-10T20:05:44.958529Z", + "iopub.status.idle": "2024-04-10T20:05:44.961978Z", + "shell.execute_reply": "2024-04-10T20:05:44.961384Z", + "shell.execute_reply.started": "2024-04-10T20:05:44.958804Z" } }, "outputs": [], @@ -83,6 +97,13 @@ "classes": [], "id": "", "n": "5" + }, + "execution": { + "iopub.execute_input": "2024-04-10T20:05:44.965486Z", + "iopub.status.busy": 
"2024-04-10T20:05:44.965154Z", + "iopub.status.idle": "2024-04-10T20:05:44.968603Z", + "shell.execute_reply": "2024-04-10T20:05:44.968002Z", + "shell.execute_reply.started": "2024-04-10T20:05:44.965457Z" } }, "outputs": [], @@ -100,7 +121,15 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T20:05:44.972410Z", + "iopub.status.busy": "2024-04-10T20:05:44.971869Z", + "iopub.status.idle": "2024-04-10T20:05:44.977782Z", + "shell.execute_reply": "2024-04-10T20:05:44.977077Z", + "shell.execute_reply.started": "2024-04-10T20:05:44.972378Z" + } + }, "outputs": [], "source": [ "theme_set(dms_variants.plotnine_themes.theme_graygrid())" @@ -124,6 +153,13 @@ "classes": [], "id": "", "n": "6" + }, + "execution": { + "iopub.execute_input": "2024-04-10T20:05:44.981199Z", + "iopub.status.busy": "2024-04-10T20:05:44.980887Z", + "iopub.status.idle": "2024-04-10T20:05:45.004497Z", + "shell.execute_reply": "2024-04-10T20:05:45.003961Z", + "shell.execute_reply.started": "2024-04-10T20:05:44.981171Z" } }, "outputs": [], @@ -153,6 +189,13 @@ "classes": [], "id": "", "n": "8" + }, + "execution": { + "iopub.execute_input": "2024-04-10T20:05:45.007722Z", + "iopub.status.busy": "2024-04-10T20:05:45.007435Z", + "iopub.status.idle": "2024-04-10T20:05:45.022236Z", + "shell.execute_reply": "2024-04-10T20:05:45.021685Z", + "shell.execute_reply.started": "2024-04-10T20:05:45.007698Z" } }, "outputs": [ @@ -275,7 +318,15 @@ { "cell_type": "code", "execution_count": 7, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T20:05:45.025335Z", + "iopub.status.busy": "2024-04-10T20:05:45.025036Z", + "iopub.status.idle": "2024-04-10T20:05:45.029102Z", + "shell.execute_reply": "2024-04-10T20:05:45.028494Z", + "shell.execute_reply.started": "2024-04-10T20:05:45.025309Z" + } + }, "outputs": [ { "name": "stdout", @@ -301,7 +352,15 @@ { "cell_type": "code", "execution_count": 8, - "metadata": 
{}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T20:05:45.032391Z", + "iopub.status.busy": "2024-04-10T20:05:45.032082Z", + "iopub.status.idle": "2024-04-10T20:05:45.042341Z", + "shell.execute_reply": "2024-04-10T20:05:45.041684Z", + "shell.execute_reply.started": "2024-04-10T20:05:45.032363Z" + } + }, "outputs": [ { "data": { @@ -384,7 +443,15 @@ { "cell_type": "code", "execution_count": 9, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T20:05:45.045607Z", + "iopub.status.busy": "2024-04-10T20:05:45.045302Z", + "iopub.status.idle": "2024-04-10T20:05:45.050103Z", + "shell.execute_reply": "2024-04-10T20:05:45.049529Z", + "shell.execute_reply.started": "2024-04-10T20:05:45.045581Z" + } + }, "outputs": [], "source": [ "barcode_seqs = []\n", @@ -404,7 +471,15 @@ { "cell_type": "code", "execution_count": 10, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T20:05:45.053294Z", + "iopub.status.busy": "2024-04-10T20:05:45.052987Z", + "iopub.status.idle": "2024-04-10T20:05:45.057238Z", + "shell.execute_reply": "2024-04-10T20:05:45.056639Z", + "shell.execute_reply.started": "2024-04-10T20:05:45.053267Z" + } + }, "outputs": [], "source": [ "n_invalid = 3\n", @@ -423,7 +498,15 @@ { "cell_type": "code", "execution_count": 11, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T20:05:45.060813Z", + "iopub.status.busy": "2024-04-10T20:05:45.060433Z", + "iopub.status.idle": "2024-04-10T20:05:45.064295Z", + "shell.execute_reply": "2024-04-10T20:05:45.063634Z", + "shell.execute_reply.started": "2024-04-10T20:05:45.060785Z" + } + }, "outputs": [], "source": [ "n_unparseable = 2\n", @@ -442,7 +525,15 @@ { "cell_type": "code", "execution_count": 12, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T20:05:45.067729Z", + "iopub.status.busy": "2024-04-10T20:05:45.067406Z", + "iopub.status.idle": 
"2024-04-10T20:05:45.070818Z", + "shell.execute_reply": "2024-04-10T20:05:45.070140Z", + "shell.execute_reply.started": "2024-04-10T20:05:45.067700Z" + } + }, "outputs": [], "source": [ "minq = 20\n", @@ -459,7 +550,15 @@ { "cell_type": "code", "execution_count": 13, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T20:05:45.077081Z", + "iopub.status.busy": "2024-04-10T20:05:45.076734Z", + "iopub.status.idle": "2024-04-10T20:05:45.080445Z", + "shell.execute_reply": "2024-04-10T20:05:45.079808Z", + "shell.execute_reply.started": "2024-04-10T20:05:45.077053Z" + } + }, "outputs": [], "source": [ "barcode_seqs = [(seq, minq_char * len(seq)) for seq in barcode_seqs]" @@ -475,7 +574,15 @@ { "cell_type": "code", "execution_count": 14, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T20:05:45.081577Z", + "iopub.status.busy": "2024-04-10T20:05:45.081245Z", + "iopub.status.idle": "2024-04-10T20:05:45.085347Z", + "shell.execute_reply": "2024-04-10T20:05:45.084804Z", + "shell.execute_reply.started": "2024-04-10T20:05:45.081550Z" + } + }, "outputs": [], "source": [ "lowq_char = chr(minq + 33 - 1) # low-quality Q-score\n", @@ -495,7 +602,15 @@ { "cell_type": "code", "execution_count": 15, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T20:05:45.086445Z", + "iopub.status.busy": "2024-04-10T20:05:45.086135Z", + "iopub.status.idle": "2024-04-10T20:05:45.090674Z", + "shell.execute_reply": "2024-04-10T20:05:45.089984Z", + "shell.execute_reply.started": "2024-04-10T20:05:45.086419Z" + } + }, "outputs": [ { "name": "stdout", @@ -576,6 +691,13 @@ "classes": [], "id": "", "n": "13" + }, + "execution": { + "iopub.execute_input": "2024-04-10T20:05:45.091953Z", + "iopub.status.busy": "2024-04-10T20:05:45.091602Z", + "iopub.status.idle": "2024-04-10T20:05:45.098706Z", + "shell.execute_reply": "2024-04-10T20:05:45.097642Z", + "shell.execute_reply.started": 
"2024-04-10T20:05:45.091921Z" } }, "outputs": [], @@ -602,7 +724,15 @@ { "cell_type": "code", "execution_count": 17, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T20:05:45.100991Z", + "iopub.status.busy": "2024-04-10T20:05:45.100290Z", + "iopub.status.idle": "2024-04-10T20:05:45.113899Z", + "shell.execute_reply": "2024-04-10T20:05:45.113239Z", + "shell.execute_reply.started": "2024-04-10T20:05:45.100933Z" + } + }, "outputs": [], "source": [ "with tempfile.NamedTemporaryFile(\"r+\") as fastq:\n", @@ -624,7 +754,15 @@ { "cell_type": "code", "execution_count": 18, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T20:05:45.115209Z", + "iopub.status.busy": "2024-04-10T20:05:45.114844Z", + "iopub.status.idle": "2024-04-10T20:05:45.123314Z", + "shell.execute_reply": "2024-04-10T20:05:45.122677Z", + "shell.execute_reply.started": "2024-04-10T20:05:45.115178Z" + } + }, "outputs": [ { "data": { @@ -705,7 +843,15 @@ { "cell_type": "code", "execution_count": 19, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T20:05:45.124438Z", + "iopub.status.busy": "2024-04-10T20:05:45.124130Z", + "iopub.status.idle": "2024-04-10T20:05:45.132350Z", + "shell.execute_reply": "2024-04-10T20:05:45.131799Z", + "shell.execute_reply.started": "2024-04-10T20:05:45.124412Z" + } + }, "outputs": [ { "data": { @@ -770,6 +916,13 @@ " lib_1\n", " sample_1\n", " \n", + " \n", + " 5\n", + " read too short\n", + " 0\n", + " lib_1\n", + " sample_1\n", + " \n", " \n", "\n", "" @@ -780,7 +933,8 @@ "1 invalid barcode 3 lib_1 sample_1\n", "2 unparseable barcode 2 lib_1 sample_1\n", "3 low quality barcode 1 lib_1 sample_1\n", - "4 failed chastity filter 0 lib_1 sample_1" + "4 failed chastity filter 0 lib_1 sample_1\n", + "5 read too short 0 lib_1 sample_1" ] }, "execution_count": 19, @@ -803,7 +957,15 @@ { "cell_type": "code", "execution_count": 20, - "metadata": {}, + "metadata": { + 
"execution": { + "iopub.execute_input": "2024-04-10T20:05:45.133441Z", + "iopub.status.busy": "2024-04-10T20:05:45.133143Z", + "iopub.status.idle": "2024-04-10T20:05:45.155925Z", + "shell.execute_reply": "2024-04-10T20:05:45.155393Z", + "shell.execute_reply.started": "2024-04-10T20:05:45.133416Z" + } + }, "outputs": [ { "name": "stdout", @@ -929,6 +1091,13 @@ "classes": [], "id": "", "n": "14" + }, + "execution": { + "iopub.execute_input": "2024-04-10T20:05:45.157017Z", + "iopub.status.busy": "2024-04-10T20:05:45.156703Z", + "iopub.status.idle": "2024-04-10T20:05:45.163082Z", + "shell.execute_reply": "2024-04-10T20:05:45.162517Z", + "shell.execute_reply.started": "2024-04-10T20:05:45.156993Z" } }, "outputs": [], @@ -955,7 +1124,15 @@ { "cell_type": "code", "execution_count": 22, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T20:05:45.164156Z", + "iopub.status.busy": "2024-04-10T20:05:45.163844Z", + "iopub.status.idle": "2024-04-10T20:05:45.171713Z", + "shell.execute_reply": "2024-04-10T20:05:45.170643Z", + "shell.execute_reply.started": "2024-04-10T20:05:45.164133Z" + } + }, "outputs": [], "source": [ "pd.testing.assert_frame_equal(\n", @@ -967,6 +1144,7 @@ " (\"unparseable barcode\", n_unparseable),\n", " (\"low quality barcode\", n_low_quality),\n", " (\"failed chastity filter\", 0),\n", + " (\"read too short\", 0),\n", " ],\n", " columns=[\"fate\", \"count\"],\n", " ),\n", @@ -984,17 +1162,28 @@ { "cell_type": "code", "execution_count": 23, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T20:05:45.173799Z", + "iopub.status.busy": "2024-04-10T20:05:45.173196Z", + "iopub.status.idle": "2024-04-10T20:05:49.443413Z", + "shell.execute_reply": "2024-04-10T20:05:49.442570Z", + "shell.execute_reply.started": "2024-04-10T20:05:45.173754Z" + } + }, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ "
" ] }, "metadata": { - "needs_background": "light" + "image/png": { + "height": 308, + "width": 280 + } }, "output_type": "display_data" } @@ -1016,7 +1205,7 @@ " + geom_bar(stat=\"identity\")\n", " + facet_grid(\"sample ~ library\")\n", " + facet_grid(\"sample ~ library\")\n", - " + scale_fill_manual(CBPALETTE, guide=False)\n", + " + scale_fill_manual(CBPALETTE)\n", " + theme(\n", " figure_size=(\n", " 1.4 * (1 + len(fates[\"library\"].unique())),\n", @@ -1050,7 +1239,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.11.7" }, "toc": { "nav_menu": {}, diff --git a/tests/count_codonvariant_files/fates.csv b/tests/count_codonvariant_files/fates.csv index 501ead0..5b030ee 100644 --- a/tests/count_codonvariant_files/fates.csv +++ b/tests/count_codonvariant_files/fates.csv @@ -3,14 +3,17 @@ invalid barcode,1739,library-1,plasmid,run-1 low quality barcode,480,library-1,plasmid,run-1 unparseable barcode,281,library-1,plasmid,run-1 failed chastity filter,0,library-1,plasmid,run-1 +read too short,0,library-1,plasmid,run-1 valid barcode,0,library-1,plasmid,run-1 invalid barcode,1761,library-1,uninduced,run-1 low quality barcode,374,library-1,uninduced,run-1 unparseable barcode,362,library-1,uninduced,run-1 valid barcode,3,library-1,uninduced,run-1 failed chastity filter,0,library-1,uninduced,run-1 +read too short,0,library-1,uninduced,run-1 invalid barcode,1844,library-2,plasmid,run-1 low quality barcode,435,library-2,plasmid,run-1 unparseable barcode,220,library-2,plasmid,run-1 valid barcode,1,library-2,plasmid,run-1 failed chastity filter,0,library-2,plasmid,run-1 +read too short,0,library-2,plasmid,run-1 diff --git a/tests/illuminabarcodeparser_toy_example.ipynb b/tests/illuminabarcodeparser_toy_example.ipynb index f077c54..c51037d 100644 --- a/tests/illuminabarcodeparser_toy_example.ipynb +++ b/tests/illuminabarcodeparser_toy_example.ipynb @@ -15,7 +15,15 @@ { "cell_type": "code", "execution_count": 
1, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T19:51:22.648068Z", + "iopub.status.busy": "2024-04-10T19:51:22.647666Z", + "iopub.status.idle": "2024-04-10T19:51:23.690513Z", + "shell.execute_reply": "2024-04-10T19:51:23.689696Z", + "shell.execute_reply.started": "2024-04-10T19:51:22.648033Z" + } + }, "outputs": [], "source": [ "import tempfile\n", @@ -35,7 +43,15 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T19:51:23.695972Z", + "iopub.status.busy": "2024-04-10T19:51:23.695581Z", + "iopub.status.idle": "2024-04-10T19:51:23.702713Z", + "shell.execute_reply": "2024-04-10T19:51:23.701795Z", + "shell.execute_reply.started": "2024-04-10T19:51:23.695935Z" + } + }, "outputs": [], "source": [ "parser = IlluminaBarcodeParser(bclen=4, upstream=\"ACATGA\", downstream=\"GACT\")" @@ -54,7 +70,15 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T19:51:23.708338Z", + "iopub.status.busy": "2024-04-10T19:51:23.707670Z", + "iopub.status.idle": "2024-04-10T19:51:23.726018Z", + "shell.execute_reply": "2024-04-10T19:51:23.725068Z", + "shell.execute_reply.started": "2024-04-10T19:51:23.708272Z" + } + }, "outputs": [], "source": [ "r1file = tempfile.NamedTemporaryFile(mode=\"w\")\n", @@ -163,23 +187,32 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T19:51:23.729694Z", + "iopub.status.busy": "2024-04-10T19:51:23.729427Z", + "iopub.status.idle": "2024-04-10T19:51:23.744681Z", + "shell.execute_reply": "2024-04-10T19:51:23.743646Z", + "shell.execute_reply.started": "2024-04-10T19:51:23.729664Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " barcode count\n", - "0 CGTA 2\n", - "1 AGTA 1\n", + "0 AGTA 1\n", + "1 CGTA 1\n", "2 GCCG 1\n", " fate count\n", - "0 
valid barcode 4\n", - "1 unparseable barcode 3\n", + "0 unparseable barcode 3\n", + "1 valid barcode 3\n", "2 R1 / R2 disagree 1\n", "3 low quality barcode 1\n", - "4 failed chastity filter 0\n", - "5 invalid barcode 0\n" + "4 read too short 1\n", + "5 failed chastity filter 0\n", + "6 invalid barcode 0\n" ] } ], @@ -200,22 +233,31 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T19:51:23.748395Z", + "iopub.status.busy": "2024-04-10T19:51:23.748172Z", + "iopub.status.idle": "2024-04-10T19:51:23.759268Z", + "shell.execute_reply": "2024-04-10T19:51:23.758404Z", + "shell.execute_reply.started": "2024-04-10T19:51:23.748370Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " barcode count\n", - "0 CGTA 2\n", - "1 AAGT 1\n", + "0 AAGT 1\n", + "1 CGTA 1\n", "2 GCCG 1\n", " fate count\n", - "0 valid barcode 4\n", - "1 unparseable barcode 3\n", + "0 unparseable barcode 3\n", + "1 valid barcode 3\n", "2 low quality barcode 2\n", - "3 failed chastity filter 0\n", - "4 invalid barcode 0\n" + "3 read too short 1\n", + "4 failed chastity filter 0\n", + "5 invalid barcode 0\n" ] } ], @@ -235,22 +277,31 @@ { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T19:51:23.763562Z", + "iopub.status.busy": "2024-04-10T19:51:23.763160Z", + "iopub.status.idle": "2024-04-10T19:51:23.777870Z", + "shell.execute_reply": "2024-04-10T19:51:23.776948Z", + "shell.execute_reply.started": "2024-04-10T19:51:23.763528Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " barcode count library sample\n", - "0 CGTA 2 lib-1 s1\n", - "1 AAGT 1 lib-1 s1\n", + "0 AAGT 1 lib-1 s1\n", + "1 CGTA 1 lib-1 s1\n", "2 GCCG 1 lib-1 s1\n", " fate count library sample\n", - "0 valid barcode 4 lib-1 s1\n", - "1 unparseable barcode 3 lib-1 s1\n", + "0 unparseable barcode 3 lib-1 s1\n", + "1 valid 
barcode 3 lib-1 s1\n", "2 low quality barcode 2 lib-1 s1\n", - "3 failed chastity filter 0 lib-1 s1\n", - "4 invalid barcode 0 lib-1 s1\n" + "3 read too short 1 lib-1 s1\n", + "4 failed chastity filter 0 lib-1 s1\n", + "5 invalid barcode 0 lib-1 s1\n" ] } ], @@ -272,24 +323,33 @@ { "cell_type": "code", "execution_count": 7, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T19:51:23.779178Z", + "iopub.status.busy": "2024-04-10T19:51:23.778863Z", + "iopub.status.idle": "2024-04-10T19:51:23.792937Z", + "shell.execute_reply": "2024-04-10T19:51:23.791547Z", + "shell.execute_reply.started": "2024-04-10T19:51:23.779151Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " barcode count\n", - "0 CGTA 2\n", - "1 AGTA 1\n", + "0 AGTA 1\n", + "1 CGTA 1\n", "2 GCCG 1\n", "3 GGAG 1\n", " fate count\n", - "0 valid barcode 5\n", + "0 valid barcode 4\n", "1 unparseable barcode 2\n", "2 R1 / R2 disagree 1\n", "3 low quality barcode 1\n", - "4 failed chastity filter 0\n", - "5 invalid barcode 0\n" + "4 read too short 1\n", + "5 failed chastity filter 0\n", + "6 invalid barcode 0\n" ] } ], @@ -318,23 +378,32 @@ { "cell_type": "code", "execution_count": 8, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T19:51:23.795621Z", + "iopub.status.busy": "2024-04-10T19:51:23.794665Z", + "iopub.status.idle": "2024-04-10T19:51:23.811820Z", + "shell.execute_reply": "2024-04-10T19:51:23.811230Z", + "shell.execute_reply.started": "2024-04-10T19:51:23.795555Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " barcode count\n", - "0 CGTA 2\n", - "1 AGTA 1\n", + "0 AGTA 1\n", + "1 CGTA 1\n", "2 TAAT 0\n", " fate count\n", "0 unparseable barcode 3\n", - "1 valid barcode 3\n", + "1 valid barcode 2\n", "2 R1 / R2 disagree 1\n", "3 invalid barcode 1\n", "4 low quality barcode 1\n", - "5 failed chastity filter 0\n" + "5 read too short 1\n", + "6 failed chastity filter 0\n" ] 
} ], @@ -357,7 +426,15 @@ { "cell_type": "code", "execution_count": 9, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T19:51:23.813291Z", + "iopub.status.busy": "2024-04-10T19:51:23.812924Z", + "iopub.status.idle": "2024-04-10T19:51:23.816757Z", + "shell.execute_reply": "2024-04-10T19:51:23.815995Z", + "shell.execute_reply.started": "2024-04-10T19:51:23.813264Z" + } + }, "outputs": [], "source": [ "r1file.close()\n", @@ -367,7 +444,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -381,7 +458,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.7" + "version": "3.11.7" }, "toc": { "base_numbering": 1, @@ -398,5 +475,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/tests/illuminabarcodeparser_toy_example_w_upstream2.ipynb b/tests/illuminabarcodeparser_toy_example_w_upstream2.ipynb new file mode 100644 index 0000000..6af345b --- /dev/null +++ b/tests/illuminabarcodeparser_toy_example_w_upstream2.ipynb @@ -0,0 +1,393 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Toy example with `IlluminaBarcodeParser` containing an `upstream2` sequence\n", + "This example illustrates use of an [IlluminaBarcodeParser](https://jbloomlab.github.io/dms_variants/dms_variants.illuminabarcodeparser.html#dms_variants.illuminabarcodeparser.IlluminaBarcodeParser) on a toy example.\n", + "\n", + "It is written primarily as a test for that class.\n", + "\n", + "Import required modules:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T22:17:38.979735Z", + "iopub.status.busy": "2024-04-10T22:17:38.979371Z", + "iopub.status.idle": "2024-04-10T22:17:39.999532Z", + "shell.execute_reply": "2024-04-10T22:17:39.998756Z", + "shell.execute_reply.started": 
"2024-04-10T22:17:38.979703Z" + } + }, + "outputs": [], + "source": [ + "import tempfile\n", + "\n", + "from dms_variants.illuminabarcodeparser import IlluminaBarcodeParser" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Initialize an `IlluminaBarcodeParser` for a barcode arrangement that looks like this:\n", + "\n", + " 5'-[R2 binding site]-GCA-ACATGA-NNNN-[R1 binding site]-3'" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T22:17:40.003968Z", + "iopub.status.busy": "2024-04-10T22:17:40.003644Z", + "iopub.status.idle": "2024-04-10T22:17:40.011300Z", + "shell.execute_reply": "2024-04-10T22:17:40.010467Z", + "shell.execute_reply.started": "2024-04-10T22:17:40.003937Z" + } + }, + "outputs": [], + "source": [ + "parser = IlluminaBarcodeParser(bclen=4, upstream=\"ACATGA\", upstream2=\"GCA\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create temporary file holding the FASTQ reads.\n", + "We write some valid test reads and some invalid reads. \n", + "The header for each read explains why it is valid / invalid. 
\n", + "We use quality scores of ``?`` (30) or ``+`` (10) for high- and low-quality bases:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T22:17:40.015148Z", + "iopub.status.busy": "2024-04-10T22:17:40.014722Z", + "iopub.status.idle": "2024-04-10T22:17:40.022961Z", + "shell.execute_reply": "2024-04-10T22:17:40.022166Z", + "shell.execute_reply.started": "2024-04-10T22:17:40.015118Z" + } + }, + "outputs": [], + "source": [ + "r1file = tempfile.NamedTemporaryFile(mode=\"w\")\n", + "\n", + "# valid TACG barcode, full flanking regions\n", + "_ = r1file.write(\"@valid_CGTA_barcode\\n\" \"CGTATCATGTTGC\\n\" \"+\\n\" \"?????????????\\n\")\n", + "\n", + "# valid CGTA barcode, partial flanking regions\n", + "_ = r1file.write(\n", + " \"@valid_CGTA_barcode_partial_flanking_region\\n\"\n", + " \"CGTATCATTGC\\n\"\n", + " \"+\\n\"\n", + " \"???????????\\n\"\n", + ")\n", + "\n", + "# valid GCCG barcode, extended flanking regions\n", + "_ = r1file.write(\n", + " \"@valid_GCCG_barcode_extended_flanking_region\\n\"\n", + " \"GCCGTCATGTTGCCAA\\n\"\n", + " \"+\\n\"\n", + " \"????????????????\\n\"\n", + ")\n", + "\n", + "# some sites low quality\n", + "_ = r1file.write(\"@low_quality_site\\n\" \"CGTATCATGTTGC\\n\" \"+\\n\" \"???+?????????\\n\")\n", + "\n", + "# N in barcode\n", + "_ = r1file.write(\"@N_in_barcode\\n\" \"CGTNTCATGTTGC\\n\" \"+\\n\" \"?????????????\\n\")\n", + "\n", + "# GGAG barcode, one mismatch in flanking region\n", + "_ = r1file.write(\n", + " \"@GGAG_barcode_one_mismatch_in_upstream\\n\" \"GGAGTCATGATGC\\n\" \"+\\n\" \"?????????????\\n\"\n", + ")\n", + "\n", + "# GGTG barcode, mismatch in both upstream regions\n", + "_ = r1file.write(\n", + " \"@GGTG_barcode_two_mismatch_in_upstream_and_upstream2\\n\"\n", + " \"GGTGTCATGATGG\\n\"\n", + " \"+\\n\"\n", + " \"?????????????\\n\"\n", + ")\n", + "\n", + "r1file.flush()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": 
[ + "Parse the barcodes using only the R1 reads:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T22:17:40.026647Z", + "iopub.status.busy": "2024-04-10T22:17:40.026403Z", + "iopub.status.idle": "2024-04-10T22:17:40.039583Z", + "shell.execute_reply": "2024-04-10T22:17:40.038894Z", + "shell.execute_reply.started": "2024-04-10T22:17:40.026620Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " barcode count\n", + "0 CGTA 1\n", + "1 GCCG 1\n", + " fate count\n", + "0 unparseable barcode 3\n", + "1 valid barcode 2\n", + "2 low quality barcode 1\n", + "3 read too short 1\n", + "4 failed chastity filter 0\n", + "5 invalid barcode 0\n" + ] + } + ], + "source": [ + "barcodes, fates = parser.parse(r1file.name)\n", + "print(barcodes)\n", + "print(fates)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now create a parser that allows mismatch in `upstream`, and check that we recover barcode:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T22:17:40.042962Z", + "iopub.status.busy": "2024-04-10T22:17:40.042642Z", + "iopub.status.idle": "2024-04-10T22:17:40.056823Z", + "shell.execute_reply": "2024-04-10T22:17:40.056209Z", + "shell.execute_reply.started": "2024-04-10T22:17:40.042936Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " barcode count\n", + "0 CGTA 1\n", + "1 GCCG 1\n", + "2 GGAG 1\n", + " fate count\n", + "0 valid barcode 3\n", + "1 unparseable barcode 2\n", + "2 low quality barcode 1\n", + "3 read too short 1\n", + "4 failed chastity filter 0\n", + "5 invalid barcode 0\n" + ] + } + ], + "source": [ + "parser_mismatch = IlluminaBarcodeParser(\n", + " bclen=4,\n", + " upstream=\"ACATGA\",\n", + " upstream2=\"GCA\",\n", + " upstream_mismatch=1,\n", + ")\n", + "barcodes_mismatch, 
fates_mismatch = parser_mismatch.parse(r1file.name)\n", + "print(barcodes_mismatch)\n", + "print(fates_mismatch)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now classify outer flank failures differently from unparseable barcodes:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T22:17:40.058324Z", + "iopub.status.busy": "2024-04-10T22:17:40.057954Z", + "iopub.status.idle": "2024-04-10T22:17:40.069187Z", + "shell.execute_reply": "2024-04-10T22:17:40.068353Z", + "shell.execute_reply.started": "2024-04-10T22:17:40.058295Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " barcode count\n", + "0 CGTA 1\n", + "1 GCCG 1\n", + "2 GGAG 1\n", + " fate count\n", + "0 valid barcode 3\n", + "1 invalid outer flank 1\n", + "2 low quality barcode 1\n", + "3 read too short 1\n", + "4 unparseable barcode 1\n", + "5 failed chastity filter 0\n", + "6 invalid barcode 0\n" + ] + } + ], + "source": [ + "parser_mismatch = IlluminaBarcodeParser(\n", + " bclen=4,\n", + " upstream=\"ACATGA\",\n", + " upstream2=\"GCA\",\n", + " upstream_mismatch=1,\n", + ")\n", + "barcodes_mismatch, fates_mismatch = parser_mismatch.parse(\n", + " r1file.name, outer_flank_fates=True\n", + ")\n", + "print(barcodes_mismatch)\n", + "print(fates_mismatch)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now create a parser that allows mismatch in `upstream` and `upstream2`, and check that we recover barcode:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T22:17:40.070486Z", + "iopub.status.busy": "2024-04-10T22:17:40.070109Z", + "iopub.status.idle": "2024-04-10T22:17:40.082614Z", + "shell.execute_reply": "2024-04-10T22:17:40.081961Z", + "shell.execute_reply.started": "2024-04-10T22:17:40.070458Z" + } + }, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + " barcode count\n", + "0 CGTA 1\n", + "1 GCCG 1\n", + "2 GGAG 1\n", + "3 GGTG 1\n", + " fate count\n", + "0 valid barcode 4\n", + "1 low quality barcode 1\n", + "2 read too short 1\n", + "3 unparseable barcode 1\n", + "4 failed chastity filter 0\n", + "5 invalid barcode 0\n" + ] + } + ], + "source": [ + "parser_mismatch = IlluminaBarcodeParser(\n", + " bclen=4,\n", + " upstream=\"ACATGA\",\n", + " upstream2=\"GCA\",\n", + " upstream_mismatch=1,\n", + " upstream2_mismatch=1,\n", + ")\n", + "barcodes_mismatch, fates_mismatch = parser_mismatch.parse(r1file.name)\n", + "print(barcodes_mismatch)\n", + "print(fates_mismatch)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Close the temporary file:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2024-04-10T22:17:40.083879Z", + "iopub.status.busy": "2024-04-10T22:17:40.083467Z", + "iopub.status.idle": "2024-04-10T22:17:40.087525Z", + "shell.execute_reply": "2024-04-10T22:17:40.086863Z", + "shell.execute_reply.started": "2024-04-10T22:17:40.083850Z" + } + }, + "outputs": [], + "source": [ + "r1file.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + 
"toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tests/monotonicsplineepistasisbottlenecklikelihood_model.ipynb b/tests/monotonicsplineepistasisbottlenecklikelihood_model.ipynb index eb6b40d..1150b2c 100644 --- a/tests/monotonicsplineepistasisbottlenecklikelihood_model.ipynb +++ b/tests/monotonicsplineepistasisbottlenecklikelihood_model.ipynb @@ -490,7 +490,7 @@ " axis_text_x=element_text(angle=90),\n", " panel_grid_major_x=element_blank(), # no vertical grid lines\n", " )\n", - " + scale_fill_manual(values=CBPALETTE[1:], guide=False)\n", + " + scale_fill_manual(values=CBPALETTE[1:])\n", ")\n", "_ = p.draw()" ] diff --git a/tests/monotonicsplineepistasiscauchylikelihood_model.ipynb b/tests/monotonicsplineepistasiscauchylikelihood_model.ipynb index 5b962c2..53b625d 100644 --- a/tests/monotonicsplineepistasiscauchylikelihood_model.ipynb +++ b/tests/monotonicsplineepistasiscauchylikelihood_model.ipynb @@ -745,7 +745,7 @@ " axis_text_x=element_text(angle=90),\n", " panel_grid_major_x=element_blank(), # no vertical grid lines\n", " )\n", - " + scale_fill_manual(values=CBPALETTE[1:], guide=False)\n", + " + scale_fill_manual(values=CBPALETTE[1:])\n", ")\n", "_ = p.draw()" ] diff --git a/tests/monotonicsplineepistasisgaussianlikelihood_model.ipynb b/tests/monotonicsplineepistasisgaussianlikelihood_model.ipynb index e9206c9..ddf217b 100644 --- a/tests/monotonicsplineepistasisgaussianlikelihood_model.ipynb +++ b/tests/monotonicsplineepistasisgaussianlikelihood_model.ipynb @@ -605,7 +605,7 @@ " axis_text_x=element_text(angle=90),\n", " panel_grid_major_x=element_blank(), # no vertical grid lines\n", " )\n", - " + scale_fill_manual(values=CBPALETTE[1:], guide=False)\n", + " + scale_fill_manual(values=CBPALETTE[1:])\n", ")\n", "_ = p.draw()" ] diff --git a/tests/noepistasisbottlenecklikelihood_model.ipynb b/tests/noepistasisbottlenecklikelihood_model.ipynb index 01ab9ee..35c0108 100644 --- 
a/tests/noepistasisbottlenecklikelihood_model.ipynb +++ b/tests/noepistasisbottlenecklikelihood_model.ipynb @@ -403,7 +403,7 @@ " axis_text_x=element_text(angle=90),\n", " panel_grid_major_x=element_blank(), # no vertical grid lines\n", " )\n", - " + scale_fill_manual(values=CBPALETTE[1:], guide=False)\n", + " + scale_fill_manual(values=CBPALETTE[1:])\n", ")\n", "_ = p.draw()" ] diff --git a/tests/noepistasisgaussianlikelihood_model.ipynb b/tests/noepistasisgaussianlikelihood_model.ipynb index 402e350..02307d3 100644 --- a/tests/noepistasisgaussianlikelihood_model.ipynb +++ b/tests/noepistasisgaussianlikelihood_model.ipynb @@ -705,7 +705,7 @@ " axis_text_x=element_text(angle=90),\n", " panel_grid_major_x=element_blank(), # no vertical grid lines\n", " )\n", - " + scale_fill_manual(values=CBPALETTE[1:], guide=False)\n", + " + scale_fill_manual(values=CBPALETTE[1:])\n", ")\n", "_ = p.draw()" ] diff --git a/tests/test_count_codonvariants.py b/tests/test_count_codonvariants.py index 7d3defc..ba165e2 100644 --- a/tests/test_count_codonvariants.py +++ b/tests/test_count_codonvariants.py @@ -155,6 +155,7 @@ def test_count_codonvariants(self): ) fatesfile = os.path.join(indir, "fates.csv") + print(fates) assert_frame_equal(fates, pd.read_csv(fatesfile)) libs_to_analyze = ["library-1"]