From 1baeb88c6fff2ae5157bfe1b1ca055d68fdcfa3a Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Tue, 19 Mar 2024 11:14:45 +0000 Subject: [PATCH] add BGC --- dependencies/py-environment.yml | 1 - .../Atlanteco Interactive Sample Map.ipynb | 366 ++++++++++++++---- 2 files changed, 295 insertions(+), 72 deletions(-) diff --git a/dependencies/py-environment.yml b/dependencies/py-environment.yml index 086cfc8..3f57f3a 100644 --- a/dependencies/py-environment.yml +++ b/dependencies/py-environment.yml @@ -19,5 +19,4 @@ dependencies: - pip - pip: - sourmash==4.1.2 - - rocrate==0.9.0 - gffutils==0.12 diff --git a/src/notebooks/Python Examples/Atlanteco Interactive Sample Map.ipynb b/src/notebooks/Python Examples/Atlanteco Interactive Sample Map.ipynb index 0c3ee20..d10e415 100644 --- a/src/notebooks/Python Examples/Atlanteco Interactive Sample Map.ipynb +++ b/src/notebooks/Python Examples/Atlanteco Interactive Sample Map.ipynb @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 265, "id": "78657238-8961-4fe9-ac64-02dda587c8b8", "metadata": { "is_executing": true, @@ -99,9 +99,9 @@ " \n", " type\n", " id\n", + " attributes.samples-count\n", " attributes.bioproject\n", " attributes.accession\n", - " attributes.samples-count\n", " attributes.is-private\n", " attributes.last-update\n", " attributes.secondary-accession\n", @@ -117,9 +117,9 @@ " 0\n", " studies\n", " MGYS00006613\n", + " 58\n", " PRJEB40759\n", " MGYS00006613\n", - " 58\n", " False\n", " 2024-03-01T18:29:37\n", " ERP124426\n", @@ -133,9 +133,9 @@ " 1\n", " studies\n", " MGYS00006612\n", + " 48\n", " PRJEB40763\n", " MGYS00006612\n", - " 48\n", " False\n", " 2024-03-01T18:14:18\n", " ERP124432\n", @@ -149,9 +149,9 @@ " 2\n", " studies\n", " MGYS00006611\n", + " 63\n", " PRJEB55999\n", " MGYS00006611\n", - " 63\n", " False\n", " 2024-03-01T18:01:09\n", " ERP140920\n", @@ -165,9 +165,9 @@ " 3\n", " studies\n", " MGYS00006610\n", + " 50\n", " PRJEB56005\n", " 
MGYS00006610\n", - " 50\n", " False\n", " 2024-03-01T17:44:36\n", " ERP140926\n", @@ -181,9 +181,9 @@ " 4\n", " studies\n", " MGYS00006609\n", + " 193\n", " PRJEB9737\n", " MGYS00006609\n", - " 193\n", " False\n", " 2024-03-01T17:30:57\n", " ERP010877\n", @@ -198,19 +198,19 @@ "" ], "text/plain": [ - " type id attributes.bioproject attributes.accession \\\n", - "0 studies MGYS00006613 PRJEB40759 MGYS00006613 \n", - "1 studies MGYS00006612 PRJEB40763 MGYS00006612 \n", - "2 studies MGYS00006611 PRJEB55999 MGYS00006611 \n", - "3 studies MGYS00006610 PRJEB56005 MGYS00006610 \n", - "4 studies MGYS00006609 PRJEB9737 MGYS00006609 \n", + " type id attributes.samples-count attributes.bioproject \\\n", + "0 studies MGYS00006613 58 PRJEB40759 \n", + "1 studies MGYS00006612 48 PRJEB40763 \n", + "2 studies MGYS00006611 63 PRJEB55999 \n", + "3 studies MGYS00006610 50 PRJEB56005 \n", + "4 studies MGYS00006609 193 PRJEB9737 \n", "\n", - " attributes.samples-count attributes.is-private attributes.last-update \\\n", - "0 58 False 2024-03-01T18:29:37 \n", - "1 48 False 2024-03-01T18:14:18 \n", - "2 63 False 2024-03-01T18:01:09 \n", - "3 50 False 2024-03-01T17:44:36 \n", - "4 193 False 2024-03-01T17:30:57 \n", + " attributes.accession attributes.is-private attributes.last-update \\\n", + "0 MGYS00006613 False 2024-03-01T18:29:37 \n", + "1 MGYS00006612 False 2024-03-01T18:14:18 \n", + "2 MGYS00006611 False 2024-03-01T18:01:09 \n", + "3 MGYS00006610 False 2024-03-01T17:44:36 \n", + "4 MGYS00006609 False 2024-03-01T17:30:57 \n", "\n", " attributes.secondary-accession attributes.centre-name \\\n", "0 ERP124426 Ocean Sampling Day Consortium \n", @@ -248,7 +248,7 @@ "4 [{'id': 'root:Environmental:Aquatic:Marine', '... 
" ] }, - "execution_count": 75, + "execution_count": 265, "metadata": {}, "output_type": "execute_result" } @@ -279,7 +279,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 266, "id": "bc5d0fc9-5c05-4bb1-b401-507b6e8c8877", "metadata": { "tags": [] @@ -317,7 +317,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 267, "id": "2f503c45-d3c0-45b8-a9d5-e3d3eb69ce95", "metadata": { "tags": [] @@ -421,7 +421,7 @@ "SRS580495 SRS580495 MGYS00005810 -54.5100 10.2900 #FF0000" ] }, - "execution_count": 77, + "execution_count": 267, "metadata": {}, "output_type": "execute_result" } @@ -434,7 +434,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 268, "id": "6c09ddf3-746e-4891-96a5-8f70682d530f", "metadata": { "tags": [] @@ -443,7 +443,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6ef86a64cf4c422da77e6367d252b6bb", + "model_id": "f5a7035d9747405f86b9fa98da925f94", "version_major": 2, "version_minor": 0 }, @@ -451,7 +451,7 @@ "Map(center=[0, 0], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoom_out_text'…" ] }, - "execution_count": 78, + "execution_count": 268, "metadata": {}, "output_type": "execute_result" } @@ -490,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 271, "id": "8c7d6790-1cec-4ecf-b8be-56e3e03fe669", "metadata": { "tags": [] @@ -536,10 +536,10 @@ " type\n", " id\n", " attributes.analysis-status\n", - " attributes.accession\n", " attributes.experiment-type\n", - " attributes.analysis-summary\n", " attributes.pipeline-version\n", + " attributes.accession\n", + " attributes.analysis-summary\n", " attributes.is-private\n", " attributes.last-update\n", " attributes.complete-time\n", @@ -559,10 +559,10 @@ " analysis-jobs\n", " MGYA00593142\n", " completed\n", - " MGYA00593142\n", " assembly\n", - " [{'key': 'Submitted nucleotide sequences', 'va...\n", " 5.0\n", + " MGYA00593142\n", + " 
[{'key': 'Submitted nucleotide sequences', 'va...\n", " False\n", " 2024-01-29T15:29:19.757516\n", " 2021-12-06T19:31:17\n", @@ -580,10 +580,10 @@ " analysis-jobs\n", " MGYA00589840\n", " completed\n", - " MGYA00589840\n", " assembly\n", - " [{'key': 'Submitted nucleotide sequences', 'va...\n", " 5.0\n", + " MGYA00589840\n", + " [{'key': 'Submitted nucleotide sequences', 'va...\n", " False\n", " 2024-01-29T15:29:19.757516\n", " 2021-10-22T17:25:45\n", @@ -601,10 +601,10 @@ " analysis-jobs\n", " MGYA00589562\n", " completed\n", - " MGYA00589562\n", " assembly\n", - " [{'key': 'Submitted nucleotide sequences', 'va...\n", " 5.0\n", + " MGYA00589562\n", + " [{'key': 'Submitted nucleotide sequences', 'va...\n", " False\n", " 2024-01-29T15:29:19.757516\n", " 2021-10-21T08:23:57\n", @@ -622,10 +622,10 @@ " analysis-jobs\n", " MGYA00589561\n", " completed\n", - " MGYA00589561\n", " assembly\n", - " [{'key': 'Submitted nucleotide sequences', 'va...\n", " 5.0\n", + " MGYA00589561\n", + " [{'key': 'Submitted nucleotide sequences', 'va...\n", " False\n", " 2024-01-29T15:29:19.757516\n", " 2021-10-21T08:16:10\n", @@ -643,10 +643,10 @@ " analysis-jobs\n", " MGYA00589560\n", " completed\n", - " MGYA00589560\n", " assembly\n", - " [{'key': 'Submitted nucleotide sequences', 'va...\n", " 5.0\n", + " MGYA00589560\n", + " [{'key': 'Submitted nucleotide sequences', 'va...\n", " False\n", " 2024-01-29T15:29:19.757516\n", " 2021-10-21T08:13:12\n", @@ -671,26 +671,19 @@ "0 analysis-jobs MGYA00589561 completed \n", "0 analysis-jobs MGYA00589560 completed \n", "\n", - " attributes.accession attributes.experiment-type \\\n", - "0 MGYA00593142 assembly \n", - "0 MGYA00589840 assembly \n", - "0 MGYA00589562 assembly \n", - "0 MGYA00589561 assembly \n", - "0 MGYA00589560 assembly \n", - "\n", - " attributes.analysis-summary \\\n", - "0 [{'key': 'Submitted nucleotide sequences', 'va... \n", - "0 [{'key': 'Submitted nucleotide sequences', 'va... 
\n", - "0 [{'key': 'Submitted nucleotide sequences', 'va... \n", - "0 [{'key': 'Submitted nucleotide sequences', 'va... \n", - "0 [{'key': 'Submitted nucleotide sequences', 'va... \n", + " attributes.experiment-type attributes.pipeline-version attributes.accession \\\n", + "0 assembly 5.0 MGYA00593142 \n", + "0 assembly 5.0 MGYA00589840 \n", + "0 assembly 5.0 MGYA00589562 \n", + "0 assembly 5.0 MGYA00589561 \n", + "0 assembly 5.0 MGYA00589560 \n", "\n", - " attributes.pipeline-version attributes.is-private \\\n", - "0 5.0 False \n", - "0 5.0 False \n", - "0 5.0 False \n", - "0 5.0 False \n", - "0 5.0 False \n", + " attributes.analysis-summary attributes.is-private \\\n", + "0 [{'key': 'Submitted nucleotide sequences', 'va... False \n", + "0 [{'key': 'Submitted nucleotide sequences', 'va... False \n", + "0 [{'key': 'Submitted nucleotide sequences', 'va... False \n", + "0 [{'key': 'Submitted nucleotide sequences', 'va... False \n", + "0 [{'key': 'Submitted nucleotide sequences', 'va... False \n", "\n", " attributes.last-update attributes.complete-time \\\n", "0 2024-01-29T15:29:19.757516 2021-12-06T19:31:17 \n", @@ -728,7 +721,7 @@ "0 SRS580495 samples " ] }, - "execution_count": 79, + "execution_count": 271, "metadata": {}, "output_type": "execute_result" } @@ -758,7 +751,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 272, "id": "7454b97b-2f9d-45b9-bc96-f1877cf3b58a", "metadata": { "tags": [] @@ -791,7 +784,7 @@ " m.add_points_from_xy(df2, \n", " x='lon', \n", " y='lat', \n", - " popup=[\"study_ID\", \"sample_ID\", \"assembly_ID\", \"analysis_ID\"],\n", + " popup=[\"study_ID\", \"sample_ID\", \"assembly_ID\", \"analysis_ID\", identifier],\n", " color_column=identifier, add_legend=False)\n", " return m" ] @@ -802,14 +795,14 @@ "metadata": {}, "source": [ "## GO term\n", - "This example is written for GO-term for biotin transport [GO:0015878](http://www.candidagenome.org/cgi-bin/GO/go.pl?goid=15878)\n", + "This example is written for 
GO-term for **biotin transport** [GO:0015878](http://www.candidagenome.org/cgi-bin/GO/go.pl?goid=15878)\n", "\n", "Other GO identifiers are available on the MGnify API." ] }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 273, "id": "20035a8f-7957-4d84-9099-0f930287fd69", "metadata": { "tags": [] @@ -835,7 +828,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1ad12d9989244fbf8884f3bd481fb4e8", + "model_id": "c0f03143745f4c8896ffc4f94f7262f4", "version_major": 2, "version_minor": 0 }, @@ -843,7 +836,7 @@ "Map(center=[0, 0], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoom_out_text'…" ] }, - "execution_count": 81, + "execution_count": 273, "metadata": {}, "output_type": "execute_result" } @@ -865,14 +858,14 @@ }, "source": [ "## InterPro entry\n", - "This example is written for InterPro entry [IPR001650](https://www.ebi.ac.uk/interpro/entry/InterPro/IPR001650): Helicase, C-terminal domain-like \n", + "This example is written for InterPro entry [IPR001650](https://www.ebi.ac.uk/interpro/entry/InterPro/IPR001650): **Helicase, C-terminal domain-like**\n", "\n", "Other IPS identifiers are available on the MGnify API." 
] }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 274, "id": "0ec7bbb7-0538-4d3d-b58e-efc10db32549", "metadata": { "tags": [] @@ -892,13 +885,13 @@ "processing MGYA00589557\n", "processing MGYA00589556\n", "processing MGYA00589555\n", - "Presented 0 of interpro-identifiers GO:0015878\n" + "Presented 10 of interpro-identifiers IPR001650\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6bbf06b4f71e4a838371b7c5864e29a6", + "model_id": "9fcde7f644004ebc9a1c3d8f5d8c5699", "version_major": 2, "version_minor": 0 }, @@ -906,7 +899,7 @@ "Map(center=[0, 0], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoom_out_text'…" ] }, - "execution_count": 86, + "execution_count": 274, "metadata": {}, "output_type": "execute_result" } @@ -920,10 +913,241 @@ "map_vis" ] }, + { + "cell_type": "markdown", + "id": "75614961-8aab-4aa3-870e-92c745e4332d", + "metadata": {}, + "source": [ + "## BGC (Biosynthetic Gene Clusters)\n", + "\n", + "MGnify has additional analysis of [BGCs](https://mibig.secondarymetabolites.org/) provided by [Sanntis](https://github.com/Finn-Lab/SanntiS). These annotations are saved as [RO-Crates](https://www.researchobject.org/ro-crate/) objects and linked to assembly records.\n", + "\n", + "The following example counts the number of proteins **truncated from the beginning** with nearest MiBIG class **Polyketide** and a Dice distance of at least **0.65**. 
We will use [gffutils](https://daler.github.io/gffutils/index.html) to parse the GFF file.\n" + ] + }, + { + "cell_type": "markdown", + "id": "03c226f3-df3e-4cc8-8834-80d1ca602763", + "metadata": {}, + "source": [ + "Define a function to find the GFF file in a zipped archive by URL:" + ] + }, + { + "cell_type": "code", + "execution_count": 275, + "id": "c228858d-4b5d-47a2-895a-32a55bd10173", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import requests\n", + "from zipfile import ZipFile\n", + "from io import BytesIO\n", + "\n", + "def find_gff_file(link):\n", + " # Read archive and find GFF file\n", + " \n", + " response = requests.get(link)\n", + "\n", + " # Check if the request was successful (status code 200)\n", + " if response.status_code == 200:\n", + " # Open the zip file from the content of the response\n", + " with ZipFile(BytesIO(response.content)) as zip_file:\n", + " # List all files in the zip archive\n", + " file_list = zip_file.namelist()\n", + "\n", + " # Filter files with .gff extension\n", + " gff_files = [file_name for file_name in file_list if file_name.endswith(\".gff\")]\n", + " print(f\"Found {gff_files}\")\n", + " # Read the first .gff file (you can modify this to read a specific file)\n", + " if gff_files:\n", + " first_gff_file = gff_files[0]\n", + " gff_content = zip_file.open(first_gff_file).read()\n", + " return gff_content\n", + " else:\n", + " print(\"No .gff files found in the zip archive.\")\n", + " return None\n", + " else:\n", + " print(\"Failed to fetch the zip file.\")\n", + " return None" + ] + }, + { + "cell_type": "markdown", + "id": "352dbabf-92da-470c-9b3a-cf46ee5e168f", + "metadata": {}, + "source": [ + "Define a function to get counts from GFF." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 276, + "id": "6bb16116-bbcf-4e74-a3ff-e7f328963ffc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import gffutils\n", + "\n", + "def get_count(nearest_MiBIG_class, nearest_MiBIG_diceDistance, partial_value, gff_content):\n", + " if gff_content:\n", + " try:\n", + " with tempfile.NamedTemporaryFile(delete=False) as temp_gff_file:\n", + " temp_gff_file.write(gff_content)\n", + " temp_gff_file_path = temp_gff_file.name\n", + "\n", + " # Create a GFF database using gffutils\n", + " db = gffutils.create_db(\n", + " temp_gff_file_path,\n", + " dbfn=':memory:', # Use an in-memory database\n", + " force=True, # Overwrite if the database already exists\n", + " keep_order=True, # Preserve feature order \n", + " )\n", + "\n", + " count = 0\n", + " for feature in db.all_features():\n", + " if feature[\"nearest_MiBIG_class\"][0] == nearest_MiBIG_class and \\\n", + " float(feature[\"nearest_MiBIG_diceDistance\"][0]) >= nearest_MiBIG_diceDistance and \\\n", + " feature[\"partial\"][0] == partial_value:\n", + " count += 1\n", + " print(f\"Count is {count}\")\n", + " return count\n", + " except:\n", + " print('Error in GFF DB')\n", + " return 0\n", + " else:\n", + " return 0" + ] + }, + { + "cell_type": "markdown", + "id": "47210f0e-c9b1-463f-a4e2-be97eeb41352", + "metadata": {}, + "source": [ + "Process data:" + ] + }, + { + "cell_type": "code", + "execution_count": 277, + "id": "7caa665a-036e-42a0-9d25-61d2e0e41239", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "processing MGYA00593142 ERZ2945023\n", + "Found ['ERZ2945023_FASTA.gb.sanntis.full.gff']\n", + "Count is 0\n", + "processing MGYA00589840 ERZ2945090\n", + "Found ['ERZ2945090_FASTA.gb.sanntis.full.gff']\n", + "Count is 2\n", + "processing MGYA00589562 ERZ2945101\n", + "Found ['ERZ2945101_FASTA.gb.sanntis.full.gff']\n", + "Error in GFF DB\n", + "processing MGYA00589561 
ERZ2944669\n", + "Found ['ERZ2944669_FASTA.gb.sanntis.full.gff']\n", + "Error in GFF DB\n", + "processing MGYA00589560 ERZ2944798\n", + "Found ['ERZ2944798_FASTA.gb.sanntis.full.gff']\n", + "Error in GFF DB\n", + "processing MGYA00589559 ERZ2944724\n", + "Found ['ERZ2944724_FASTA.gb.sanntis.full.gff']\n", + "Count is 0\n", + "processing MGYA00589558 ERZ2944859\n", + "Found ['ERZ2944859_FASTA.gb.sanntis.full.gff']\n", + "Count is 0\n", + "processing MGYA00589557 ERZ2944879\n", + "Found ['ERZ2944879_FASTA.gb.sanntis.full.gff']\n", + "Count is 2\n", + "processing MGYA00589556 ERZ2945017\n", + "Found ['ERZ2945017_FASTA.gb.sanntis.full.gff']\n", + "Count is 0\n", + "processing MGYA00589555 ERZ2944878\n", + "Found ['ERZ2944878_FASTA.gb.sanntis.full.gff']\n", + "Count is 2\n" + ] + } + ], + "source": [ + "nearest_MiBIG_class = \"Polyketide\" \n", + "nearest_MiBIG_diceDistance = 0.65\n", + "partial_value = \"10\"\n", + "\n", + "counts = []\n", + "\n", + "with Session(\"https://www.ebi.ac.uk/metagenomics/api/v1\") as mgnify:\n", + " for idx, mgya in analyses.iterrows():\n", + " # get ERZ assembly accession\n", + " assembly = mgya['relationships.assembly.data.id']\n", + " annotations_for_assembly = mgnify.iterate(f'assemblies/{assembly}/extra-annotations')\n", + " sanntis_annotation = None\n", + " for item in annotations_for_assembly:\n", + " if 'sanntis' in item.id:\n", + " sanntis_annotation = item.links.self\n", + " break\n", + " if not sanntis_annotation:\n", + " print('Sanntis annotation was not found')\n", + " continue\n", + " \n", + " print(f\"processing {mgya.id} {assembly}\")\n", + " \n", + " gff_content = find_gff_file(sanntis_annotation)\n", + " counts.append(get_count(nearest_MiBIG_class, nearest_MiBIG_diceDistance, partial_value, gff_content))" + ] + }, + { + "cell_type": "markdown", + "id": "2806a194-b1b9-42af-8124-c00ce66fa66b", + "metadata": { + "tags": [] + }, + "source": [ + "Display on the interactive map" + ] + }, + { + "cell_type": "code", + 
"execution_count": 283, + "id": "0ba6ed0d-8169-4ddd-a9f9-3255350b2654", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "efb2adad335943738c604236714efcf4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Map(center=[0, 0], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoom_out_text'…" + ] + }, + "execution_count": 283, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identifier = \"bgc\"\n", + "analyses.insert(2, identifier, counts, True)\n", + "map_vis = show_on_map(analyses, studies_samples, identifier)\n", + "map_vis" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "ae2b8468-58e4-4032-ac1e-dd0f655fdaa3", + "id": "7ffb9081-73ba-4adf-9279-1d32f9e6c6fe", "metadata": {}, "outputs": [], "source": []