diff --git a/.github/workflows/install.yaml b/.github/workflows/install.yaml index 786741f97..de88ecc61 100644 --- a/.github/workflows/install.yaml +++ b/.github/workflows/install.yaml @@ -22,10 +22,3 @@ jobs: run: | conda activate chemnlp python -m src.chemnlp.data_val.validate data - - name: Lint - shell: bash -l {0} - run: | - conda activate chemnlp - black --check . - isort --check-only . - flake8 . diff --git a/.gitignore b/.gitignore index 63f2ba4b5..1c6f7d5cb 100644 --- a/.gitignore +++ b/.gitignore @@ -129,4 +129,18 @@ dmypy.json .pyre/ # wandb +**/wandb/ scripts/wandb + +# scratch +scratch/ + + +# let's don't commit data unless we really want to +*.tab +*.csv + + +# vim +*~ +*.swp diff --git a/.gitmodules b/.gitmodules index 941b21b88..a184cb081 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,4 @@ [submodule "gpt-neox"] path = gpt-neox - url = git@github.com:EleutherAI/gpt-neox.git + url = git@github.com:OpenBioML/gpt-neox.git + branch = main diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b814d944a..757033acc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,8 @@ --- ci: autoupdate_schedule: quarterly + autofix_prs: false + submodules: false repos: - repo: https://github.com/pre-commit/pre-commit-hooks @@ -16,6 +18,7 @@ repos: rev: 0.2.2 hooks: - id: yamlfmt + exclude: ^experiments/configs - repo: https://github.com/psf/black rev: 22.12.0 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index db68515b4..c2cb22fb8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -13,19 +13,7 @@ Please make a [GitHub account](https://github.com/) prior to implementing a data For code and data contributions, we recommend you creata a [conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). 
If you do not have conda already installed on your system, we recommend installing [miniconda](https://docs.conda.io/en/latest/miniconda.html): -```bash -conda env create -f conda.yaml # Creates a conda env -conda activate chemnlp # Activate your conda environment -``` - -Then, please run - -```bash -pre-commit install -``` - -to install the [pre-commit hooks](https://pre-commit.com/). These will automatically format and lint your code upon every commit. -There might be some warnings, e.g., by `flake8`. If you struggle with them, do not hestiate to contact us. +To create your developer environment please follow the guidelines in the `Installation and set-up` of [README.md](README.md) # Implementing a dataset @@ -37,11 +25,17 @@ With "implementing" we mean the following: - Make an issue in this repository that you want to add this dataset (we will label this issue and assign it to you) - Make a PR that adds in a new folder in `data` - `meta.yaml` describing the dataset in the form that `transform.py` produces. We will use this later to construct the prompts. + > If your dataset has multiple natural splits (i.e. train, test, validation) you can create a \_meta.yaml for each. - `transform.py` Python code that transforms the original dataset (linked in `meta.yaml`) into a form that can be consumed by the loader. For tabular datasets that will mostly involve: Removing/merging duplicated entries, renaming columns, dropping unused columns. Try to keep the output your `transform.py` uses as lean as possible (i.e. no columns that will not be used). In some cases, you might envision that extra columns might be useful. If this is the case, please add them (e.g., indicating some grouping, etc.) Even though some examples create the `meta.yaml` in `transform.py` there is no need to do so. You can also do it by hand. + + + In the `transform.py` please try to download the data from an official resource. 
+ We encourage you to upload the raw data to HuggingFace Hub, Foundry or some other repository and then retrieve the data from there with your script, if the raw data license permits it. + - If you need additional dependencies, add them to `dev-requirements.txt` (those are needed for linting/testing/validation) or `requirements.txt` (those are the ones for running `transform.py`) @@ -57,7 +51,7 @@ targets: - id: Solubility # name of the column in a tabular dataset description: Experimental aqueous solubility value (LogS) # description of what this column means units: log(mol/L) # units of the values in this column (leave empty if unitless) - type: continuous # can be "categorical", "ordinal", "continuous" + type: continuous # can be "categorical", "ordinal", "continuous", "boolean" names: # names for the property (to sample from for building the prompts) - solubility - water solubility @@ -69,9 +63,13 @@ targets: - solubility - water solubility - solubility in water +benchmarks: # lists all benchmarks this dataset has been part of. split_column is a column in this dataframe with the value "train", "valid", "test" - indicating which fold a specific entry belongs to + - name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: - id: InChI # column name - type: InChI # can be "SMILES", "SELFIES", "IUPAC", "OTHER" + type: InChI # can be "SMILES", "SELFIES", "IUPAC", "Other", "InChI", "InChIKey", "RXNSMILES", "RXNSMILESWAdd"; see IdentifierEnum description: International Chemical Identifier # description (optional, except for "OTHER") license: CC0 1.0 # license under which the original dataset was published num_points: 10000 # number of datapoints in this dataset @@ -94,6 +92,7 @@ bibtex: # citation(s) for this dataset in BibTeX format journal = {Sci Data} }" ``` +Please do not simply copy/paste generic descriptions but try to give a concise and specific description for the dataset you are adding. 
For the typical material-property datasets, we will later use the `identifier` and `property` columns to create and fill prompt templates. In case your dataset isn't a simple tabular dataset with chemical compounds and properties, please also add the following additional fields for the templates: @@ -141,6 +140,68 @@ In this case, we will sample from the identifier and targets columns. If you spe Therefore, it is very important that the column names in the `meta.yaml` match the ones in the file that `transform.py` produces. One example of a prompt we might construct is `"What is the <target_name> of <identifier>"`, where we sample `target_name` from the names of the targets listed in `meta.yaml` and `identifier` from the identifiers provided in `meta.yaml`. +#### Splits + +If your dataset is part of a benchmark, please indicate which fold each entry belongs to using an additional split column in which you use `train`, `valid`, or `test` to indicate the split type. +Please give the name of this column in the `meta.yaml` under the field `split_column` of the corresponding `benchmarks` entry. + +#### Identifiers + +We ask you to add `uris` and `pubchem_aids` in case you find suitable references. We distinguish certain types of identifiers, for which you have to specify the correct strings. 
The currently allowed types are in the `IdentifierEnum` in `src/chemnlp/data_val/model.py`: + +- `SMILES`: Use the canonical form ([RDKit](https://www.rdkit.org/docs/GettingStartedInPython.html)) +- `SELFIES`: [Self-referencing embedded strings](https://github.com/aspuru-guzik-group/selfies) +- `IUPAC`: IUPAC name; do not use it for non-standard, common names +- `InChI` +- `InChIKey`: The key derived from the `InChI` +- `RXNSMILES`: The [reaction SMILES](https://www.daylight.com/meetings/summerschool98/course/dave/smiles-react.html) containing only educt and product +- `RXNSMILESWAdd`: The reaction SMILES also containing solvent and additives +- `Other`: For all other identifiers + +##### Uniform Resource Identifiers (URIs) + +If you have a uniform resource identifier (URI) that links to a suitable name of a property, please list it in the `uris` list for a given `target`. +Please ensure that the link is specific. If you have a boolean target that measures inhibition of a protein, link to `inhibitor of XY` and _not_ to the protein. +If such a link does not exist, leave the field empty. + +You might find suitable links using the following resources: + +- https://bioportal.bioontology.org/search +- https://goldbook.iupac.org/ + + +#### PubChem Assay IDs + +For some targets, the activity was measured using assays. In this case, please list the assays using their _numeric_ PubChem assay id in the field `pubchem_aids`. +Please ensure that the _first_ entry in this list is a primary scan which corresponds to the target property (and not to its inverse or a control). +Keep in mind that we plan to look up the name and the description of the assay to build prompts. That is, the name of the assay of the _first entry_ in this list should also work in a prompt such as `Is <identifier> active in <pubchem assay name>?` + +#### Prompt examples + +##### Boolean variables + +- `Is <identifier> <name>?` +- ``` + What molecules in the list are <name>? 
+ + - <identifier_1> + - <identifier_2> + - <identifier_3> + ``` + + +##### Continuous variables + +- `What is <name> of <identifier>?` +- ``` + What is the molecule with the largest <name> in the following list? + + - <identifier_1> + - <identifier_2> + - <identifier_3> + ``` + + For datasets that are not in tabular form, we are still discussing the best process, but we also envision that we might perform some named-entity-recognition to also use some of the text datasets in a framework such as LIFT. Otherwise, we will simple use them in the typical GPT pretraining task. @@ -162,3 +223,16 @@ Our first experiments will be based on [Pythia model](https://github.com/Eleuthe If you are not familiar LLM training have a look at this very good guide: [Large-scale language modeling tutorials with PyTorch from TUNiB](https://nbviewer-org.translate.goog/github/tunib-ai/large-scale-lm-tutorials/blob/main/notebooks/01_introduction.ipynb?_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=de&_x_tr_pto=wapp) Please have a look for the details in the [corresponding section in our proposal](https://docs.google.com/document/d/1C44EKSJRojm39P2CaxnEq-0FGwDRaknKxJ8lZI6xr5M/edit#heading=h.aww08l8o9tti). + +## Hugging Face Hub + +We have a preference for using the Hugging Face Hub and processing datasets through the [`datasets`](https://github.com/huggingface/datasets) package when storing larger datasets on the [OpenBioML](https://huggingface.co/OpenBioML) hub as it can offer us a lot of nice features such as + +- Easy multiprocessing parallelism for data cleaning +- Version controlling of the datasets as well as our code +- Easy interface into tokenisation & other aspects for model training +- Reuse of utility functions once we have a consistent data structure. + +However, don't feel pressured to use this if you're more comfortable contributing an external dataset in another format. We are primarily thinking of using this functionality for processed, combined datasets which are ready for training. 
+ +Feel free to reach out to one of the team members and read [this guide](https://huggingface.co/docs/datasets/upload_dataset#share-a-dataset-to-the-hub) for more information. diff --git a/README.md b/README.md index 68e420515..d59afeca8 100644 --- a/README.md +++ b/README.md @@ -1,46 +1,52 @@ [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg)](code_of_conduct.md) # ChemNLP project 🧪🚀 + The ChemNLP project aims to -1) create an extensive chemistry dataset and -1) use it to train large language models (LLMs) that can leverage the data for a wide range of chemistry applications. -For more details see our [information material section below](#information-material). +1. create an extensive chemistry dataset and +1. use it to train large language models (LLMs) that can leverage the data for a wide range of chemistry applications. +For more details see our [information material section below](#information-material). # Information material -* [Introduction presentation](https://docs.google.com/presentation/d/1JkAKJveYsNGtAWoaksU8ykTdrC0aX3FshiFJ13SU6o8/edit?usp=sharing) -* [Project proposal](https://docs.google.com/document/d/1C44EKSJRojm39P2CaxnEq-0FGwDRaknKxJ8lZI6xr5M/edit?usp=sharing) -* [Task board](https://github.com/orgs/OpenBioML/projects/5/views/1) -* [awesome-chemistry-datasets repository](https://github.com/kjappelbaum/awesome-chemistry-datasets) to collect interesting chemistry datasets -* Weekly meetings are set up soon! Please join our [Discord community](#community) for more information. 
+- [Introduction presentation](https://docs.google.com/presentation/d/1JkAKJveYsNGtAWoaksU8ykTdrC0aX3FshiFJ13SU6o8/edit?usp=sharing) +- [Project proposal](https://docs.google.com/document/d/1C44EKSJRojm39P2CaxnEq-0FGwDRaknKxJ8lZI6xr5M/edit?usp=sharing) +- [Task board](https://github.com/orgs/OpenBioML/projects/5/views/1) +- [awesome-chemistry-datasets repository](https://github.com/kjappelbaum/awesome-chemistry-datasets) to collect interesting chemistry datasets +- Weekly meetings are set up soon! Please join our [Discord community](#community) for more information. # Community -Feel free to join our `#chemnlp` channel on our [OpenBioML discord server](https://discord.com/invite/GgDBFP8ZEt) to start the discussion in more detail. +Feel free to join our `#chemnlp` channel on our [OpenBioML discord server](https://discord.com/invite/GgDBFP8ZEt) to start the discussion in more detail. # Contributing + ChemNLP is an open-source project - your involvement is warmly welcome! If you're excited to join us, we recommend the following steps: -* Join our [Discord server](#community). -* Have a look at our [contributing guide](https://github.com/OpenBioML/chemnlp/blob/main/CONTRIBUTING.md). -* Looking for ideas? See our [task board](https://github.com/orgs/OpenBioML/projects/5/views/1) to see what we may need help with. -* Have an idea? Create an [issue](https://github.com/OpenBioML/chemnlp/issues)! +- Join our [Discord server](#community). +- Have a look at our [contributing guide](https://github.com/OpenBioML/chemnlp/blob/main/CONTRIBUTING.md). +- Looking for ideas? See our [task board](https://github.com/orgs/OpenBioML/projects/5/views/1) to see what we may need help with. +- Have an idea? Create an [issue](https://github.com/OpenBioML/chemnlp/issues)! # Note on the "ChemNLP" name -Our OpenBioML ChemNLP project is not afiliated to the [ChemNLP library from NIST](https://arxiv.org/abs/2209.08203) and we use "ChemNLP" as a general term to highlight our project focus. 
The datasets and models we create through our project will have a unique and recognizable name when we release them. +Our OpenBioML ChemNLP project is not affiliated with the [ChemNLP library from NIST](https://arxiv.org/abs/2209.08203) and we use "ChemNLP" as a general term to highlight our project focus. The datasets and models we create through our project will have a unique and recognizable name when we release them. # About OpenBioML.org + See https://openbioml.org, especially [our approach and partners](https://openbioml.org/approach-and-partners.html). # Installation and set-up + Create a new conda environment with Python 3.8: + ``` conda create -n chemnlp python=3.8 conda activate chemnlp ``` + To install the `chemnlp` package (and required dependencies): ``` @@ -48,12 +54,22 @@ pip install chemnlp ``` If working on developing the python package: + ``` pip install -e "chemnlp[dev]" # to install development dependencies ``` If extra dependencies are required (e.g. for dataset creation) but are not needed for the main package please add to the `pyproject.toml` in the `dataset_creation` variable and ensure this is reflected in the `conda.yml` file. +Then, please run + +```bash +pre-commit install +``` + +to install the [pre-commit hooks](https://pre-commit.com/). These will automatically format and lint your code upon every commit. +There might be some warnings, e.g., from `flake8`. If you struggle with them, do not hesitate to contact us. + **Note** If working on model training, request access to the `wandb` project `chemnlp` @@ -62,3 +78,30 @@ and log-in to `wandb` with your API key per [here](https://docs.wandb.ai/quickst ### Adding a new dataset (to the model training pipline) We specify datasets by creating a new function [here](src/chemnlp/data/hf_datasets.py) which is named per the dataset on Hugging Face. At present the function must accept a tokenizer and return back the tokenized train and validation datasets. 
+ +### Installing submodules + +In order to ensure you also clone and install the required submodules (i.e. gpt-neox) you will have to do one of the following: + +- Recursively clone the submodule from GitHub + + ``` + # using ssh (if you have your ssh key on GitHub) + git clone --recurse-submodules --remote-submodules git@github.com:OpenBioML/chemnlp.git + + # using https (if you use a personal access token) + git clone --recurse-submodules --remote-submodules https://github.com/OpenBioML/chemnlp.git + ``` + + > This will automatically initialize and update each submodule in the repository, including nested submodules if any of the submodules in the repository have submodules themselves. + +- Initialise and install the submodule after cloning + + ``` + git submodule init # registers submodule + git submodule update # clones and updates submodule + ``` + +### Experiments + +Follow the guidelines [here](experiments/README.md) for more information about running experiments on the Stability AI cluster. diff --git a/data/Butkiewicz/cav3_t-type_calcium_channels_butkiewicz/meta.yaml b/data/Butkiewicz/cav3_t-type_calcium_channels_butkiewicz/meta.yaml deleted file mode 100644 index 6a35c7aca..000000000 --- a/data/Butkiewicz/cav3_t-type_calcium_channels_butkiewicz/meta.yaml +++ /dev/null @@ -1,72 +0,0 @@ ---- -name: cav3_t-type_calcium_channels_butkiewicz -description: 'These are nine high-quality high-throughput screening (HTS) datasets from [1]. These datasets were curated from HTS data at the PubChem database - [2]. Typically, HTS categorizes small molecules into hit, inactive, or unspecified against a certain therapeutic target. However, a compound may be - falsely classified as a hit due to experimental artifacts such as optical interference. 
Moreover, because the screening is performed without duplicates, - and the cutoff is often set loose to minimize the false negative rates, the results from the primary screens often contain high false positive rates - [3]. Hence the result from the primary screen is only used as the first iteration to reduce the compound library to a smaller set of further confirmatory - tests. Here each dataset is carefully collated through confirmation screens to validate active compounds. The curation process is documented in [1]. - Each dataset is identified by the PubChem Assay ID (AID). Features of the datasets: (1) At least 150 confirmed active compounds present; (2) Diverse - target classes; (3) Realistic (large number and highly imbalanced label).' -targets: - - id: activity_cav3_t_type_calcium_channels - description: whether it active against cav3 t-type calcium channels receptor (1) or not (0). - units: activity - type: categorical - names: - - cav3 t-type calcium channels activity - - cav3 t-type calcium channels Inhibitor - - activity against cav3 t-type calcium channels - - cav3 t-type calcium channels receptor -identifiers: - - id: SMILES - type: SMILES - description: SMILES -license: CC BY 4.0 -links: - - url: https://doi.org/10.3390/molecules18010735 - description: corresponding publication - - url: https://doi.org/10.1093/nar/gky1033 - description: corresponding publication - - url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/ - description: corresponding publication -num_points: 100875 -url: https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al -bibtex: - - |- - @article{Butkiewicz2013, - doi = {10.3390/molecules18010735}, - url = {https://doi.org/10.3390/molecules18010735}, - year = {2013}, - month = jan, - publisher = {{MDPI} {AG}}, - volume = {18}, - number = {1}, - pages = {735--756}, - author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens Meiler}, - title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, - journal = {Molecules}} - - |- - @article{Kim2018, - doi = {10.1093/nar/gky1033}, - url = {https://doi.org/10.1093/nar/gky1033}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D1102--D1109}, - author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, - title = {{PubChem} 2019 update: improved access to chemical data}, - journal = {Nucleic Acids Research}} - - |- - @article{Butkiewicz2017, - doi = {}, - url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, - year = {2017}, - publisher = {Chem Inform}, - volume = {3}, - number = {1}, - author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, - title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, - journal = {Chemical Science}} diff --git a/data/Butkiewicz/cav3_t-type_calcium_channels_butkiewicz/transform.py b/data/Butkiewicz/cav3_t-type_calcium_channels_butkiewicz/transform.py deleted file mode 100644 index ef5f6c806..000000000 --- a/data/Butkiewicz/cav3_t-type_calcium_channels_butkiewicz/transform.py +++ /dev/null @@ -1,146 +0,0 @@ -import pandas as pd -import yaml -from tdc.single_pred import HTS - - -def get_and_transform_data(): - # get raw data - label = "cav3_t-type_calcium_channels_butkiewicz" - data = HTS(name=label) - fn_data_original = "data_original.csv" - data.get_data().to_csv(fn_data_original, index=False) - - # create dataframe - df = pd.read_csv( - fn_data_original, - delimiter=",", - ) # not necessary but ensure we can load the saved data - - # check if fields are the same - fields_orig = df.columns.tolist() - assert fields_orig == [ - "Drug_ID", - "Drug", - "Y", - ] - - # overwrite column names = fields - fields_clean = [ - "compound_id", - "SMILES", - "activity_cav3_t_type_calcium_channels", - ] - df.columns = fields_clean - - # # data cleaning - # df.compound_id = ( - # df.compound_id.str.strip() - # ) # remove leading and trailing white space characters - - assert not df.duplicated().sum() - - # save to csv - fn_data_csv = "data_clean.csv" - df.to_csv(fn_data_csv, index=False) - - # create meta yaml - meta = { - "name": "cav3_t-type_calcium_channels_butkiewicz", # unique identifier, we will also use this for directory names - "description": """These are nine high-quality high-throughput screening (HTS) datasets from [1]. These datasets were curated from HTS data at the PubChem database [2]. Typically, HTS categorizes small molecules into hit, inactive, or unspecified against a certain therapeutic target. However, a compound may be falsely classified as a hit due to experimental artifacts such as optical interference. 
Moreover, because the screening is performed without duplicates, and the cutoff is often set loose to minimize the false negative rates, the results from the primary screens often contain high false positive rates [3]. Hence the result from the primary screen is only used as the first iteration to reduce the compound library to a smaller set of further confirmatory tests. Here each dataset is carefully collated through confirmation screens to validate active compounds. The curation process is documented in [1]. Each dataset is identified by the PubChem Assay ID (AID). Features of the datasets: (1) At least 150 confirmed active compounds present; (2) Diverse target classes; (3) Realistic (large number and highly imbalanced label).""", - "targets": [ - { - "id": "activity_cav3_t_type_calcium_channels", # name of the column in a tabular dataset - "description": "whether it active against cav3 t-type calcium channels receptor (1) or not (0).", # description of what this column means - "units": "activity", # units of the values in this column (leave empty if unitless) - "type": "categorical", # can be "categorical", "ordinal", "continuous" - "names": [ # names for the property (to sample from for building the prompts) - "cav3 t-type calcium channels activity", - "cav3 t-type calcium channels Inhibitor", - "activity against cav3 t-type calcium channels", - "cav3 t-type calcium channels receptor", - ], - }, - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "SMILES", # description (optional, except for "Other") - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": "https://doi.org/10.3390/molecules18010735", - "description": "corresponding publication", - }, - { - "url": "https://doi.org/10.1093/nar/gky1033", - "description": "corresponding publication", - }, - { - "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/", - "description": "corresponding publication", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "url": "https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al", - "bibtex": [ - """@article{Butkiewicz2013, - doi = {10.3390/molecules18010735}, - url = {https://doi.org/10.3390/molecules18010735}, - year = {2013}, - month = jan, - publisher = {{MDPI} {AG}}, - volume = {18}, - number = {1}, - pages = {735--756}, - author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall and Pedro Teixeira and C. Weaver and Jens Meiler}, - title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, - journal = {Molecules}}""", - """@article{Kim2018, - doi = {10.1093/nar/gky1033}, - url = {https://doi.org/10.1093/nar/gky1033}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D1102--D1109}, - author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, - title = {{PubChem} 2019 update: improved access to chemical data}, - journal = {Nucleic Acids Research}}""", - """@article{Butkiewicz2017, - doi = {}, - url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, - year = {2017}, - publisher = {Chem Inform}, - volume = {3}, - number = {1}, - author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, - title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, - journal = {Chemical Science}}""", - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - - -if __name__ == "__main__": - get_and_transform_data() diff --git a/data/Butkiewicz/choline_transporter_butkiewicz/meta.yaml b/data/Butkiewicz/choline_transporter_butkiewicz/meta.yaml deleted file mode 100644 index 4b95cb28b..000000000 --- a/data/Butkiewicz/choline_transporter_butkiewicz/meta.yaml +++ /dev/null @@ -1,72 +0,0 @@ ---- -name: choline_transporter_butkiewicz -description: 'These are nine high-quality high-throughput screening (HTS) datasets from [1]. These datasets were curated from HTS data at the PubChem database - [2]. Typically, HTS categorizes small molecules into hit, inactive, or unspecified against a certain therapeutic target. However, a compound may be - falsely classified as a hit due to experimental artifacts such as optical interference. Moreover, because the screening is performed without duplicates, - and the cutoff is often set loose to minimize the false negative rates, the results from the primary screens often contain high false positive rates - [3]. 
Hence the result from the primary screen is only used as the first iteration to reduce the compound library to a smaller set of further confirmatory - tests. Here each dataset is carefully collated through confirmation screens to validate active compounds. The curation process is documented in [1]. - Each dataset is identified by the PubChem Assay ID (AID). Features of the datasets: (1) At least 150 confirmed active compounds present; (2) Diverse - target classes; (3) Realistic (large number and highly imbalanced label).' -targets: - - id: activity_choline_transporter - description: whether it active against choline transporter receptor (1) or not (0). - units: activity - type: categorical - names: - - choline transporter activity - - choline transporter Inhibitor - - activity against choline transporter - - choline transporter receptor -identifiers: - - id: SMILES - type: SMILES - description: SMILES -license: CC BY 4.0 -links: - - url: https://doi.org/10.3390/molecules18010735 - description: corresponding publication - - url: https://doi.org/10.1093/nar/gky1033 - description: corresponding publication - - url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/ - description: corresponding publication -num_points: 302306 -url: https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al -bibtex: - - |- - @article{Butkiewicz2013, - doi = {10.3390/molecules18010735}, - url = {https://doi.org/10.3390/molecules18010735}, - year = {2013}, - month = jan, - publisher = {{MDPI} {AG}}, - volume = {18}, - number = {1}, - pages = {735--756}, - author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens Meiler}, - title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, - journal = {Molecules}} - - |- - @article{Kim2018, - doi = {10.1093/nar/gky1033}, - url = {https://doi.org/10.1093/nar/gky1033}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D1102--D1109}, - author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, - title = {{PubChem} 2019 update: improved access to chemical data}, - journal = {Nucleic Acids Research}} - - |- - @article{Butkiewicz2017, - doi = {}, - url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, - year = {2017}, - publisher = {Chem Inform}, - volume = {3}, - number = {1}, - author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, - title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, - journal = {Chemical Science}} diff --git a/data/Butkiewicz/choline_transporter_butkiewicz/transform.py b/data/Butkiewicz/choline_transporter_butkiewicz/transform.py deleted file mode 100644 index 9ab4c9426..000000000 --- a/data/Butkiewicz/choline_transporter_butkiewicz/transform.py +++ /dev/null @@ -1,146 +0,0 @@ -import pandas as pd -import yaml -from tdc.single_pred import HTS - - -def get_and_transform_data(): - # get raw data - label = "choline_transporter_butkiewicz" - data = HTS(name=label) - fn_data_original = "data_original.csv" - data.get_data().to_csv(fn_data_original, index=False) - - # create dataframe - df = pd.read_csv( - fn_data_original, - delimiter=",", - ) # not necessary but ensure we can load the saved data - - # check if fields are the same - fields_orig = df.columns.tolist() - assert fields_orig == [ - "Drug_ID", - "Drug", - "Y", - ] - - # overwrite column names = fields - fields_clean = [ - "compound_id", - "SMILES", - "activity_choline_transporter", - ] - df.columns = fields_clean - - # # data cleaning - # df.compound_id = ( - # df.compound_id.str.strip() - # ) # remove leading and trailing white space characters - - assert not df.duplicated().sum() - - # save to csv - fn_data_csv = "data_clean.csv" - df.to_csv(fn_data_csv, index=False) - - # create meta yaml - meta = { - "name": "choline_transporter_butkiewicz", # unique identifier, we will also use this for directory names - "description": """These are nine high-quality high-throughput screening (HTS) datasets from [1]. These datasets were curated from HTS data at the PubChem database [2]. Typically, HTS categorizes small molecules into hit, inactive, or unspecified against a certain therapeutic target. However, a compound may be falsely classified as a hit due to experimental artifacts such as optical interference. 
Moreover, because the screening is performed without duplicates, and the cutoff is often set loose to minimize the false negative rates, the results from the primary screens often contain high false positive rates [3]. Hence the result from the primary screen is only used as the first iteration to reduce the compound library to a smaller set of further confirmatory tests. Here each dataset is carefully collated through confirmation screens to validate active compounds. The curation process is documented in [1]. Each dataset is identified by the PubChem Assay ID (AID). Features of the datasets: (1) At least 150 confirmed active compounds present; (2) Diverse target classes; (3) Realistic (large number and highly imbalanced label).""", - "targets": [ - { - "id": "activity_choline_transporter", # name of the column in a tabular dataset - "description": "whether it active against choline transporter receptor (1) or not (0).", # description of what this column means - "units": "activity", # units of the values in this column (leave empty if unitless) - "type": "categorical", # can be "categorical", "ordinal", "continuous" - "names": [ # names for the property (to sample from for building the prompts) - "choline transporter activity", - "choline transporter Inhibitor", - "activity against choline transporter", - "choline transporter receptor", - ], - }, - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "SMILES", # description (optional, except for "Other") - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": "https://doi.org/10.3390/molecules18010735", - "description": "corresponding publication", - }, - { - "url": "https://doi.org/10.1093/nar/gky1033", - "description": "corresponding publication", - }, - { - "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/", - "description": "corresponding publication", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "url": "https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al", - "bibtex": [ - """@article{Butkiewicz2013, - doi = {10.3390/molecules18010735}, - url = {https://doi.org/10.3390/molecules18010735}, - year = {2013}, - month = jan, - publisher = {{MDPI} {AG}}, - volume = {18}, - number = {1}, - pages = {735--756}, - author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall and Pedro Teixeira and C. Weaver and Jens Meiler}, - title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, - journal = {Molecules}}""", - """@article{Kim2018, - doi = {10.1093/nar/gky1033}, - url = {https://doi.org/10.1093/nar/gky1033}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D1102--D1109}, - author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, - title = {{PubChem} 2019 update: improved access to chemical data}, - journal = {Nucleic Acids Research}}""", - """@article{Butkiewicz2017, - doi = {}, - url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, - year = {2017}, - publisher = {Chem Inform}, - volume = {3}, - number = {1}, - author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, - title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, - journal = {Chemical Science}}""", - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - - -if __name__ == "__main__": - get_and_transform_data() diff --git a/data/Butkiewicz/kcnq2_potassium_channel_butkiewicz/meta.yaml b/data/Butkiewicz/kcnq2_potassium_channel_butkiewicz/meta.yaml deleted file mode 100644 index 495f973d1..000000000 --- a/data/Butkiewicz/kcnq2_potassium_channel_butkiewicz/meta.yaml +++ /dev/null @@ -1,72 +0,0 @@ ---- -name: kcnq2_potassium_channel_butkiewicz -description: 'These are nine high-quality high-throughput screening (HTS) datasets from [1]. These datasets were curated from HTS data at the PubChem database - [2]. Typically, HTS categorizes small molecules into hit, inactive, or unspecified against a certain therapeutic target. However, a compound may be - falsely classified as a hit due to experimental artifacts such as optical interference. Moreover, because the screening is performed without duplicates, - and the cutoff is often set loose to minimize the false negative rates, the results from the primary screens often contain high false positive rates - [3]. 
Hence the result from the primary screen is only used as the first iteration to reduce the compound library to a smaller set of further confirmatory - tests. Here each dataset is carefully collated through confirmation screens to validate active compounds. The curation process is documented in [1]. - Each dataset is identified by the PubChem Assay ID (AID). Features of the datasets: (1) At least 150 confirmed active compounds present; (2) Diverse - target classes; (3) Realistic (large number and highly imbalanced label).' -targets: - - id: activity_kcnq2_potassium_channel - description: whether it active against kcnq2 potassium channel receptor (1) or not (0). - units: activity - type: categorical - names: - - kcnq2 potassium channel activity - - kcnq2 potassium channel Inhibitor - - activity against kcnq2 potassium channel - - kcnq2 potassium channel receptor -identifiers: - - id: SMILES - type: SMILES - description: SMILES -license: CC BY 4.0 -links: - - url: https://doi.org/10.3390/molecules18010735 - description: corresponding publication - - url: https://doi.org/10.1093/nar/gky1033 - description: corresponding publication - - url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/ - description: corresponding publication -num_points: 302405 -url: https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al -bibtex: - - |- - @article{Butkiewicz2013, - doi = {10.3390/molecules18010735}, - url = {https://doi.org/10.3390/molecules18010735}, - year = {2013}, - month = jan, - publisher = {{MDPI} {AG}}, - volume = {18}, - number = {1}, - pages = {735--756}, - author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens Meiler}, - title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, - journal = {Molecules}} - - |- - @article{Kim2018, - doi = {10.1093/nar/gky1033}, - url = {https://doi.org/10.1093/nar/gky1033}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D1102--D1109}, - author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, - title = {{PubChem} 2019 update: improved access to chemical data}, - journal = {Nucleic Acids Research}} - - |- - @article{Butkiewicz2017, - doi = {}, - url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, - year = {2017}, - publisher = {Chem Inform}, - volume = {3}, - number = {1}, - author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, - title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, - journal = {Chemical Science}} diff --git a/data/Butkiewicz/kcnq2_potassium_channel_butkiewicz/transform.py b/data/Butkiewicz/kcnq2_potassium_channel_butkiewicz/transform.py deleted file mode 100644 index d38302198..000000000 --- a/data/Butkiewicz/kcnq2_potassium_channel_butkiewicz/transform.py +++ /dev/null @@ -1,146 +0,0 @@ -import pandas as pd -import yaml -from tdc.single_pred import HTS - - -def get_and_transform_data(): - # get raw data - label = "kcnq2_potassium_channel_butkiewicz" - data = HTS(name=label) - fn_data_original = "data_original.csv" - data.get_data().to_csv(fn_data_original, index=False) - - # create dataframe - df = pd.read_csv( - fn_data_original, - delimiter=",", - ) # not necessary but ensure we can load the saved data - - # check if fields are the same - fields_orig = df.columns.tolist() - assert fields_orig == [ - "Drug_ID", - "Drug", - "Y", - ] - - # overwrite column names = fields - fields_clean = [ - "compound_id", - "SMILES", - "activity_kcnq2_potassium_channel", - ] - df.columns = fields_clean - - # # data cleaning - # df.compound_id = ( - # df.compound_id.str.strip() - # ) # remove leading and trailing white space characters - - assert not df.duplicated().sum() - - # save to csv - fn_data_csv = "data_clean.csv" - df.to_csv(fn_data_csv, index=False) - - # create meta yaml - meta = { - "name": "kcnq2_potassium_channel_butkiewicz", # unique identifier, we will also use this for directory names - "description": """These are nine high-quality high-throughput screening (HTS) datasets from [1]. These datasets were curated from HTS data at the PubChem database [2]. Typically, HTS categorizes small molecules into hit, inactive, or unspecified against a certain therapeutic target. However, a compound may be falsely classified as a hit due to experimental artifacts such as optical interference. 
Moreover, because the screening is performed without duplicates, and the cutoff is often set loose to minimize the false negative rates, the results from the primary screens often contain high false positive rates [3]. Hence the result from the primary screen is only used as the first iteration to reduce the compound library to a smaller set of further confirmatory tests. Here each dataset is carefully collated through confirmation screens to validate active compounds. The curation process is documented in [1]. Each dataset is identified by the PubChem Assay ID (AID). Features of the datasets: (1) At least 150 confirmed active compounds present; (2) Diverse target classes; (3) Realistic (large number and highly imbalanced label).""", - "targets": [ - { - "id": "activity_kcnq2_potassium_channel", # name of the column in a tabular dataset - "description": "whether it active against kcnq2 potassium channel receptor (1) or not (0).", # description of what this column means - "units": "activity", # units of the values in this column (leave empty if unitless) - "type": "categorical", # can be "categorical", "ordinal", "continuous" - "names": [ # names for the property (to sample from for building the prompts) - "kcnq2 potassium channel activity", - "kcnq2 potassium channel Inhibitor", - "activity against kcnq2 potassium channel", - "kcnq2 potassium channel receptor", - ], - }, - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "SMILES", # description (optional, except for "Other") - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": "https://doi.org/10.3390/molecules18010735", - "description": "corresponding publication", - }, - { - "url": "https://doi.org/10.1093/nar/gky1033", - "description": "corresponding publication", - }, - { - "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/", - "description": "corresponding publication", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "url": "https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al", - "bibtex": [ - """@article{Butkiewicz2013, - doi = {10.3390/molecules18010735}, - url = {https://doi.org/10.3390/molecules18010735}, - year = {2013}, - month = jan, - publisher = {{MDPI} {AG}}, - volume = {18}, - number = {1}, - pages = {735--756}, - author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall and Pedro Teixeira and C. Weaver and Jens Meiler}, - title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, - journal = {Molecules}}""", - """@article{Kim2018, - doi = {10.1093/nar/gky1033}, - url = {https://doi.org/10.1093/nar/gky1033}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D1102--D1109}, - author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, - title = {{PubChem} 2019 update: improved access to chemical data}, - journal = {Nucleic Acids Research}}""", - """@article{Butkiewicz2017, - doi = {}, - url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, - year = {2017}, - publisher = {Chem Inform}, - volume = {3}, - number = {1}, - author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, - title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, - journal = {Chemical Science}}""", - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - - -if __name__ == "__main__": - get_and_transform_data() diff --git a/data/Butkiewicz/m1_muscarinic_receptor_agonists_butkiewicz/meta.yaml b/data/Butkiewicz/m1_muscarinic_receptor_agonists_butkiewicz/meta.yaml deleted file mode 100644 index d69806a1e..000000000 --- a/data/Butkiewicz/m1_muscarinic_receptor_agonists_butkiewicz/meta.yaml +++ /dev/null @@ -1,72 +0,0 @@ ---- -name: m1_muscarinic_receptor_agonists_butkiewicz -description: 'These are nine high-quality high-throughput screening (HTS) datasets from [1]. These datasets were curated from HTS data at the PubChem database - [2]. Typically, HTS categorizes small molecules into hit, inactive, or unspecified against a certain therapeutic target. However, a compound may be - falsely classified as a hit due to experimental artifacts such as optical interference. Moreover, because the screening is performed without duplicates, - and the cutoff is often set loose to minimize the false negative rates, the results from the primary screens often contain high false positive rates - [3]. 
Hence the result from the primary screen is only used as the first iteration to reduce the compound library to a smaller set of further confirmatory - tests. Here each dataset is carefully collated through confirmation screens to validate active compounds. The curation process is documented in [1]. - Each dataset is identified by the PubChem Assay ID (AID). Features of the datasets: (1) At least 150 confirmed active compounds present; (2) Diverse - target classes; (3) Realistic (large number and highly imbalanced label).' -targets: - - id: m1_muscarinic_agonist - description: whether it agonist on m1 muscarinic receptor (1) or not (0). - units: agonist - type: categorical - names: - - m1 muscarinic activity - - m1 muscarinic agonist - - muscarinic - - m1 muscarinic receptor -identifiers: - - id: SMILES - type: SMILES - description: SMILES -license: CC BY 4.0 -links: - - url: https://doi.org/10.3390/molecules18010735 - description: corresponding publication - - url: https://doi.org/10.1093/nar/gky1033 - description: corresponding publication - - url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/ - description: corresponding publication -num_points: 61833 -url: https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al -bibtex: - - |- - @article{Butkiewicz2013, - doi = {10.3390/molecules18010735}, - url = {https://doi.org/10.3390/molecules18010735}, - year = {2013}, - month = jan, - publisher = {{MDPI} {AG}}, - volume = {18}, - number = {1}, - pages = {735--756}, - author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens Meiler}, - title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, - journal = {Molecules}} - - |- - @article{Kim2018, - doi = {10.1093/nar/gky1033}, - url = {https://doi.org/10.1093/nar/gky1033}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D1102--D1109}, - author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, - title = {{PubChem} 2019 update: improved access to chemical data}, - journal = {Nucleic Acids Research}} - - |- - @article{Butkiewicz2017, - doi = {}, - url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, - year = {2017}, - publisher = {Chem Inform}, - volume = {3}, - number = {1}, - author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, - title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, - journal = {Chemical Science}} diff --git a/data/Butkiewicz/m1_muscarinic_receptor_agonists_butkiewicz/transform.py b/data/Butkiewicz/m1_muscarinic_receptor_agonists_butkiewicz/transform.py deleted file mode 100644 index d904356d5..000000000 --- a/data/Butkiewicz/m1_muscarinic_receptor_agonists_butkiewicz/transform.py +++ /dev/null @@ -1,146 +0,0 @@ -import pandas as pd -import yaml -from tdc.single_pred import HTS - - -def get_and_transform_data(): - # get raw data - label = "m1_muscarinic_receptor_agonists_butkiewicz" - data = HTS(name=label) - fn_data_original = "data_original.csv" - data.get_data().to_csv(fn_data_original, index=False) - - # create dataframe - df = pd.read_csv( - fn_data_original, - delimiter=",", - ) # not necessary but ensure we can load the saved data - - # check if fields are the same - fields_orig = df.columns.tolist() - assert fields_orig == [ - "Drug_ID", - "Drug", - "Y", - ] - - # overwrite column names = fields - fields_clean = [ - "compound_id", - "SMILES", - "m1_muscarinic_agonist", - ] - df.columns = fields_clean - - # # data cleaning - # df.compound_id = ( - # df.compound_id.str.strip() - # ) # remove leading and trailing white space characters - - assert not df.duplicated().sum() - - # save to csv - fn_data_csv = "data_clean.csv" - df.to_csv(fn_data_csv, index=False) - - # create meta yaml - meta = { - "name": "m1_muscarinic_receptor_agonists_butkiewicz", # unique identifier, we will also use this for directory names - "description": """These are nine high-quality high-throughput screening (HTS) datasets from [1]. These datasets were curated from HTS data at the PubChem database [2]. Typically, HTS categorizes small molecules into hit, inactive, or unspecified against a certain therapeutic target. However, a compound may be falsely classified as a hit due to experimental artifacts such as optical interference. 
Moreover, because the screening is performed without duplicates, and the cutoff is often set loose to minimize the false negative rates, the results from the primary screens often contain high false positive rates [3]. Hence the result from the primary screen is only used as the first iteration to reduce the compound library to a smaller set of further confirmatory tests. Here each dataset is carefully collated through confirmation screens to validate active compounds. The curation process is documented in [1]. Each dataset is identified by the PubChem Assay ID (AID). Features of the datasets: (1) At least 150 confirmed active compounds present; (2) Diverse target classes; (3) Realistic (large number and highly imbalanced label).""", - "targets": [ - { - "id": "m1_muscarinic_agonist", # name of the column in a tabular dataset - "description": "whether it agonist on m1 muscarinic receptor (1) or not (0).", # description of what this column means - "units": "agonist", # units of the values in this column (leave empty if unitless) - "type": "categorical", # can be "categorical", "ordinal", "continuous" - "names": [ # names for the property (to sample from for building the prompts) - "m1 muscarinic activity", - "m1 muscarinic agonist", - "muscarinic", - "m1 muscarinic receptor", - ], - }, - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "SMILES", # description (optional, except for "Other") - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": "https://doi.org/10.3390/molecules18010735", - "description": "corresponding publication", - }, - { - "url": "https://doi.org/10.1093/nar/gky1033", - "description": "corresponding publication", - }, - { - "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/", - "description": "corresponding publication", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "url": "https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al", - "bibtex": [ - """@article{Butkiewicz2013, - doi = {10.3390/molecules18010735}, - url = {https://doi.org/10.3390/molecules18010735}, - year = {2013}, - month = jan, - publisher = {{MDPI} {AG}}, - volume = {18}, - number = {1}, - pages = {735--756}, - author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall and Pedro Teixeira and C. Weaver and Jens Meiler}, - title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, - journal = {Molecules}}""", - """@article{Kim2018, - doi = {10.1093/nar/gky1033}, - url = {https://doi.org/10.1093/nar/gky1033}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D1102--D1109}, - author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, - title = {{PubChem} 2019 update: improved access to chemical data}, - journal = {Nucleic Acids Research}}""", - """@article{Butkiewicz2017, - doi = {}, - url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, - year = {2017}, - publisher = {Chem Inform}, - volume = {3}, - number = {1}, - author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, - title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, - journal = {Chemical Science}}""", - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - - -if __name__ == "__main__": - get_and_transform_data() diff --git a/data/Butkiewicz/m1_muscarinic_receptor_antagonists_butkiewicz/meta.yaml b/data/Butkiewicz/m1_muscarinic_receptor_antagonists_butkiewicz/meta.yaml deleted file mode 100644 index 4c8b4aa6c..000000000 --- a/data/Butkiewicz/m1_muscarinic_receptor_antagonists_butkiewicz/meta.yaml +++ /dev/null @@ -1,72 +0,0 @@ ---- -name: m1_muscarinic_receptor_antagonists_butkiewicz -description: 'These are nine high-quality high-throughput screening (HTS) datasets from [1]. These datasets were curated from HTS data at the PubChem database - [2]. Typically, HTS categorizes small molecules into hit, inactive, or unspecified against a certain therapeutic target. However, a compound may be - falsely classified as a hit due to experimental artifacts such as optical interference. Moreover, because the screening is performed without duplicates, - and the cutoff is often set loose to minimize the false negative rates, the results from the primary screens often contain high false positive rates - [3]. 
Hence the result from the primary screen is only used as the first iteration to reduce the compound library to a smaller set of further confirmatory - tests. Here each dataset is carefully collated through confirmation screens to validate active compounds. The curation process is documented in [1]. - Each dataset is identified by the PubChem Assay ID (AID). Features of the datasets: (1) At least 150 confirmed active compounds present; (2) Diverse - target classes; (3) Realistic (large number and highly imbalanced label).' -targets: - - id: m1_muscarinic_antagonists - description: whether it antagonists on m1 muscarinic receptor (1) or not (0). - units: antagonists - type: categorical - names: - - m1 muscarinic activity - - m1 muscarinic antagonists - - muscarinic - - m1 muscarinic receptor -identifiers: - - id: SMILES - type: SMILES - description: SMILES -license: CC BY 4.0 -links: - - url: https://doi.org/10.3390/molecules18010735 - description: corresponding publication - - url: https://doi.org/10.1093/nar/gky1033 - description: corresponding publication - - url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/ - description: corresponding publication -num_points: 61756 -url: https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al -bibtex: - - |- - @article{Butkiewicz2013, - doi = {10.3390/molecules18010735}, - url = {https://doi.org/10.3390/molecules18010735}, - year = {2013}, - month = jan, - publisher = {{MDPI} {AG}}, - volume = {18}, - number = {1}, - pages = {735--756}, - author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens Meiler}, - title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, - journal = {Molecules}} - - |- - @article{Kim2018, - doi = {10.1093/nar/gky1033}, - url = {https://doi.org/10.1093/nar/gky1033}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D1102--D1109}, - author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, - title = {{PubChem} 2019 update: improved access to chemical data}, - journal = {Nucleic Acids Research}} - - |- - @article{Butkiewicz2017, - doi = {}, - url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, - year = {2017}, - publisher = {Chem Inform}, - volume = {3}, - number = {1}, - author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, - title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, - journal = {Chemical Science}} diff --git a/data/Butkiewicz/m1_muscarinic_receptor_antagonists_butkiewicz/transform.py b/data/Butkiewicz/m1_muscarinic_receptor_antagonists_butkiewicz/transform.py deleted file mode 100644 index db0c1a197..000000000 --- a/data/Butkiewicz/m1_muscarinic_receptor_antagonists_butkiewicz/transform.py +++ /dev/null @@ -1,146 +0,0 @@ -import pandas as pd -import yaml -from tdc.single_pred import HTS - - -def get_and_transform_data(): - # get raw data - label = "m1_muscarinic_receptor_antagonists_butkiewicz" - data = HTS(name=label) - fn_data_original = "data_original.csv" - data.get_data().to_csv(fn_data_original, index=False) - - # create dataframe - df = pd.read_csv( - fn_data_original, - delimiter=",", - ) # not necessary but ensure we can load the saved data - - # check if fields are the same - fields_orig = df.columns.tolist() - assert fields_orig == [ - "Drug_ID", - "Drug", - "Y", - ] - - # overwrite column names = fields - fields_clean = [ - "compound_id", - "SMILES", - "m1_muscarinic_antagonists", - ] - df.columns = fields_clean - - # # data cleaning - # df.compound_id = ( - # df.compound_id.str.strip() - # ) # remove leading and trailing white space characters - - assert not df.duplicated().sum() - - # save to csv - fn_data_csv = "data_clean.csv" - df.to_csv(fn_data_csv, index=False) - - # create meta yaml - meta = { - "name": "m1_muscarinic_receptor_antagonists_butkiewicz", # unique identifier, we will also use this for directory names - "description": """These are nine high-quality high-throughput screening (HTS) datasets from [1]. These datasets were curated from HTS data at the PubChem database [2]. Typically, HTS categorizes small molecules into hit, inactive, or unspecified against a certain therapeutic target. 
However, a compound may be falsely classified as a hit due to experimental artifacts such as optical interference. Moreover, because the screening is performed without duplicates, and the cutoff is often set loose to minimize the false negative rates, the results from the primary screens often contain high false positive rates [3]. Hence the result from the primary screen is only used as the first iteration to reduce the compound library to a smaller set of further confirmatory tests. Here each dataset is carefully collated through confirmation screens to validate active compounds. The curation process is documented in [1]. Each dataset is identified by the PubChem Assay ID (AID). Features of the datasets: (1) At least 150 confirmed active compounds present; (2) Diverse target classes; (3) Realistic (large number and highly imbalanced label).""", - "targets": [ - { - "id": "m1_muscarinic_antagonists", # name of the column in a tabular dataset - "description": "whether it antagonists on m1 muscarinic receptor (1) or not (0).", # description of what this column means - "units": "antagonists", # units of the values in this column (leave empty if unitless) - "type": "categorical", # can be "categorical", "ordinal", "continuous" - "names": [ # names for the property (to sample from for building the prompts) - "m1 muscarinic activity", - "m1 muscarinic antagonists", - "muscarinic", - "m1 muscarinic receptor", - ], - }, - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "SMILES", # description (optional, except for "Other") - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": "https://doi.org/10.3390/molecules18010735", - "description": "corresponding publication", - }, - { - "url": "https://doi.org/10.1093/nar/gky1033", - "description": "corresponding publication", - }, - { - "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/", - "description": "corresponding publication", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "url": "https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al", - "bibtex": [ - """@article{Butkiewicz2013, - doi = {10.3390/molecules18010735}, - url = {https://doi.org/10.3390/molecules18010735}, - year = {2013}, - month = jan, - publisher = {{MDPI} {AG}}, - volume = {18}, - number = {1}, - pages = {735--756}, - author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall and Pedro Teixeira and C. Weaver and Jens Meiler}, - title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, - journal = {Molecules}}""", - """@article{Kim2018, - doi = {10.1093/nar/gky1033}, - url = {https://doi.org/10.1093/nar/gky1033}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D1102--D1109}, - author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, - title = {{PubChem} 2019 update: improved access to chemical data}, - journal = {Nucleic Acids Research}}""", - """@article{Butkiewicz2017, - doi = {}, - url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, - year = {2017}, - publisher = {Chem Inform}, - volume = {3}, - number = {1}, - author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, - title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, - journal = {Chemical Science}}""", - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - - -if __name__ == "__main__": - get_and_transform_data() diff --git a/data/Butkiewicz/orexin1_receptor_butkiewicz/meta.yaml b/data/Butkiewicz/orexin1_receptor_butkiewicz/meta.yaml deleted file mode 100644 index 74e53ba6c..000000000 --- a/data/Butkiewicz/orexin1_receptor_butkiewicz/meta.yaml +++ /dev/null @@ -1,72 +0,0 @@ ---- -name: orexin1_receptor_butkiewicz -description: 'These are nine high-quality high-throughput screening (HTS) datasets from [1]. These datasets were curated from HTS data at the PubChem database - [2]. Typically, HTS categorizes small molecules into hit, inactive, or unspecified against a certain therapeutic target. However, a compound may be - falsely classified as a hit due to experimental artifacts such as optical interference. Moreover, because the screening is performed without duplicates, - and the cutoff is often set loose to minimize the false negative rates, the results from the primary screens often contain high false positive rates - [3]. 
Hence the result from the primary screen is only used as the first iteration to reduce the compound library to a smaller set of further confirmatory - tests. Here each dataset is carefully collated through confirmation screens to validate active compounds. The curation process is documented in [1]. - Each dataset is identified by the PubChem Assay ID (AID). Features of the datasets: (1) At least 150 confirmed active compounds present; (2) Diverse - target classes; (3) Realistic (large number and highly imbalanced label).' -targets: - - id: activity_orexin1 - description: whether it active against orexin1 receptor (1) or not (0). - units: activity - type: categorical - names: - - orexin1 activity - - orexin1 Inhibitor - - activity against orexin1 - - orexin1 receptor -identifiers: - - id: SMILES - type: SMILES - description: SMILES -license: CC BY 4.0 -links: - - url: https://doi.org/10.3390/molecules18010735 - description: corresponding publication - - url: https://doi.org/10.1093/nar/gky1033 - description: corresponding publication - - url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/ - description: corresponding publication -num_points: 218158 -url: https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al -bibtex: - - |- - @article{Butkiewicz2013, - doi = {10.3390/molecules18010735}, - url = {https://doi.org/10.3390/molecules18010735}, - year = {2013}, - month = jan, - publisher = {{MDPI} {AG}}, - volume = {18}, - number = {1}, - pages = {735--756}, - author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens Meiler}, - title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, - journal = {Molecules}} - - |- - @article{Kim2018, - doi = {10.1093/nar/gky1033}, - url = {https://doi.org/10.1093/nar/gky1033}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D1102--D1109}, - author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, - title = {{PubChem} 2019 update: improved access to chemical data}, - journal = {Nucleic Acids Research}} - - |- - @article{Butkiewicz2017, - doi = {}, - url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, - year = {2017}, - publisher = {Chem Inform}, - volume = {3}, - number = {1}, - author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, - title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, - journal = {Chemical Science}} diff --git a/data/Butkiewicz/orexin1_receptor_butkiewicz/transform.py b/data/Butkiewicz/orexin1_receptor_butkiewicz/transform.py deleted file mode 100644 index c5e49e426..000000000 --- a/data/Butkiewicz/orexin1_receptor_butkiewicz/transform.py +++ /dev/null @@ -1,146 +0,0 @@ -import pandas as pd -import yaml -from tdc.single_pred import HTS - - -def get_and_transform_data(): - # get raw data - label = "orexin1_receptor_butkiewicz" - data = HTS(name=label) - fn_data_original = "data_original.csv" - data.get_data().to_csv(fn_data_original, index=False) - - # create dataframe - df = pd.read_csv( - fn_data_original, - delimiter=",", - ) # not necessary but ensure we can load the saved data - - # check if fields are the same - fields_orig = df.columns.tolist() - assert fields_orig == [ - "Drug_ID", - "Drug", - "Y", - ] - - # overwrite column names = fields - fields_clean = [ - "compound_id", - "SMILES", - "activity_orexin1", - ] - df.columns = fields_clean - - # # data cleaning - # df.compound_id = ( - # df.compound_id.str.strip() - # ) # remove leading and trailing white space characters - - assert not df.duplicated().sum() - - # save to csv - fn_data_csv = "data_clean.csv" - df.to_csv(fn_data_csv, index=False) - - # create meta yaml - meta = { - "name": "orexin1_receptor_butkiewicz", # unique identifier, we will also use this for directory names - "description": """These are nine high-quality high-throughput screening (HTS) datasets from [1]. These datasets were curated from HTS data at the PubChem database [2]. Typically, HTS categorizes small molecules into hit, inactive, or unspecified against a certain therapeutic target. However, a compound may be falsely classified as a hit due to experimental artifacts such as optical interference. 
Moreover, because the screening is performed without duplicates, and the cutoff is often set loose to minimize the false negative rates, the results from the primary screens often contain high false positive rates [3]. Hence the result from the primary screen is only used as the first iteration to reduce the compound library to a smaller set of further confirmatory tests. Here each dataset is carefully collated through confirmation screens to validate active compounds. The curation process is documented in [1]. Each dataset is identified by the PubChem Assay ID (AID). Features of the datasets: (1) At least 150 confirmed active compounds present; (2) Diverse target classes; (3) Realistic (large number and highly imbalanced label).""", - "targets": [ - { - "id": "activity_orexin1", # name of the column in a tabular dataset - "description": "whether it active against orexin1 receptor (1) or not (0).", # description of what this column means - "units": "activity", # units of the values in this column (leave empty if unitless) - "type": "categorical", # can be "categorical", "ordinal", "continuous" - "names": [ # names for the property (to sample from for building the prompts) - "orexin1 activity", - "orexin1 Inhibitor", - "activity against orexin1", - "orexin1 receptor", - ], - }, - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "SMILES", # description (optional, except for "Other") - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": "https://doi.org/10.3390/molecules18010735", - "description": "corresponding publication", - }, - { - "url": "https://doi.org/10.1093/nar/gky1033", - "description": "corresponding publication", - }, - { - "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/", - "description": "corresponding publication", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "url": "https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al", - "bibtex": [ - """@article{Butkiewicz2013, - doi = {10.3390/molecules18010735}, - url = {https://doi.org/10.3390/molecules18010735}, - year = {2013}, - month = jan, - publisher = {{MDPI} {AG}}, - volume = {18}, - number = {1}, - pages = {735--756}, - author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall and Pedro Teixeira and C. Weaver and Jens Meiler}, - title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, - journal = {Molecules}}""", - """@article{Kim2018, - doi = {10.1093/nar/gky1033}, - url = {https://doi.org/10.1093/nar/gky1033}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D1102--D1109}, - author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, - title = {{PubChem} 2019 update: improved access to chemical data}, - journal = {Nucleic Acids Research}}""", - """@article{Butkiewicz2017, - doi = {}, - url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, - year = {2017}, - publisher = {Chem Inform}, - volume = {3}, - number = {1}, - author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, - title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, - journal = {Chemical Science}}""", - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - - -if __name__ == "__main__": - get_and_transform_data() diff --git a/data/Butkiewicz/potassium_ion_channel_kir2_1_butkiewicz/meta.yaml b/data/Butkiewicz/potassium_ion_channel_kir2_1_butkiewicz/meta.yaml deleted file mode 100644 index 6f964d42f..000000000 --- a/data/Butkiewicz/potassium_ion_channel_kir2_1_butkiewicz/meta.yaml +++ /dev/null @@ -1,72 +0,0 @@ ---- -name: potassium_ion_channel_kir2_1_butkiewicz -description: 'These are nine high-quality high-throughput screening (HTS) datasets from [1]. These datasets were curated from HTS data at the PubChem database - [2]. Typically, HTS categorizes small molecules into hit, inactive, or unspecified against a certain therapeutic target. However, a compound may be - falsely classified as a hit due to experimental artifacts such as optical interference. Moreover, because the screening is performed without duplicates, - and the cutoff is often set loose to minimize the false negative rates, the results from the primary screens often contain high false positive rates - [3]. 
Hence the result from the primary screen is only used as the first iteration to reduce the compound library to a smaller set of further confirmatory - tests. Here each dataset is carefully collated through confirmation screens to validate active compounds. The curation process is documented in [1]. - Each dataset is identified by the PubChem Assay ID (AID). Features of the datasets: (1) At least 150 confirmed active compounds present; (2) Diverse - target classes; (3) Realistic (large number and highly imbalanced label).' -targets: - - id: activity_potassium_ion_channel - description: whether it active against potassium ion channel (1) or not (0). - units: activity - type: categorical - names: - - potassium ion channel activity - - potassium ion channel - - activity against potassium ion channel - - activity -identifiers: - - id: SMILES - type: SMILES - description: SMILES -license: CC BY 4.0 -links: - - url: https://doi.org/10.3390/molecules18010735 - description: corresponding publication - - url: https://doi.org/10.1093/nar/gky1033 - description: corresponding publication - - url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/ - description: corresponding publication -num_points: 301493 -url: https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al -bibtex: - - |- - @article{Butkiewicz2013, - doi = {10.3390/molecules18010735}, - url = {https://doi.org/10.3390/molecules18010735}, - year = {2013}, - month = jan, - publisher = {{MDPI} {AG}}, - volume = {18}, - number = {1}, - pages = {735--756}, - author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens Meiler}, - title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, - journal = {Molecules}} - - |- - @article{Kim2018, - doi = {10.1093/nar/gky1033}, - url = {https://doi.org/10.1093/nar/gky1033}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D1102--D1109}, - author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, - title = {{PubChem} 2019 update: improved access to chemical data}, - journal = {Nucleic Acids Research}} - - |- - @article{Butkiewicz2017, - doi = {}, - url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, - year = {2017}, - publisher = {Chem Inform}, - volume = {3}, - number = {1}, - author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, - title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, - journal = {Chemical Science}} diff --git a/data/Butkiewicz/potassium_ion_channel_kir2_1_butkiewicz/transform.py b/data/Butkiewicz/potassium_ion_channel_kir2_1_butkiewicz/transform.py deleted file mode 100644 index e9637f384..000000000 --- a/data/Butkiewicz/potassium_ion_channel_kir2_1_butkiewicz/transform.py +++ /dev/null @@ -1,146 +0,0 @@ -import pandas as pd -import yaml -from tdc.single_pred import HTS - - -def get_and_transform_data(): - # get raw data - label = "potassium_ion_channel_kir2.1_butkiewicz" - data = HTS(name=label) - fn_data_original = "data_original.csv" - data.get_data().to_csv(fn_data_original, index=False) - - # create dataframe - df = pd.read_csv( - fn_data_original, - delimiter=",", - ) # not necessary but ensure we can load the saved data - - # check if fields are the same - fields_orig = df.columns.tolist() - assert fields_orig == [ - "Drug_ID", - "Drug", - "Y", - ] - - # overwrite column names = fields - fields_clean = [ - "compound_id", - "SMILES", - "activity_potassium_ion_channel", - ] - df.columns = fields_clean - - # # data cleaning - # df.compound_id = ( - # df.compound_id.str.strip() - # ) # remove leading and trailing white space characters - - assert not df.duplicated().sum() - - # save to csv - fn_data_csv = "data_clean.csv" - df.to_csv(fn_data_csv, index=False) - - # create meta yaml - meta = { - "name": "potassium_ion_channel_kir2_1_butkiewicz", # unique identifier, we will also use this for directory names - "description": """These are nine high-quality high-throughput screening (HTS) datasets from [1]. These datasets were curated from HTS data at the PubChem database [2]. Typically, HTS categorizes small molecules into hit, inactive, or unspecified against a certain therapeutic target. However, a compound may be falsely classified as a hit due to experimental artifacts such as optical interference. 
Moreover, because the screening is performed without duplicates, and the cutoff is often set loose to minimize the false negative rates, the results from the primary screens often contain high false positive rates [3]. Hence the result from the primary screen is only used as the first iteration to reduce the compound library to a smaller set of further confirmatory tests. Here each dataset is carefully collated through confirmation screens to validate active compounds. The curation process is documented in [1]. Each dataset is identified by the PubChem Assay ID (AID). Features of the datasets: (1) At least 150 confirmed active compounds present; (2) Diverse target classes; (3) Realistic (large number and highly imbalanced label).""", - "targets": [ - { - "id": "activity_potassium_ion_channel", # name of the column in a tabular dataset - "description": "whether it active against potassium ion channel (1) or not (0).", # description of what this column means - "units": "activity", # units of the values in this column (leave empty if unitless) - "type": "categorical", # can be "categorical", "ordinal", "continuous" - "names": [ # names for the property (to sample from for building the prompts) - "potassium ion channel activity", - "potassium ion channel", - "activity against potassium ion channel", - "activity", - ], - }, - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "SMILES", # description (optional, except for "Other") - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": "https://doi.org/10.3390/molecules18010735", - "description": "corresponding publication", - }, - { - "url": "https://doi.org/10.1093/nar/gky1033", - "description": "corresponding publication", - }, - { - "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/", - "description": "corresponding publication", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "url": "https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al", - "bibtex": [ - """@article{Butkiewicz2013, - doi = {10.3390/molecules18010735}, - url = {https://doi.org/10.3390/molecules18010735}, - year = {2013}, - month = jan, - publisher = {{MDPI} {AG}}, - volume = {18}, - number = {1}, - pages = {735--756}, - author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall and Pedro Teixeira and C. Weaver and Jens Meiler}, - title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, - journal = {Molecules}}""", - """@article{Kim2018, - doi = {10.1093/nar/gky1033}, - url = {https://doi.org/10.1093/nar/gky1033}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D1102--D1109}, - author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, - title = {{PubChem} 2019 update: improved access to chemical data}, - journal = {Nucleic Acids Research}}""", - """@article{Butkiewicz2017, - doi = {}, - url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, - year = {2017}, - publisher = {Chem Inform}, - volume = {3}, - number = {1}, - author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, - title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, - journal = {Chemical Science}}""", - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - - -if __name__ == "__main__": - get_and_transform_data() diff --git a/data/Butkiewicz/serine_threonine_kinase_33_butkiewicz/meta.yaml b/data/Butkiewicz/serine_threonine_kinase_33_butkiewicz/meta.yaml deleted file mode 100644 index c1566d7a0..000000000 --- a/data/Butkiewicz/serine_threonine_kinase_33_butkiewicz/meta.yaml +++ /dev/null @@ -1,72 +0,0 @@ ---- -name: serine_threonine_kinase_33_butkiewicz -description: 'These are nine high-quality high-throughput screening (HTS) datasets from [1]. These datasets were curated from HTS data at the PubChem database - [2]. Typically, HTS categorizes small molecules into hit, inactive, or unspecified against a certain therapeutic target. However, a compound may be - falsely classified as a hit due to experimental artifacts such as optical interference. Moreover, because the screening is performed without duplicates, - and the cutoff is often set loose to minimize the false negative rates, the results from the primary screens often contain high false positive rates - [3]. 
Hence the result from the primary screen is only used as the first iteration to reduce the compound library to a smaller set of further confirmatory - tests. Here each dataset is carefully collated through confirmation screens to validate active compounds. The curation process is documented in [1]. - Each dataset is identified by the PubChem Assay ID (AID). Features of the datasets: (1) At least 150 confirmed active compounds present; (2) Diverse - target classes; (3) Realistic (large number and highly imbalanced label).' -targets: - - id: activity_serine_threonine_kinase33 - description: whether it active against serine threonine kinase 33 receptor (1) or not (0). - units: activity - type: categorical - names: - - serine threonine kinase 33 activity - - serine threonine kinase 33 Inhibitor - - activity against serine threonine kinase 33 - - serine threonine kinase 33 receptor -identifiers: - - id: SMILES - type: SMILES - description: SMILES -license: CC BY 4.0 -links: - - url: https://doi.org/10.3390/molecules18010735 - description: corresponding publication - - url: https://doi.org/10.1093/nar/gky1033 - description: corresponding publication - - url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/ - description: corresponding publication -num_points: 319792 -url: https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al -bibtex: - - |- - @article{Butkiewicz2013, - doi = {10.3390/molecules18010735}, - url = {https://doi.org/10.3390/molecules18010735}, - year = {2013}, - month = jan, - publisher = {{MDPI} {AG}}, - volume = {18}, - number = {1}, - pages = {735--756}, - author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens Meiler}, - title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, - journal = {Molecules}} - - |- - @article{Kim2018, - doi = {10.1093/nar/gky1033}, - url = {https://doi.org/10.1093/nar/gky1033}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D1102--D1109}, - author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, - title = {{PubChem} 2019 update: improved access to chemical data}, - journal = {Nucleic Acids Research}} - - |- - @article{Butkiewicz2017, - doi = {}, - url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, - year = {2017}, - publisher = {Chem Inform}, - volume = {3}, - number = {1}, - author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, - title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, - journal = {Chemical Science}} diff --git a/data/Butkiewicz/serine_threonine_kinase_33_butkiewicz/transform.py b/data/Butkiewicz/serine_threonine_kinase_33_butkiewicz/transform.py deleted file mode 100644 index ffbe6939d..000000000 --- a/data/Butkiewicz/serine_threonine_kinase_33_butkiewicz/transform.py +++ /dev/null @@ -1,146 +0,0 @@ -import pandas as pd -import yaml -from tdc.single_pred import HTS - - -def get_and_transform_data(): - # get raw data - label = "serine_threonine_kinase_33_butkiewicz" - data = HTS(name=label) - fn_data_original = "data_original.csv" - data.get_data().to_csv(fn_data_original, index=False) - - # create dataframe - df = pd.read_csv( - fn_data_original, - delimiter=",", - ) # not necessary but ensure we can load the saved data - - # check if fields are the same - fields_orig = df.columns.tolist() - assert fields_orig == [ - "Drug_ID", - "Drug", - "Y", - ] - - # overwrite column names = fields - fields_clean = [ - "compound_id", - "SMILES", - "activity_serine_threonine_kinase33", - ] - df.columns = fields_clean - - # # data cleaning - # df.compound_id = ( - # df.compound_id.str.strip() - # ) # remove leading and trailing white space characters - - assert not df.duplicated().sum() - - # save to csv - fn_data_csv = "data_clean.csv" - df.to_csv(fn_data_csv, index=False) - - # create meta yaml - meta = { - "name": "serine_threonine_kinase_33_butkiewicz", # unique identifier, we will also use this for directory names - "description": """These are nine high-quality high-throughput screening (HTS) datasets from [1]. These datasets were curated from HTS data at the PubChem database [2]. Typically, HTS categorizes small molecules into hit, inactive, or unspecified against a certain therapeutic target. However, a compound may be falsely classified as a hit due to experimental artifacts such as optical interference. 
Moreover, because the screening is performed without duplicates, and the cutoff is often set loose to minimize the false negative rates, the results from the primary screens often contain high false positive rates [3]. Hence the result from the primary screen is only used as the first iteration to reduce the compound library to a smaller set of further confirmatory tests. Here each dataset is carefully collated through confirmation screens to validate active compounds. The curation process is documented in [1]. Each dataset is identified by the PubChem Assay ID (AID). Features of the datasets: (1) At least 150 confirmed active compounds present; (2) Diverse target classes; (3) Realistic (large number and highly imbalanced label).""", - "targets": [ - { - "id": "activity_serine_threonine_kinase33", # name of the column in a tabular dataset - "description": "whether it active against serine threonine kinase 33 receptor (1) or not (0).", # description of what this column means - "units": "activity", # units of the values in this column (leave empty if unitless) - "type": "categorical", # can be "categorical", "ordinal", "continuous" - "names": [ # names for the property (to sample from for building the prompts) - "serine threonine kinase 33 activity", - "serine threonine kinase 33 Inhibitor", - "activity against serine threonine kinase 33", - "serine threonine kinase 33 receptor", - ], - }, - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "SMILES", # description (optional, except for "Other") - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": "https://doi.org/10.3390/molecules18010735", - "description": "corresponding publication", - }, - { - "url": "https://doi.org/10.1093/nar/gky1033", - "description": "corresponding publication", - }, - { - "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/", - "description": "corresponding publication", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "url": "https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al", - "bibtex": [ - """@article{Butkiewicz2013, - doi = {10.3390/molecules18010735}, - url = {https://doi.org/10.3390/molecules18010735}, - year = {2013}, - month = jan, - publisher = {{MDPI} {AG}}, - volume = {18}, - number = {1}, - pages = {735--756}, - author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall and Pedro Teixeira and C. Weaver and Jens Meiler}, - title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, - journal = {Molecules}}""", - """@article{Kim2018, - doi = {10.1093/nar/gky1033}, - url = {https://doi.org/10.1093/nar/gky1033}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D1102--D1109}, - author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, - title = {{PubChem} 2019 update: improved access to chemical data}, - journal = {Nucleic Acids Research}}""", - """@article{Butkiewicz2017, - doi = {}, - url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, - year = {2017}, - publisher = {Chem Inform}, - volume = {3}, - number = {1}, - author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, - title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, - journal = {Chemical Science}}""", - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - - -if __name__ == "__main__": - get_and_transform_data() diff --git a/data/Butkiewicz/tyrosyl-dna_phosphodiesterase_butkiewicz/meta.yaml b/data/Butkiewicz/tyrosyl-dna_phosphodiesterase_butkiewicz/meta.yaml deleted file mode 100644 index 15aea7ddb..000000000 --- a/data/Butkiewicz/tyrosyl-dna_phosphodiesterase_butkiewicz/meta.yaml +++ /dev/null @@ -1,72 +0,0 @@ ---- -name: tyrosyl-dna_phosphodiesterase_butkiewicz -description: 'These are nine high-quality high-throughput screening (HTS) datasets from [1]. These datasets were curated from HTS data at the PubChem database - [2]. Typically, HTS categorizes small molecules into hit, inactive, or unspecified against a certain therapeutic target. However, a compound may be - falsely classified as a hit due to experimental artifacts such as optical interference. Moreover, because the screening is performed without duplicates, - and the cutoff is often set loose to minimize the false negative rates, the results from the primary screens often contain high false positive rates - [3]. 
Hence the result from the primary screen is only used as the first iteration to reduce the compound library to a smaller set of further confirmatory - tests. Here each dataset is carefully collated through confirmation screens to validate active compounds. The curation process is documented in [1]. - Each dataset is identified by the PubChem Assay ID (AID). Features of the datasets: (1) At least 150 confirmed active compounds present; (2) Diverse - target classes; (3) Realistic (large number and highly imbalanced label).' -targets: - - id: activity_tyrosyl_dna_phosphodiesterase - description: whether it active against tyrosyl-dna phosphodiesterase receptor (1) or not (0). - units: activity - type: categorical - names: - - tyrosyl-dna phosphodiesterase activity - - tyrosyl-dna phosphodiesterase Inhibitor - - activity against tyrosyl-dna phosphodiesterase - - tyrosyl-dna phosphodiesterase receptor -identifiers: - - id: SMILES - type: SMILES - description: SMILES -license: CC BY 4.0 -links: - - url: https://doi.org/10.3390/molecules18010735 - description: corresponding publication - - url: https://doi.org/10.1093/nar/gky1033 - description: corresponding publication - - url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/ - description: corresponding publication -num_points: 341365 -url: https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al -bibtex: - - |- - @article{Butkiewicz2013, - doi = {10.3390/molecules18010735}, - url = {https://doi.org/10.3390/molecules18010735}, - year = {2013}, - month = jan, - publisher = {{MDPI} {AG}}, - volume = {18}, - number = {1}, - pages = {735--756}, - author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens Meiler}, - title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, - journal = {Molecules}} - - |- - @article{Kim2018, - doi = {10.1093/nar/gky1033}, - url = {https://doi.org/10.1093/nar/gky1033}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D1102--D1109}, - author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, - title = {{PubChem} 2019 update: improved access to chemical data}, - journal = {Nucleic Acids Research}} - - |- - @article{Butkiewicz2017, - doi = {}, - url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, - year = {2017}, - publisher = {Chem Inform}, - volume = {3}, - number = {1}, - author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, - title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, - journal = {Chemical Science}} diff --git a/data/Butkiewicz/tyrosyl-dna_phosphodiesterase_butkiewicz/transform.py b/data/Butkiewicz/tyrosyl-dna_phosphodiesterase_butkiewicz/transform.py deleted file mode 100644 index e03c41d7d..000000000 --- a/data/Butkiewicz/tyrosyl-dna_phosphodiesterase_butkiewicz/transform.py +++ /dev/null @@ -1,146 +0,0 @@ -import pandas as pd -import yaml -from tdc.single_pred import HTS - - -def get_and_transform_data(): - # get raw data - label = "tyrosyl-dna_phosphodiesterase_butkiewicz" - data = HTS(name=label) - fn_data_original = "data_original.csv" - data.get_data().to_csv(fn_data_original, index=False) - - # create dataframe - df = pd.read_csv( - fn_data_original, - delimiter=",", - ) # not necessary but ensure we can load the saved data - - # check if fields are the same - fields_orig = df.columns.tolist() - assert fields_orig == [ - "Drug_ID", - "Drug", - "Y", - ] - - # overwrite column names = fields - fields_clean = [ - "compound_id", - "SMILES", - "activity_tyrosyl_dna_phosphodiesterase", - ] - df.columns = fields_clean - - # # data cleaning - # df.compound_id = ( - # df.compound_id.str.strip() - # ) # remove leading and trailing white space characters - - assert not df.duplicated().sum() - - # save to csv - fn_data_csv = "data_clean.csv" - df.to_csv(fn_data_csv, index=False) - - # create meta yaml - meta = { - "name": "tyrosyl-dna_phosphodiesterase_butkiewicz", # unique identifier, we will also use this for directory names - "description": """These are nine high-quality high-throughput screening (HTS) datasets from [1]. These datasets were curated from HTS data at the PubChem database [2]. Typically, HTS categorizes small molecules into hit, inactive, or unspecified against a certain therapeutic target. However, a compound may be falsely classified as a hit due to experimental artifacts such as optical interference. 
Moreover, because the screening is performed without duplicates, and the cutoff is often set loose to minimize the false negative rates, the results from the primary screens often contain high false positive rates [3]. Hence the result from the primary screen is only used as the first iteration to reduce the compound library to a smaller set of further confirmatory tests. Here each dataset is carefully collated through confirmation screens to validate active compounds. The curation process is documented in [1]. Each dataset is identified by the PubChem Assay ID (AID). Features of the datasets: (1) At least 150 confirmed active compounds present; (2) Diverse target classes; (3) Realistic (large number and highly imbalanced label).""", - "targets": [ - { - "id": "activity_tyrosyl_dna_phosphodiesterase", # name of the column in a tabular dataset - "description": "whether it active against tyrosyl-dna phosphodiesterase receptor (1) or not (0).", # description of what this column means - "units": "activity", # units of the values in this column (leave empty if unitless) - "type": "categorical", # can be "categorical", "ordinal", "continuous" - "names": [ # names for the property (to sample from for building the prompts) - "tyrosyl-dna phosphodiesterase activity", - "tyrosyl-dna phosphodiesterase Inhibitor", - "activity against tyrosyl-dna phosphodiesterase", - "tyrosyl-dna phosphodiesterase receptor", - ], - }, - ], - "identifiers": [ - { - "id": "SMILES", # column name - "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "SMILES", # description (optional, except for "Other") - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) 
- { - "url": "https://doi.org/10.3390/molecules18010735", - "description": "corresponding publication", - }, - { - "url": "https://doi.org/10.1093/nar/gky1033", - "description": "corresponding publication", - }, - { - "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/", - "description": "corresponding publication", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "url": "https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al", - "bibtex": [ - """@article{Butkiewicz2013, - doi = {10.3390/molecules18010735}, - url = {https://doi.org/10.3390/molecules18010735}, - year = {2013}, - month = jan, - publisher = {{MDPI} {AG}}, - volume = {18}, - number = {1}, - pages = {735--756}, - author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall and Pedro Teixeira and C. Weaver and Jens Meiler}, - title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, - journal = {Molecules}}""", - """@article{Kim2018, - doi = {10.1093/nar/gky1033}, - url = {https://doi.org/10.1093/nar/gky1033}, - year = {2018}, - month = oct, - publisher = {Oxford University Press ({OUP})}, - volume = {47}, - number = {D1}, - pages = {D1102--D1109}, - author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, - title = {{PubChem} 2019 update: improved access to chemical data}, - journal = {Nucleic Acids Research}}""", - """@article{Butkiewicz2017, - doi = {}, - url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, - year = {2017}, - publisher = {Chem Inform}, - volume = {3}, - number = {1}, - author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, - title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, - journal = {Chemical Science}}""", - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - - -if __name__ == "__main__": - get_and_transform_data() diff --git a/data/ClinTox/example_processing_and_templates.ipynb b/data/ClinTox/example_processing_and_templates.ipynb deleted file mode 100644 index 490e582f5..000000000 --- a/data/ClinTox/example_processing_and_templates.ipynb +++ /dev/null @@ -1,1263 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "27c08f91-3fa0-4800-8f6a-96a96f665cad", - "metadata": {}, - "source": [ - "# ClinTox" - ] - }, - { - "cell_type": "markdown", - "id": "6ef172b9-aad2-47da-bf4c-844a2a07ee8c", - "metadata": {}, - "source": [ - "Original data repository: https://tdcommons.ai/single_pred_tasks/tox/#clintox" - ] - }, - { - "cell_type": "markdown", - "id": "7d18c95d-2ec6-45e1-addc-54a890097b8e", - "metadata": {}, - "source": [ - "# Imports" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "cf59e3e9-8061-4022-9eae-e978311b4155", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import yaml\n", - "from tdc.single_pred import Tox" - ] - }, - { - "cell_type": "markdown", - "id": "a6751ff9-2e3e-4d01-8395-7a5ae0c200d7", - 
"metadata": {}, - "source": [ - "# Data processing" - ] - }, - { - "cell_type": "markdown", - "id": "a1169ad2-e4bb-41c6-9625-6d1644c44a5b", - "metadata": {}, - "source": [ - "## Download data" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "7bb8eb5e-f513-40d2-a68c-7cda1a51ad31", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fn_data_original = \"data_original.csv\"" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "b39a142e-ccbc-49d2-98b0-a5f9bde9fd27", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading...\n", - "100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 110k/110k [00:00<00:00, 292kiB/s]\n", - "Loading...\n", - "Done!\n" - ] - } - ], - "source": [ - "data = Tox(name = 'ClinTox')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "26d9f62a-07f5-4113-8161-d5dfcf0bfb71", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "data.get_data().to_csv(fn_data_original, index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "43873fc3-20a8-487d-a7c5-33bd58414159", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 160K\r\n", - "drwxrwxr-x 2 melo melo 4.0K ู…ุงุฑ 2 16:52 data\r\n", - "-rw-rw-r-- 1 melo melo 102K ู…ุงุฑ 2 16:52 data_original.csv\r\n", - "-rw-rw-r-- 1 melo melo 37K ู…ุงุฑ 2 16:52 example_processing_and_templates.ipynb\r\n", - "-rw-rw-r-- 1 melo melo 1.4K ู…ุงุฑ 2 07:55 meta.yaml\r\n", - "-rw-rw-r-- 1 melo melo 4.5K ู…ุงุฑ 2 07:53 transform.py\r\n" - ] - } - ], - "source": [ - "!ls -lh" - ] - }, - { - "cell_type": "markdown", - "id": "d9cda29a-a133-4f0e-992b-e77c9070ee93", - "metadata": {}, - "source": [ - "## Load original data" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": 
"77f614e7-b133-40bc-8759-2d930e4c120e", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Drug_ID,Drug,Y\r\n", - "Drug 0,*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC,0\r\n", - "Drug 1,Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)[C@H]1Cl,0\r\n", - "Drug 2,O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(=O)[O-],0\r\n", - "Drug 3,[H]/[NH+]=C(\\N)C1=CC(=O)/C(=C\\C=c2ccc(=C(N)[NH3+])cc2)C=C1,0\r\n" - ] - } - ], - "source": [ - "!head -n 5 {fn_data_original}" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "8f5a0387-f9e3-4e1a-8d14-5df618195f70", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df = pd.read_csv(fn_data_original, delimiter=\",\")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "55b0bd63-62a0-469e-9d8a-e9ada3fe01c4", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Drug_IDDrugY
0Drug 0*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC0
1Drug 1Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)...0
2Drug 2O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(...0
3Drug 3[H]/[NH+]=C(\\N)C1=CC(=O)/C(=C\\C=c2ccc(=C(N)[NH...0
4Drug 4[H]/[NH+]=C(\\N)c1ccc(OCCCCCOc2ccc(/C(N)=[NH+]/...0
\n", - "
" - ], - "text/plain": [ - " Drug_ID Drug Y\n", - "0 Drug 0 *C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC 0\n", - "1 Drug 1 Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)... 0\n", - "2 Drug 2 O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(... 0\n", - "3 Drug 3 [H]/[NH+]=C(\\N)C1=CC(=O)/C(=C\\C=c2ccc(=C(N)[NH... 0\n", - "4 Drug 4 [H]/[NH+]=C(\\N)c1ccc(OCCCCCOc2ccc(/C(N)=[NH+]/... 0" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "95158ac1-05d7-4a21-b8e4-7f720306d331", - "metadata": {}, - "source": [ - "## Add column = field names\n", - "Clean column names (`fields_clean`) and keep original names (`fields_orig`)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "ec2458e5-455f-4f03-8ce9-c0d12e9ed371", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['Drug_ID', 'Drug', 'Y']" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fields_orig = df.columns.tolist()\n", - "fields_orig" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "08197f62", - "metadata": {}, - "outputs": [], - "source": [ - "assert fields_orig == ['Drug_ID', 'Drug', 'Y']" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "a46dd8ff-37b3-4894-8226-3bf98226dd09", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fields_clean = [\n", - " \"compound_id\",\n", - " \"SMILES\",\n", - " \"clinical_toxicity\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "785d37cb-1fb4-4a91-a923-d5a78a37f36a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df.columns = fields_clean" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "8b4cde30", - "metadata": {}, - "outputs": [], - "source": [ - "assert fields_orig != fields_clean" - ] - }, - { - "cell_type": "code", - 
"execution_count": 15, - "id": "1bf212cb-1653-457b-9f5d-416d4dd14b53", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
compound_idSMILESclinical_toxicity
0Drug 0*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC0
1Drug 1Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)...0
2Drug 2O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(...0
3Drug 3[H]/[NH+]=C(\\N)C1=CC(=O)/C(=C\\C=c2ccc(=C(N)[NH...0
4Drug 4[H]/[NH+]=C(\\N)c1ccc(OCCCCCOc2ccc(/C(N)=[NH+]/...0
\n", - "
" - ], - "text/plain": [ - " compound_id SMILES \\\n", - "0 Drug 0 *C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC \n", - "1 Drug 1 Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)... \n", - "2 Drug 2 O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(... \n", - "3 Drug 3 [H]/[NH+]=C(\\N)C1=CC(=O)/C(=C\\C=c2ccc(=C(N)[NH... \n", - "4 Drug 4 [H]/[NH+]=C(\\N)c1ccc(OCCCCCOc2ccc(/C(N)=[NH+]/... \n", - "\n", - " clinical_toxicity \n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 " - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "d544fa60-343e-40e1-bd0c-4750f07a7145", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "assert not df.duplicated().sum()" - ] - }, - { - "cell_type": "markdown", - "id": "bc6f52c1-e0f6-48b3-95f4-e36d9a5ecde8", - "metadata": {}, - "source": [ - "## Save to csv" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "d6d5efa5-b4b4-4a25-8626-e10f3d691e83", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fn_data_csv = \"data_clean.csv\"" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "727f8d7b-cbb6-43c7-9eab-9d4d65be6b3f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df.to_csv(fn_data_csv, index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "63c8d4a4-906e-418d-be39-879365b4dfa0", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-rw-rw-r-- 1 melo melo 102K ู…ุงุฑ 2 16:52 data_clean.csv\r\n" - ] - } - ], - "source": [ - "!ls -lh {fn_data_csv}" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "a51b9001-25d7-4e0e-a607-477cfc4a9f1c", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "compound_id,SMILES,clinical_toxicity\r\n", - "Drug 
0,*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC,0\r\n", - "Drug 1,Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)[C@H]1Cl,0\r\n", - "Drug 2,O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(=O)[O-],0\r\n", - "Drug 3,[H]/[NH+]=C(\\N)C1=CC(=O)/C(=C\\C=c2ccc(=C(N)[NH3+])cc2)C=C1,0\r\n" - ] - } - ], - "source": [ - "!head -n 5 {fn_data_csv}" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "1a512943-4909-4d56-867d-50c151d8d607", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
compound_idSMILESclinical_toxicity
0Drug 0*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC0
1Drug 1Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)...0
2Drug 2O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(...0
3Drug 3[H]/[NH+]=C(\\N)C1=CC(=O)/C(=C\\C=c2ccc(=C(N)[NH...0
4Drug 4[H]/[NH+]=C(\\N)c1ccc(OCCCCCOc2ccc(/C(N)=[NH+]/...0
\n", - "
" - ], - "text/plain": [ - " compound_id SMILES \\\n", - "0 Drug 0 *C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC \n", - "1 Drug 1 Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)... \n", - "2 Drug 2 O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(... \n", - "3 Drug 3 [H]/[NH+]=C(\\N)C1=CC(=O)/C(=C\\C=c2ccc(=C(N)[NH... \n", - "4 Drug 4 [H]/[NH+]=C(\\N)c1ccc(OCCCCCOc2ccc(/C(N)=[NH+]/... \n", - "\n", - " clinical_toxicity \n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 " - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "f3d730ce-fce0-49df-9eb8-b917e945fa9a", - "metadata": {}, - "source": [ - "## Load from csv" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "077b0c5f-8772-4879-9317-3fa28799689b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fn_data_csv = \"data_clean.csv\"" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "6eaef0e6-2115-4793-ac43-a196b25d47a0", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df = pd.read_csv(fn_data_csv)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "43619e7c-9c82-4ff0-ae25-403861304635", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
compound_idSMILESclinical_toxicity
0Drug 0*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC0
1Drug 1Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)...0
2Drug 2O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(...0
3Drug 3[H]/[NH+]=C(\\N)C1=CC(=O)/C(=C\\C=c2ccc(=C(N)[NH...0
4Drug 4[H]/[NH+]=C(\\N)c1ccc(OCCCCCOc2ccc(/C(N)=[NH+]/...0
\n", - "
" - ], - "text/plain": [ - " compound_id SMILES \\\n", - "0 Drug 0 *C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC \n", - "1 Drug 1 Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)... \n", - "2 Drug 2 O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(... \n", - "3 Drug 3 [H]/[NH+]=C(\\N)C1=CC(=O)/C(=C\\C=c2ccc(=C(N)[NH... \n", - "4 Drug 4 [H]/[NH+]=C(\\N)c1ccc(OCCCCCOc2ccc(/C(N)=[NH+]/... \n", - "\n", - " clinical_toxicity \n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 " - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "2f06e57c-02c5-493b-af65-c8bb9ac59421", - "metadata": {}, - "source": [ - "# meta YAML" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "49771077-471d-4d71-a9a7-d6b094bbc4f3", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
compound_idSMILESclinical_toxicity
0Drug 0*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC0
1Drug 1Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)...0
2Drug 2O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(...0
3Drug 3[H]/[NH+]=C(\\N)C1=CC(=O)/C(=C\\C=c2ccc(=C(N)[NH...0
4Drug 4[H]/[NH+]=C(\\N)c1ccc(OCCCCCOc2ccc(/C(N)=[NH+]/...0
\n", - "
" - ], - "text/plain": [ - " compound_id SMILES \\\n", - "0 Drug 0 *C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC \n", - "1 Drug 1 Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)... \n", - "2 Drug 2 O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(... \n", - "3 Drug 3 [H]/[NH+]=C(\\N)C1=CC(=O)/C(=C\\C=c2ccc(=C(N)[NH... \n", - "4 Drug 4 [H]/[NH+]=C(\\N)c1ccc(OCCCCCOc2ccc(/C(N)=[NH+]/... \n", - "\n", - " clinical_toxicity \n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 " - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "d3890961-444e-4a26-b8fc-ed8c4e959af9", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "meta = {\n", - " \"name\": \"ClinTox\", # unique identifier, we will also use this for directory names\n", - " \"description\": \"\"\"The ClinTox dataset includes drugs that have failed clinical trials for toxicity reasons and also drugs that are associated with successful trials.\"\"\",\n", - " \"targets\": [\n", - " {\n", - " \"id\": \"clinical_toxicity\", # name of the column in a tabular dataset\n", - " \"description\": \"whether it can cause clinical toxicity (1) or not (0).\", # description of what this column means\n", - " \"units\": \"clinical_toxicity\", # units of the values in this column (leave empty if unitless)\n", - " \"type\": \"categorical\", # can be \"categorical\", \"ordinal\", \"continuous\"\n", - " \"names\": [ # names for the property (to sample from for building the prompts)\n", - " \"clinical toxicity\",\n", - " \"toxicity\",\n", - " \"drug Induced clinical toxicity\",\n", - " \"drug failed in clinical trials\"\n", - " ],\n", - " },\n", - " ],\n", - " \"identifiers\": [\n", - " {\n", - " \"id\": \"SMILES\", # column name\n", - " \"type\": \"SMILES\", # can be \"SMILES\", \"SELFIES\", \"IUPAC\", \"Other\"\n", - " \"description\": \"SMILES\", # description (optional, except for 
\"Other\")\n", - " },\n", - " ],\n", - " \"license\": \"CC BY 4.0\", # license under which the original dataset was published\n", - " \"links\": [ # list of relevant links (original dataset, other uses, etc.)\n", - " {\n", - " \"url\": \"https://doi.org/10.1016/j.chembiol.2016.07.023\",\n", - " \"description\": \"corresponding publication\",\n", - " },\n", - " ],\n", - " \"num_points\": len(df), # number of datapoints in this dataset\n", - " \"url\": \"https://tdcommons.ai/single_pred_tasks/tox/#clintox\",\n", - " \"bibtex\": [\n", - " \"\"\"@article{Gayvert2016,\n", - " doi = {10.1016/j.chembiol.2016.07.023},\n", - " url = {https://doi.org/10.1016/j.chembiol.2016.07.023},\n", - " year = {2016},\n", - " month = oct,\n", - " publisher = {Elsevier {BV}},\n", - " volume = {23},\n", - " number = {10},\n", - " pages = {1294--1301},\n", - " author = {Kaitlyn~M. Gayvert and Neel~S. Madhukar and Olivier Elemento},\n", - " title = {A Data-Driven Approach to Predicting Successes and Failures of Clinical Trials},\n", - " journal = {Cell Chemical Biology}}\"\"\",\n", - " ],\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "ec455cf0-962a-4c0d-bb3e-066e415ffd9b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def str_presenter(dumper, data):\n", - " \"\"\"configures yaml for dumping multiline strings\n", - " Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data\n", - " \"\"\"\n", - " if data.count(\"\\n\") > 0: # check for multiline string\n", - " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data, style=\"|\")\n", - " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data)\n", - "\n", - "\n", - "yaml.add_representer(str, str_presenter)\n", - "yaml.representer.SafeRepresenter.add_representer(\n", - " str, str_presenter\n", - ") # to use with safe_dum" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": 
"580bbd79-4845-4515-be94-3e4a9815d048", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fn_meta = \"meta.yaml\"" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "873fa5dd-9b60-40f5-b537-4d7a206414ea", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "with open(fn_meta, \"w\") as f:\n", - " yaml.dump(meta, f, sort_keys=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "d01686c0-6746-4fc4-b019-350270dfc26f", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-rw-rw-r-- 1 melo melo 1.3K ู…ุงุฑ 2 16:52 meta.yaml\r\n" - ] - } - ], - "source": [ - "!ls -lh {fn_meta}" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "ef6063c5-7a8b-4344-bccf-a073443feebf", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "name: ClinTox\r\n", - "description: The ClinTox dataset includes drugs that have failed clinical trials for\r\n", - " toxicity reasons and also drugs that are associated with successful trials.\r\n", - "targets:\r\n", - "- id: clinical_toxicity\r\n", - " description: whether it can cause clinical toxicity (1) or not (0).\r\n", - " units: clinical_toxicity\r\n", - " type: categorical\r\n", - " names:\r\n", - " - clinical toxicity\r\n", - " - toxicity\r\n", - " - drug Induced clinical toxicity\r\n", - " - drug failed in clinical trials\r\n", - "identifiers:\r\n", - "- id: SMILES\r\n", - " type: SMILES\r\n", - " description: SMILES\r\n", - "license: CC BY 4.0\r\n", - "links:\r\n", - "- url: https://doi.org/10.1016/j.chembiol.2016.07.023\r\n", - " description: corresponding publication\r\n", - "num_points: 1478\r\n", - "url: https://tdcommons.ai/single_pred_tasks/tox/#clintox\r\n", - "bibtex:\r\n", - "- |-\r\n", - " @article{Gayvert2016,\r\n", - " doi = {10.1016/j.chembiol.2016.07.023},\r\n", - " url = 
{https://doi.org/10.1016/j.chembiol.2016.07.023},\r\n", - " year = {2016},\r\n", - " month = oct,\r\n", - " publisher = {Elsevier {BV}},\r\n", - " volume = {23},\r\n", - " number = {10},\r\n", - " pages = {1294--1301},\r\n", - " author = {Kaitlyn~M. Gayvert and Neel~S. Madhukar and Olivier Elemento},\r\n", - " title = {A Data-Driven Approach to Predicting Successes and Failures of Clinical Trials},\r\n", - " journal = {Cell Chemical Biology}}\r\n" - ] - } - ], - "source": [ - "!cat {fn_meta}" - ] - }, - { - "cell_type": "markdown", - "id": "bd3f930a-638b-4bb7-a1d2-80688f2f6891", - "metadata": {}, - "source": [ - "# create transform.py" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "9aab00fd-58a8-40b0-be30-1e269e0d323b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "path_file = \"transform.py\"" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "8368bb20-8e1c-4b7d-b0e2-b39da36b5972", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Overwriting transform.py\n" - ] - } - ], - "source": [ - "%%writefile $path_file\n", - "import pandas as pd\n", - "import yaml\n", - "from tdc.single_pred import Tox\n", - "\n", - "\n", - "def get_and_transform_data():\n", - " # get raw data\n", - " data = Tox(name = 'ClinTox')\n", - " fn_data_original = \"data_original.csv\"\n", - " data.get_data().to_csv(fn_data_original, index=False)\n", - "\n", - " # create dataframe\n", - " df = pd.read_csv(\n", - " fn_data_original,\n", - " delimiter=\",\",\n", - " ) # not necessary but ensure we can load the saved data\n", - "\n", - " # check if fields are the same\n", - " fields_orig = df.columns.tolist()\n", - " assert fields_orig == [\n", - " \"Drug_ID\",\n", - " \"Drug\",\n", - " \"Y\",\n", - " ]\n", - "\n", - " # overwrite column names = fields\n", - " fields_clean =[\n", - " \"compound_id\",\n", - " \"SMILES\",\n", - " \"clinical_toxicity\",\n", - " ]\n", - " 
df.columns = fields_clean\n", - "\n", - " # data cleaning\n", - " df.compound_id = (\n", - " df.compound_id.str.strip()\n", - " ) # remove leading and trailing white space characters\n", - "\n", - " assert not df.duplicated().sum()\n", - "\n", - " # save to csv\n", - " fn_data_csv = \"data_clean.csv\"\n", - " df.to_csv(fn_data_csv, index=False)\n", - "\n", - " # create meta yaml\n", - " meta = {\"name\": \"ClinTox\", # unique identifier, we will also use this for directory names\n", - " \"description\": \"\"\"The ClinTox dataset includes drugs that have failed clinical trials for toxicity reasons and also drugs that are associated with successful trials.\"\"\",\n", - " \"targets\": [\n", - " {\n", - " \"id\": \"clinical_toxicity\", # name of the column in a tabular dataset\n", - " \"description\": \"whether it can cause clinical toxicity (1) or not (0).\", # description of what this column means\n", - " \"units\": \"clinical_toxicity\", # units of the values in this column (leave empty if unitless)\n", - " \"type\": \"categorical\", # can be \"categorical\", \"ordinal\", \"continuous\"\n", - " \"names\": [ # names for the property (to sample from for building the prompts)\n", - " \"clinical toxicity\",\n", - " \"toxicity\",\n", - " \"drug Induced clinical toxicity\",\n", - " \"drug failed in clinical trials\"\n", - " ],\n", - " },\n", - " ],\n", - " \"identifiers\": [\n", - " {\n", - " \"id\": \"SMILES\", # column name\n", - " \"type\": \"SMILES\", # can be \"SMILES\", \"SELFIES\", \"IUPAC\", \"Other\"\n", - " \"description\": \"SMILES\", # description (optional, except for \"Other\")\n", - " },\n", - " ],\n", - " \"license\": \"CC BY 4.0\", # license under which the original dataset was published\n", - " \"links\": [ # list of relevant links (original dataset, other uses, etc.)\n", - " {\n", - " \"url\": \"https://doi.org/10.1016/j.chembiol.2016.07.023\",\n", - " \"description\": \"corresponding publication\",\n", - " },\n", - " ],\n", - " \"num_points\": len(df), 
# number of datapoints in this dataset\n", - " \"url\": \"https://tdcommons.ai/single_pred_tasks/tox/#clintox\",\n", - " \"bibtex\": [\n", - " \"\"\"@article{Gayvert2016,\n", - " doi = {10.1016/j.chembiol.2016.07.023},\n", - " url = {https://doi.org/10.1016/j.chembiol.2016.07.023},\n", - " year = {2016},\n", - " month = oct,\n", - " publisher = {Elsevier {BV}},\n", - " volume = {23},\n", - " number = {10},\n", - " pages = {1294--1301},\n", - " author = {Kaitlyn~M. Gayvert and Neel~S. Madhukar and Olivier Elemento},\n", - " title = {A Data-Driven Approach to Predicting Successes and Failures of Clinical Trials},\n", - " journal = {Cell Chemical Biology}}\"\"\",\n", - " ],\n", - " }\n", - "\n", - " def str_presenter(dumper, data):\n", - " \"\"\"configures yaml for dumping multiline strings\n", - " Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data\n", - " \"\"\"\n", - " if data.count(\"\\n\") > 0: # check for multiline string\n", - " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data, style=\"|\")\n", - " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data)\n", - "\n", - " yaml.add_representer(str, str_presenter)\n", - " yaml.representer.SafeRepresenter.add_representer(\n", - " str, str_presenter\n", - " ) # to use with safe_dum\n", - " fn_meta = \"meta.yaml\"\n", - " with open(fn_meta, \"w\") as f:\n", - " yaml.dump(meta, f, sort_keys=False)\n", - "\n", - " print(f\"Finished processing {meta['name']} dataset!\")\n", - "\n", - "\n", - "if __name__ == \"__main__\":\n", - " get_and_transform_data()" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "d0474f26-70f3-4655-b81a-df4ada90e7a6", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found local copy...\r\n", - "Loading...\r\n", - "Done!\r\n", - "Finished processing ClinTox dataset!\r\n" - ] - } - ], - "source": [ - "!python3 transform.py" - ] - }, - { - 
"cell_type": "code", - "execution_count": 35, - "id": "953e7bee-bd5e-41d0-a2be-506e0bc97727", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 264K\r\n", - "drwxrwxr-x 2 melo melo 4.0K ู…ุงุฑ 2 16:52 \u001b[0m\u001b[01;34mdata\u001b[0m/\r\n", - "-rw-rw-r-- 1 melo melo 102K ู…ุงุฑ 2 16:52 data_clean.csv\r\n", - "-rw-rw-r-- 1 melo melo 102K ู…ุงุฑ 2 16:52 data_original.csv\r\n", - "-rw-rw-r-- 1 melo melo 37K ู…ุงุฑ 2 16:52 example_processing_and_templates.ipynb\r\n", - "-rw-rw-r-- 1 melo melo 1.3K ู…ุงุฑ 2 16:52 meta.yaml\r\n", - "-rw-rw-r-- 1 melo melo 4.2K ู…ุงุฑ 2 16:52 transform.py\r\n" - ] - } - ], - "source": [ - "ls -lh # fmt: skip" - ] - }, - { - "cell_type": "markdown", - "id": "0b08ed06-ba66-4f76-bde1-368ea77d1739", - "metadata": {}, - "source": [ - "# End" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/data/ClinTox/meta.yaml b/data/ClinTox/meta.yaml deleted file mode 100644 index 963dcd108..000000000 --- a/data/ClinTox/meta.yaml +++ /dev/null @@ -1,38 +0,0 @@ ---- -name: ClinTox -description: The ClinTox dataset includes drugs that have failed clinical trials for toxicity reasons and also drugs that are associated with successful - trials. -targets: - - id: clinical_toxicity - description: whether it can cause clinical toxicity (1) or not (0). 
- units: clinical_toxicity - type: categorical - names: - - clinical toxicity - - toxicity - - drug Induced clinical toxicity - - drug failed in clinical trials -identifiers: - - id: SMILES - type: SMILES - description: SMILES -license: CC BY 4.0 -links: - - url: https://doi.org/10.1016/j.chembiol.2016.07.023 - description: corresponding publication -num_points: 1478 -url: https://tdcommons.ai/single_pred_tasks/tox/#clintox -bibtex: - - |- - @article{Gayvert2016, - doi = {10.1016/j.chembiol.2016.07.023}, - url = {https://doi.org/10.1016/j.chembiol.2016.07.023}, - year = {2016}, - month = oct, - publisher = {Elsevier {BV}}, - volume = {23}, - number = {10}, - pages = {1294--1301}, - author = {Kaitlyn~M. Gayvert and Neel~S. Madhukar and Olivier Elemento}, - title = {A Data-Driven Approach to Predicting Successes and Failures of Clinical Trials}, - journal = {Cell Chemical Biology}} diff --git a/data/caco2_wang/example_processing_and_templates.ipynb b/data/caco2_wang/example_processing_and_templates.ipynb deleted file mode 100644 index 33f86ed20..000000000 --- a/data/caco2_wang/example_processing_and_templates.ipynb +++ /dev/null @@ -1,1253 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "27c08f91-3fa0-4800-8f6a-96a96f665cad", - "metadata": {}, - "source": [ - "# Caco-2 (Cell Effective Permeability), Wang et al." 
- ] - }, - { - "cell_type": "markdown", - "id": "6ef172b9-aad2-47da-bf4c-844a2a07ee8c", - "metadata": {}, - "source": [ - "Original data repository: https://tdcommons.ai/single_pred_tasks/adme/#caco-2-cell-effective-permeability-wang-et-al" - ] - }, - { - "cell_type": "markdown", - "id": "7d18c95d-2ec6-45e1-addc-54a890097b8e", - "metadata": {}, - "source": [ - "# Imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "cf59e3e9-8061-4022-9eae-e978311b4155", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import yaml\n", - "from tdc.single_pred import ADME" - ] - }, - { - "cell_type": "markdown", - "id": "a6751ff9-2e3e-4d01-8395-7a5ae0c200d7", - "metadata": {}, - "source": [ - "# Data processing" - ] - }, - { - "cell_type": "markdown", - "id": "a1169ad2-e4bb-41c6-9625-6d1644c44a5b", - "metadata": {}, - "source": [ - "## Download data" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "7bb8eb5e-f513-40d2-a68c-7cda1a51ad31", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fn_data_original = \"data_original.csv\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "b39a142e-ccbc-49d2-98b0-a5f9bde9fd27", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found local copy...\n", - "Loading...\n", - "Done!\n" - ] - } - ], - "source": [ - "data = ADME(name=\"Caco2_Wang\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "26d9f62a-07f5-4113-8161-d5dfcf0bfb71", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "data.get_data().to_csv(fn_data_original, index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "43873fc3-20a8-487d-a7c5-33bd58414159", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!ls -lh" - ] - }, - { - "cell_type": "markdown", - "id": "d9cda29a-a133-4f0e-992b-e77c9070ee93", - "metadata": {}, - 
"source": [ - "## Load original data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "77f614e7-b133-40bc-8759-2d930e4c120e", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Drug_ID,Drug,Y\n", - "(-)-epicatechin,Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2,-6.2199998\n", - "\"(2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -ynamide\",C/C=C\\C#CCC/C=C\\C=C\\C(=O)NCC(C)C,-3.8599999\n", - "codeine,COc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2)N(C)CC[C@]314,-4.0900002\n", - "creatinine,CN1CC(=O)NC1=N,-5.935409099999998\n" - ] - } - ], - "source": [ - "!head -n 5 {fn_data_original}" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "8f5a0387-f9e3-4e1a-8d14-5df618195f70", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df = pd.read_csv(fn_data_original, delimiter=\",\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "55b0bd63-62a0-469e-9d8a-e9ada3fe01c4", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Drug_IDDrugY
0(-)-epicatechinOc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2-6.220000
1(2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -y...C/C=C\\C#CCC/C=C\\C=C\\C(=O)NCC(C)C-3.860000
2codeineCOc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2...-4.090000
3creatinineCN1CC(=O)NC1=N-5.935409
4danazolC#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(...-4.840000
\n", - "
" - ], - "text/plain": [ - " Drug_ID \\\n", - "0 (-)-epicatechin \n", - "1 (2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -y... \n", - "2 codeine \n", - "3 creatinine \n", - "4 danazol \n", - "\n", - " Drug Y \n", - "0 Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2 -6.220000 \n", - "1 C/C=C\\C#CCC/C=C\\C=C\\C(=O)NCC(C)C -3.860000 \n", - "2 COc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2... -4.090000 \n", - "3 CN1CC(=O)NC1=N -5.935409 \n", - "4 C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(... -4.840000 " - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "95158ac1-05d7-4a21-b8e4-7f720306d331", - "metadata": {}, - "source": [ - "## Add column = field names\n", - "Clean column names (`fields_clean`) and keep original names (`fields_orig`)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "ec2458e5-455f-4f03-8ce9-c0d12e9ed371", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['Drug_ID', 'Drug', 'Y']" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fields_orig = df.columns.tolist()\n", - "fields_orig" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "a46dd8ff-37b3-4894-8226-3bf98226dd09", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fields_clean = [\n", - " \"compound_name\",\n", - " \"SMILES\",\n", - " \"permeability\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "785d37cb-1fb4-4a91-a923-d5a78a37f36a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df.columns = fields_clean" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "1bf212cb-1653-457b-9f5d-416d4dd14b53", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
compound_nameSMILESpermeability
0(-)-epicatechinOc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2-6.220000
1(2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -y...C/C=C\\C#CCC/C=C\\C=C\\C(=O)NCC(C)C-3.860000
2codeineCOc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2...-4.090000
3creatinineCN1CC(=O)NC1=N-5.935409
4danazolC#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(...-4.840000
\n", - "
" - ], - "text/plain": [ - " compound_name \\\n", - "0 (-)-epicatechin \n", - "1 (2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -y... \n", - "2 codeine \n", - "3 creatinine \n", - "4 danazol \n", - "\n", - " SMILES permeability \n", - "0 Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2 -6.220000 \n", - "1 C/C=C\\C#CCC/C=C\\C=C\\C(=O)NCC(C)C -3.860000 \n", - "2 COc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2... -4.090000 \n", - "3 CN1CC(=O)NC1=N -5.935409 \n", - "4 C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(... -4.840000 " - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "1bfaeb22-26fb-4964-a71f-cae8335e5372", - "metadata": {}, - "source": [ - "## Data cleaning" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "7e746003-cb1f-434f-bba6-00f0c439c4ac", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df.compound_name = (\n", - " df.compound_name.str.strip()\n", - ") # remove leading and trailing white space characters" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "d544fa60-343e-40e1-bd0c-4750f07a7145", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "assert not df.duplicated().sum()" - ] - }, - { - "cell_type": "markdown", - "id": "bc6f52c1-e0f6-48b3-95f4-e36d9a5ecde8", - "metadata": {}, - "source": [ - "## Save to csv" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "d6d5efa5-b4b4-4a25-8626-e10f3d691e83", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fn_data_csv = \"data_clean.csv\"" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "727f8d7b-cbb6-43c7-9eab-9d4d65be6b3f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df.to_csv(fn_data_csv, index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "63c8d4a4-906e-418d-be39-879365b4dfa0", - "metadata": { - "tags": [] - }, 
- "outputs": [], - "source": [ - "!ls -lh {fn_data_csv}" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "a51b9001-25d7-4e0e-a607-477cfc4a9f1c", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "compound_name,SMILES,permeability\n", - "(-)-epicatechin,Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2,-6.2199998\n", - "\"(2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -ynamide\",C/C=C\\C#CCC/C=C\\C=C\\C(=O)NCC(C)C,-3.8599999\n", - "codeine,COc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2)N(C)CC[C@]314,-4.0900002\n", - "creatinine,CN1CC(=O)NC1=N,-5.935409099999998\n" - ] - } - ], - "source": [ - "!head -n 5 {fn_data_csv}" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "1a512943-4909-4d56-867d-50c151d8d607", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
compound_nameSMILESpermeability
0(-)-epicatechinOc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2-6.220000
1(2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -y...C/C=C\\C#CCC/C=C\\C=C\\C(=O)NCC(C)C-3.860000
2codeineCOc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2...-4.090000
3creatinineCN1CC(=O)NC1=N-5.935409
4danazolC#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(...-4.840000
\n", - "
" - ], - "text/plain": [ - " compound_name \\\n", - "0 (-)-epicatechin \n", - "1 (2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -y... \n", - "2 codeine \n", - "3 creatinine \n", - "4 danazol \n", - "\n", - " SMILES permeability \n", - "0 Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2 -6.220000 \n", - "1 C/C=C\\C#CCC/C=C\\C=C\\C(=O)NCC(C)C -3.860000 \n", - "2 COc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2... -4.090000 \n", - "3 CN1CC(=O)NC1=N -5.935409 \n", - "4 C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(... -4.840000 " - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "f3d730ce-fce0-49df-9eb8-b917e945fa9a", - "metadata": {}, - "source": [ - "## Load from csv" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "077b0c5f-8772-4879-9317-3fa28799689b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fn_data_csv = \"data_clean.csv\"" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "6eaef0e6-2115-4793-ac43-a196b25d47a0", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df = pd.read_csv(fn_data_csv)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "43619e7c-9c82-4ff0-ae25-403861304635", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
compound_nameSMILESpermeability
0(-)-epicatechinOc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2-6.220000
1(2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -y...C/C=C\\C#CCC/C=C\\C=C\\C(=O)NCC(C)C-3.860000
2codeineCOc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2...-4.090000
3creatinineCN1CC(=O)NC1=N-5.935409
4danazolC#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(...-4.840000
\n", - "
" - ], - "text/plain": [ - " compound_name \\\n", - "0 (-)-epicatechin \n", - "1 (2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -y... \n", - "2 codeine \n", - "3 creatinine \n", - "4 danazol \n", - "\n", - " SMILES permeability \n", - "0 Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2 -6.220000 \n", - "1 C/C=C\\C#CCC/C=C\\C=C\\C(=O)NCC(C)C -3.860000 \n", - "2 COc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2... -4.090000 \n", - "3 CN1CC(=O)NC1=N -5.935409 \n", - "4 C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(... -4.840000 " - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "2f06e57c-02c5-493b-af65-c8bb9ac59421", - "metadata": {}, - "source": [ - "# meta YAML" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "49771077-471d-4d71-a9a7-d6b094bbc4f3", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
compound_nameSMILESpermeability
0(-)-epicatechinOc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2-6.220000
1(2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -y...C/C=C\\C#CCC/C=C\\C=C\\C(=O)NCC(C)C-3.860000
2codeineCOc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2...-4.090000
3creatinineCN1CC(=O)NC1=N-5.935409
4danazolC#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(...-4.840000
\n", - "
" - ], - "text/plain": [ - " compound_name \\\n", - "0 (-)-epicatechin \n", - "1 (2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -y... \n", - "2 codeine \n", - "3 creatinine \n", - "4 danazol \n", - "\n", - " SMILES permeability \n", - "0 Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2 -6.220000 \n", - "1 C/C=C\\C#CCC/C=C\\C=C\\C(=O)NCC(C)C -3.860000 \n", - "2 COc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2... -4.090000 \n", - "3 CN1CC(=O)NC1=N -5.935409 \n", - "4 C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(... -4.840000 " - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d3890961-444e-4a26-b8fc-ed8c4e959af9", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "meta = {\n", - " \"name\": \"caco2_wang\", # unique identifier, we will also use this for directory names\n", - " \"description\": \"\"\"The human colon epithelial cancer cell line, Caco-2, is used as an in vitro model\n", - " to simulate the human intestinal tissue. 
The experimental result on the rate of drug passing through\n", - " the Caco-2 cells can approximate the rate at which the drug permeates through the human intestinal tissue.\"\"\",\n", - " \"targets\": [\n", - " {\n", - " \"id\": \"permeability\", # name of the column in a tabular dataset\n", - " \"description\": \"Caco-2 cell effective permeability.\", # description of what this column means\n", - " \"units\": \"logPapp\", # units of the values in this column (leave empty if unitless)\n", - " \"type\": \"continuous\", # can be \"categorical\", \"ordinal\", \"continuous\"\n", - " \"names\": [ # names for the property (to sample from for building the prompts)\n", - " \"Caco-2 cell effective permeability\",\n", - " \"Caco-2 cell permeability\",\n", - " \"Caco-2 permeability\",\n", - " \"permeability\",\n", - " ],\n", - " },\n", - " ],\n", - " \"identifiers\": [\n", - " {\n", - " \"id\": \"SMILES\", # column name\n", - " \"type\": \"SMILES\", # can be \"SMILES\", \"SELFIES\", \"IUPAC\", \"Other\"\n", - " \"description\": \"SMILES\", # description (optional, except for \"Other\")\n", - " },\n", - " {\n", - " \"id\": \"compound_name\",\n", - " \"type\": \"Other\",\n", - " \"description\": \"compound name\",\n", - " \"names\": [\n", - " \"compound\",\n", - " \"compound name\",\n", - " ],\n", - " },\n", - " ],\n", - " \"license\": \"CC BY 4.0\", # license under which the original dataset was published\n", - " \"links\": [ # list of relevant links (original dataset, other uses, etc.)\n", - " {\n", - " \"url\": \"https://tdcommons.ai/single_pred_tasks/adme/#caco-2-cell-effective-permeability-wang-et-al\",\n", - " \"description\": \"original data set link\",\n", - " },\n", - " {\n", - " \"url\": \"https://pubs.acs.org/doi/10.1021/acs.jcim.5b00642\",\n", - " \"description\": \"corresponding publication\",\n", - " },\n", - " ],\n", - " \"num_points\": len(df), # number of datapoints in this dataset\n", - " \"bibtex\": [\n", - " \"\"\"@article{wang2016adme,\n", - "title={ADME 
properties evaluation in drug discovery: prediction of Caco-2 cell permeability\n", - "using a combination of NSGA-II and boosting},\n", - "author={Wang, Ning-Ning and Dong, Jie and Deng, Yin-Hua and Zhu, Min-Feng and Wen, Ming and Yao,\n", - "Zhi-Jiang and Lu, Ai-Ping and Wang, Jian-Bing and Cao, Dong-Sheng},\n", - "journal={Journal of Chemical Information and Modeling},\n", - "volume={56},\n", - "number={4},\n", - "pages={763--773},\n", - "year={2016},\n", - "publisher={ACS Publications}\n", - "}\"\"\",\n", - " ],\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "ec455cf0-962a-4c0d-bb3e-066e415ffd9b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def str_presenter(dumper, data):\n", - " \"\"\"configures yaml for dumping multiline strings\n", - " Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data\n", - " \"\"\"\n", - " if data.count(\"\\n\") > 0: # check for multiline string\n", - " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data, style=\"|\")\n", - " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data)\n", - "\n", - "\n", - "yaml.add_representer(str, str_presenter)\n", - "yaml.representer.SafeRepresenter.add_representer(\n", - " str, str_presenter\n", - ") # to use with safe_dum" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "580bbd79-4845-4515-be94-3e4a9815d048", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fn_meta = \"meta.yaml\"" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "873fa5dd-9b60-40f5-b537-4d7a206414ea", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "with open(fn_meta, \"w\") as f:\n", - " yaml.dump(meta, f, sort_keys=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d01686c0-6746-4fc4-b019-350270dfc26f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!ls -lh {fn_meta}" - ] - }, 
- { - "cell_type": "code", - "execution_count": 49, - "id": "ef6063c5-7a8b-4344-bccf-a073443feebf", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "name: caco2_wang\n", - "description: |-\n", - " The human colon epithelial cancer cell line, Caco-2, is used as an in vitro model\n", - " to simulate the human intestinal tissue. The experimental result on the rate of drug passing through\n", - " the Caco-2 cells can approximate the rate at which the drug permeates through the human intestinal tissue.\n", - "targets:\n", - "- id: permeability\n", - " description: Caco-2 cell effective permeability.\n", - " units: logPapp\n", - " type: continuous\n", - " names:\n", - " - Caco-2 cell effective permeability\n", - " - Caco-2 cell permeability\n", - " - Caco-2 permeability\n", - " - permeability\n", - "identifiers:\n", - "- id: SMILES\n", - " type: SMILES\n", - " description: SMILES\n", - "- id: compound_name\n", - " type: Other\n", - " description: compound name\n", - " names:\n", - " - compound\n", - " - compound name\n", - "license: CC BY 4.0\n", - "links:\n", - "- url: https://pubs.acs.org/doi/10.1021/acs.jcim.5b00642\n", - " description: corresponding publication\n", - "num_points: 910\n", - "url: https://tdcommons.ai/single_pred_tasks/adme/#caco-2-cell-effective-permeability-wang-et-al\n", - "bibtex:\n", - "- |-\n", - " @article{wang2016adme,\n", - " title={ADME properties evaluation in drug discovery: prediction of Caco-2 cell permeability\n", - " using a combination of NSGA-II and boosting},\n", - " author={Wang, Ning-Ning and Dong, Jie and Deng, Yin-Hua and Zhu, Min-Feng and Wen, Ming and Yao,\n", - " Zhi-Jiang and Lu, Ai-Ping and Wang, Jian-Bing and Cao, Dong-Sheng},\n", - " journal={Journal of Chemical Information and Modeling},\n", - " volume={56},\n", - " number={4},\n", - " pages={763--773},\n", - " year={2016},\n", - " publisher={ACS Publications}\n", - " }\n" - ] - } - ], - "source": [ - "!cat 
{fn_meta}" - ] - }, - { - "cell_type": "markdown", - "id": "bd3f930a-638b-4bb7-a1d2-80688f2f6891", - "metadata": {}, - "source": [ - "# create transform.py" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "9aab00fd-58a8-40b0-be30-1e269e0d323b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "path_file = \"transform.py\"" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "8368bb20-8e1c-4b7d-b0e2-b39da36b5972", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Overwriting transform.py\n" - ] - } - ], - "source": [ - "%%writefile $path_file\n", - "import pandas as pd\n", - "import yaml\n", - "from tdc.single_pred import ADME\n", - "\n", - "\n", - "def get_and_transform_data():\n", - " # get raw data\n", - " data = ADME(name=\"Caco2_Wang\")\n", - " fn_data_original = \"data_original.csv\"\n", - " data.get_data().to_csv(fn_data_original, index=False)\n", - "\n", - " # create dataframe\n", - " df = pd.read_csv(\n", - " fn_data_original,\n", - " delimiter=\",\",\n", - " ) # not necessary but ensure we can load the saved data\n", - "\n", - " # check if fields are the same\n", - " fields_orig = df.columns.tolist()\n", - " assert fields_orig == [\n", - " \"Drug_ID\",\n", - " \"Drug\",\n", - " \"Y\",\n", - " ]\n", - "\n", - " # overwrite column names = fields\n", - " fields_clean = [\n", - " \"compound_name\",\n", - " \"SMILES\",\n", - " \"permeability\",\n", - " ]\n", - " df.columns = fields_clean\n", - "\n", - " # data cleaning\n", - " df.compound_name = (\n", - " df.compound_name.str.strip()\n", - " ) # remove leading and trailing white space characters\n", - "\n", - " assert not df.duplicated().sum()\n", - "\n", - " # save to csv\n", - " fn_data_csv = \"data_clean.csv\"\n", - " df.to_csv(fn_data_csv, index=False)\n", - "\n", - " # create meta yaml\n", - " meta = {\n", - " \"name\": \"caco2_wang\", # unique identifier, we will also use this for 
directory names\n", - " \"description\": \"\"\"The human colon epithelial cancer cell line, Caco-2, is used as an in vitro model\n", - " to simulate the human intestinal tissue. The experimental result on the rate of drug passing through\n", - " the Caco-2 cells can approximate the rate at which the drug permeates through the human intestinal tissue.\"\"\",\n", - " \"targets\": [\n", - " {\n", - " \"id\": \"permeability\", # name of the column in a tabular dataset\n", - " \"description\": \"Caco-2 cell effective permeability.\", # description of what this column means\n", - " \"units\": \"logPapp\", # units of the values in this column (leave empty if unitless)\n", - " \"type\": \"continuous\", # can be \"categorical\", \"ordinal\", \"continuous\"\n", - " \"names\": [ # names for the property (to sample from for building the prompts)\n", - " \"Caco-2 cell effective permeability\",\n", - " \"Caco-2 cell permeability\",\n", - " \"Caco-2 permeability\",\n", - " \"permeability\",\n", - " ],\n", - " },\n", - " ],\n", - " \"identifiers\": [\n", - " {\n", - " \"id\": \"SMILES\", # column name\n", - " \"type\": \"SMILES\", # can be \"SMILES\", \"SELFIES\", \"IUPAC\", \"OTHER\"\n", - " \"description\": \"SMILES\", # description (optional, except for \"OTHER\")\n", - " },\n", - " {\n", - " \"id\": \"compound_name\",\n", - " \"type\": \"Other\",\n", - " \"description\": \"compound name\",\n", - " \"names\": [\n", - " \"compound\",\n", - " \"compound name\",\n", - " ],\n", - " },\n", - " ],\n", - " \"license\": \"CC BY 4.0\", # license under which the original dataset was published\n", - " \"links\": [ # list of relevant links (original dataset, other uses, etc.)\n", - " {\n", - " \"url\": \"https://tdcommons.ai/single_pred_tasks/adme/#caco-2-cell-effective-permeability-wang-et-al\",\n", - " \"description\": \"original data set link\",\n", - " },\n", - " {\n", - " \"url\": \"https://pubs.acs.org/doi/10.1021/acs.jcim.5b00642\",\n", - " \"description\": \"corresponding 
publication\",\n", - " },\n", - " ],\n", - " \"num_points\": len(df), # number of datapoints in this dataset\n", - " \"bibtex\": [\n", - " \"\"\"@article{wang2016adme,\n", - " title={ADME properties evaluation in drug discovery: prediction of Caco-2 cell permeability\n", - " using a combination of NSGA-II and boosting},\n", - " author={Wang, Ning-Ning and Dong, Jie and Deng, Yin-Hua and Zhu, Min-Feng and Wen, Ming and Yao,\n", - " Zhi-Jiang and Lu, Ai-Ping and Wang, Jian-Bing and Cao, Dong-Sheng},\n", - " journal={Journal of Chemical Information and Modeling},\n", - " volume={56},\n", - " number={4},\n", - " pages={763--773},\n", - " year={2016},\n", - " publisher={ACS Publications}\n", - " }\"\"\",\n", - " ],\n", - " }\n", - "\n", - " def str_presenter(dumper, data):\n", - " \"\"\"configures yaml for dumping multiline strings\n", - " Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data\n", - " \"\"\"\n", - " if data.count(\"\\n\") > 0: # check for multiline string\n", - " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data, style=\"|\")\n", - " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data)\n", - "\n", - " yaml.add_representer(str, str_presenter)\n", - " yaml.representer.SafeRepresenter.add_representer(\n", - " str, str_presenter\n", - " ) # to use with safe_dum\n", - " fn_meta = \"meta.yaml\"\n", - " with open(fn_meta, \"w\") as f:\n", - " yaml.dump(meta, f, sort_keys=False)\n", - "\n", - " print(f\"Finished processing {meta['name']} dataset!\")\n", - "\n", - "\n", - "if __name__ == \"__main__\":\n", - " get_and_transform_data()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d0474f26-70f3-4655-b81a-df4ada90e7a6", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!python3 transform.py" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "953e7bee-bd5e-41d0-a2be-506e0bc97727", - "metadata": { - "tags": [] - }, - 
"outputs": [], - "source": [ - "ls -lh # fmt: skip" - ] - }, - { - "cell_type": "markdown", - "id": "0b08ed06-ba66-4f76-bde1-368ea77d1739", - "metadata": {}, - "source": [ - "# End" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "chemnlp", - "language": "python", - "name": "chemnlp" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/data/caco2_wang/meta.yaml b/data/caco2_wang/meta.yaml index a511b437c..68091a5fc 100644 --- a/data/caco2_wang/meta.yaml +++ b/data/caco2_wang/meta.yaml @@ -1,19 +1,29 @@ --- name: caco2_wang description: |- - The human colon epithelial cancer cell line, Caco-2, is used as an in vitro model - to simulate the human intestinal tissue. The experimental result on the rate of drug passing through - the Caco-2 cells can approximate the rate at which the drug permeates through the human intestinal tissue. + The human colon epithelial cancer cell line, Caco-2, + is used as an in vitro model to simulate the human intestinal tissue. + The experimental result on the rate of drug passing through + the Caco-2 cells can approximate the rate at which the drug permeates + through the human intestinal tissue. targets: - id: permeability description: Caco-2 cell effective permeability. 
- units: logPapp + units: cm/s type: continuous names: - Caco-2 cell effective permeability - Caco-2 cell permeability - Caco-2 permeability - - permeability + pubchem_aids: + - 678378 + uris: + - http://www.bioassayontology.org/bao#BAO_0010008 + - http://purl.obolibrary.org/obo/MI_2162 +benchmarks: + - name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: - id: SMILES type: SMILES @@ -26,21 +36,22 @@ identifiers: - compound name license: CC BY 4.0 links: + - url: https://tdcommons.ai/single_pred_tasks/adme/#caco-2-cell-effective-permeability-wang-et-al + description: original data set link - url: https://pubs.acs.org/doi/10.1021/acs.jcim.5b00642 description: corresponding publication num_points: 910 -url: https://tdcommons.ai/single_pred_tasks/adme/#caco-2-cell-effective-permeability-wang-et-al bibtex: - |- @article{wang2016adme, - title={ADME properties evaluation in drug discovery: prediction of Caco-2 cell permeability - using a combination of NSGA-II and boosting}, - author={Wang, Ning-Ning and Dong, Jie and Deng, Yin-Hua and Zhu, Min-Feng and Wen, Ming and Yao, - Zhi-Jiang and Lu, Ai-Ping and Wang, Jian-Bing and Cao, Dong-Sheng}, - journal={Journal of Chemical Information and Modeling}, - volume={56}, - number={4}, - pages={763--773}, - year={2016}, - publisher={ACS Publications} - } + title={ADME properties evaluation in drug discovery: prediction of Caco-2 cell permeability + using a combination of NSGA-II and boosting}, + author={Wang, Ning-Ning and Dong, Jie and Deng, Yin-Hua and Zhu, Min-Feng and Wen, Ming and Yao, + Zhi-Jiang and Lu, Ai-Ping and Wang, Jian-Bing and Cao, Dong-Sheng}, + journal={Journal of Chemical Information and Modeling}, + volume={56}, + number={4}, + pages={763--773}, + year={2016}, + publisher={ACS Publications} + } diff --git a/data/caco2_wang/transform.py b/data/caco2_wang/transform.py index 392589863..efdb5cad9 100644 --- a/data/caco2_wang/transform.py +++ b/data/caco2_wang/transform.py @@ -5,15 +5,15 @@ def 
get_and_transform_data(): # get raw data - data = ADME(name="Caco2_Wang") - fn_data_original = "data_original.csv" - data.get_data().to_csv(fn_data_original, index=False) + splits = ADME(name="Caco2_Wang").get_split() + df_train = splits["train"] + df_valid = splits["valid"] + df_test = splits["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" - # create dataframe - df = pd.read_csv( - fn_data_original, - delimiter=",", - ) # not necessary but ensure we can load the saved data + df = pd.concat([df_train, df_valid, df_test], axis=0) # check if fields are the same fields_orig = df.columns.tolist() @@ -21,6 +21,7 @@ def get_and_transform_data(): "Drug_ID", "Drug", "Y", + "split", ] # overwrite column names = fields @@ -28,6 +29,7 @@ def get_and_transform_data(): "compound_name", "SMILES", "permeability", + "split", ] df.columns = fields_clean @@ -45,21 +47,34 @@ def get_and_transform_data(): # create meta yaml meta = { "name": "caco2_wang", # unique identifier, we will also use this for directory names - "description": """The human colon epithelial cancer cell line, Caco-2, is used as an in vitro model - to simulate the human intestinal tissue. The experimental result on the rate of drug passing through - the Caco-2 cells can approximate the rate at which the drug permeates through the human intestinal tissue.""", + "description": """The human colon epithelial cancer cell line, Caco-2, +is used as an in vitro model to simulate the human intestinal tissue. 
+The experimental result on the rate of drug passing through +the Caco-2 cells can approximate the rate at which the drug permeates +through the human intestinal tissue.""", "targets": [ { "id": "permeability", # name of the column in a tabular dataset "description": "Caco-2 cell effective permeability.", # description of what this column means - "units": "logPapp", # units of the values in this column (leave empty if unitless) + "units": "cm/s", "type": "continuous", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) "Caco-2 cell effective permeability", "Caco-2 cell permeability", "Caco-2 permeability", - "permeability", ], + "pubchem_aids": [678378], + "uris": [ + "http://www.bioassayontology.org/bao#BAO_0010008", + "http://purl.obolibrary.org/obo/MI_2162", + ], + }, + ], + "benchmarks": [ + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information }, ], "identifiers": [ @@ -92,23 +107,24 @@ def get_and_transform_data(): "num_points": len(df), # number of datapoints in this dataset "bibtex": [ """@article{wang2016adme, - title={ADME properties evaluation in drug discovery: prediction of Caco-2 cell permeability - using a combination of NSGA-II and boosting}, - author={Wang, Ning-Ning and Dong, Jie and Deng, Yin-Hua and Zhu, Min-Feng and Wen, Ming and Yao, - Zhi-Jiang and Lu, Ai-Ping and Wang, Jian-Bing and Cao, Dong-Sheng}, - journal={Journal of Chemical Information and Modeling}, - volume={56}, - number={4}, - pages={763--773}, - year={2016}, - publisher={ACS Publications} - }""", +title={ADME properties evaluation in drug discovery: prediction of Caco-2 cell permeability +using a combination of NSGA-II and boosting}, +author={Wang, Ning-Ning and Dong, Jie and Deng, Yin-Hua and Zhu, Min-Feng and Wen, Ming and Yao, +Zhi-Jiang and Lu, Ai-Ping and Wang, Jian-Bing and Cao, 
Dong-Sheng}, +journal={Journal of Chemical Information and Modeling}, +volume={56}, +number={4}, +pages={763--773}, +year={2016}, +publisher={ACS Publications} +}""", ], } def str_presenter(dumper, data): """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + Ref: + https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data """ if data.count("\n") > 0: # check for multiline string return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") diff --git a/data/cav3_t-type_calcium_channels_butkiewicz/meta.yaml b/data/cav3_t-type_calcium_channels_butkiewicz/meta.yaml new file mode 100644 index 000000000..dc3d29b21 --- /dev/null +++ b/data/cav3_t-type_calcium_channels_butkiewicz/meta.yaml @@ -0,0 +1,87 @@ +--- +name: cav3_t-type_calcium_channels_butkiewicz +description: |- + This dataset was initially curated from HTS data at the PubChem database. + The curation process is documented in Butkiewicz et al. + Primary screening with AID 449739 identified inhibitors of Cav3 T-type calcium channels. + Four follow-up screens were performed to confirm inhibitory effects on smaller sets of compounds + involving AID 493021, AID 493022, AID 493023, and AID 493041. + AID 489005 was performed as counter screen validating active compounds of the primary screen. 
+targets: + - id: activity_cav3_t_type_calcium_channels + description: whether it is active against cav3 t-type calcium channels receptor (1) or not (0) + units: + type: boolean + names: + - an inhibitor of cav3 t-type calcium channels activity + - inhibiting cav3 t-type calcium channels activity + - a t-type calcium channel blocker + pubchem_aids: + - 1053190 + - 489005 + - 493021 + - 493022 + - 493023 + - 493041 + uris: + - http://purl.obolibrary.org/obo/CHEBI_194338 +identifiers: + - id: SMILES + type: SMILES + description: SMILES +license: CC BY 4.0 +links: + - url: https://doi.org/10.3390/molecules18010735 + description: corresponding publication + - url: https://doi.org/10.1093/nar/gky1033 + description: corresponding publication + - url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/ + description: corresponding publication +benchmarks: + - name: TDC + link: https://tdcommons.ai/ + split_column: split +num_points: 100875 +url: https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al +bibtex: + - |- + @article{Butkiewicz2013, + doi = {10.3390/molecules18010735}, + url = {https://doi.org/10.3390/molecules18010735}, + year = {2013}, + month = jan, + publisher = {{MDPI} {AG}}, + volume = {18}, + number = {1}, + pages = {735--756}, + author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall + and Pedro Teixeira and C. 
Weaver and Jens Meiler}, + title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, + journal = {Molecules}} + - |- + @article{Kim2018, + doi = {10.1093/nar/gky1033}, + url = {https://doi.org/10.1093/nar/gky1033}, + year = {2018}, + month = oct, + publisher = {Oxford University Press ({OUP})}, + volume = {47}, + number = {D1}, + pages = {D1102--D1109}, + author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He + and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky + and Jian Zhang and Evan E Bolton}, + title = {{PubChem} 2019 update: improved access to chemical data}, + journal = {Nucleic Acids Research}} + - |- + @article{Butkiewicz2017, + doi = {}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, + year = {2017}, + publisher = {Chem Inform}, + volume = {3}, + number = {1}, + author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. 
+ and Meiler, J.}, + title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, + journal = {Chemical Science}} diff --git a/data/cav3_t-type_calcium_channels_butkiewicz/transform.py b/data/cav3_t-type_calcium_channels_butkiewicz/transform.py new file mode 100644 index 000000000..83385b48d --- /dev/null +++ b/data/cav3_t-type_calcium_channels_butkiewicz/transform.py @@ -0,0 +1,156 @@ +import pandas as pd +import yaml +from tdc.single_pred import HTS + + +def get_and_transform_data(): + # get raw data + label = "cav3_t-type_calcium_channels_butkiewicz" + splits = HTS(name=label).get_split() + df_train = splits["train"] + df_valid = splits["valid"] + df_test = splits["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" + + df = pd.concat([df_train, df_valid, df_test], axis=0) + + # check if fields are the same + fields_orig = df.columns.tolist() + assert fields_orig == ["Drug_ID", "Drug", "Y", "split"] + + # overwrite column names = fields + fields_clean = [ + "compound_id", + "SMILES", + "activity_cav3_t_type_calcium_channels", + "split", + ] + df.columns = fields_clean + + assert not df.duplicated().sum() + + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + # create meta yaml + meta = { + "name": "cav3_t-type_calcium_channels_butkiewicz", + "description": """This dataset was initially curated from HTS data at the PubChem database. +The curation process is documented in Butkiewicz et al. +Primary screening with AID 449739 identified inhibitors of Cav3 T-type calcium channels. +Four follow-up screens were performed to confirm inhibitory effects on smaller sets of compounds +involving AID 493021, AID 493022, AID 493023, and AID 493041. 
+AID 489005 was performed as counter screen validating active compounds of the primary screen.""", + "targets": [ + { + "id": "activity_cav3_t_type_calcium_channels", # name of the column in a tabular dataset + "description": "whether it active against cav3 t-type calcium channels receptor (1) or not (0)", + "units": None, + "type": "boolean", + "names": [ + "a inhibitor of cav3 t-type calcium channels activity", + "inhibiting cav3 t-type calcium channels activity", + "a t-type calcium channel blocker", + ], + "pubchem_aids": [1053190, 489005, 493021, 493022, 493023, 493041], + "uris": ["http://purl.obolibrary.org/obo/CHEBI_194338"], + }, + ], + "identifiers": [ + { + "id": "SMILES", # column name + "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "SMILES", # description (optional, except for "Other") + }, + ], + "license": "CC BY 4.0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://doi.org/10.3390/molecules18010735", + "description": "corresponding publication", + }, + { + "url": "https://doi.org/10.1093/nar/gky1033", + "description": "corresponding publication", + }, + { + "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/", + "description": "corresponding publication", + }, + ], + "benchmarks": [ + { + "name": "TDC", + "link": "https://tdcommons.ai/", + "split_column": "split", + }, + ], + "num_points": len(df), # number of datapoints in this dataset + "url": "https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al", + "bibtex": [ + """@article{Butkiewicz2013, +doi = {10.3390/molecules18010735}, +url = {https://doi.org/10.3390/molecules18010735}, +year = {2013}, +month = jan, +publisher = {{MDPI} {AG}}, +volume = {18}, +number = {1}, +pages = {735--756}, +author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and Jeffrey Mendenhall +and Pedro Teixeira and C. 
Weaver and Jens Meiler}, +title = {Benchmarking Ligand-Based Virtual High-Throughput Screening with the {PubChem} Database}, +journal = {Molecules}}""", + """@article{Kim2018, +doi = {10.1093/nar/gky1033}, +url = {https://doi.org/10.1093/nar/gky1033}, +year = {2018}, +month = oct, +publisher = {Oxford University Press ({OUP})}, +volume = {47}, +number = {D1}, +pages = {D1102--D1109}, +author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte and Jia He and Siqian He +and Qingliang Li and Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky +and Jian Zhang and Evan E Bolton}, +title = {{PubChem} 2019 update: improved access to chemical data}, +journal = {Nucleic Acids Research}}""", + """@article{Butkiewicz2017, +doi = {}, +url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, +year = {2017}, +publisher = {Chem Inform}, +volume = {3}, +number = {1}, +author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, E. W. and Weaver, D. C. +and Meiler, J.}, +title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from the {P}ub{C}hem {D}atabase}}, +journal = {Chemical Science}}""", + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: + https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + + +if __name__ == "__main__": + get_and_transform_data() diff --git a/data/choline_transporter_butkiewicz/meta.yaml 
b/data/choline_transporter_butkiewicz/meta.yaml new file mode 100644 index 000000000..489d37142 --- /dev/null +++ b/data/choline_transporter_butkiewicz/meta.yaml @@ -0,0 +1,94 @@ +--- +name: choline_transporter_butkiewicz +description: |- + This dataset was originally curated from HTS data at + the PubChem database. The primary screen AID 488975 identified + inhibitors of CHT. The counter screen AID 493221 was used as a + validation screen to confirm the active compounds that inhibit CHT. + AID504840 and AID588401 experiments were used as additional validation + experiments. The screen AID 493222 evaluated remaining active compounds + for non-specific activity in parental HEK293 cells. AID602208 tested a + selected set of compounds for 3H choline uptake. The final set of 254 + active compounds was determined by the overlap of active compounds in + screens AID 493221, AID504840, and AID588401 subtracting any + non-specific hits from AID 493222 and all inactive compounds in the + re-confirmation screen AID602208. +targets: + - id: activity_choline_transporter + description: inhibition of choline transporter receptor (1) or not (0). 
+ units: + type: boolean + names: + - an inhibitor of choline transporter activity + - inhibitor of choline transporter activity + pubchem_aids: + - 488975 + - 493221 + - 504840 + - 588401 + - 493222 + - 602208 +benchmarks: + - name: TDC + link: https://tdcommons.ai/ + split_column: split +identifiers: + - id: SMILES + type: SMILES + description: SMILES +license: CC BY 4.0 +links: + - url: https://doi.org/10.3390/molecules18010735 + description: corresponding publication + - url: https://doi.org/10.1093/nar/gky1033 + description: corresponding publication + - url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/ + description: corresponding publication +num_points: 302306 +url: https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al +bibtex: + - |- + @article{Butkiewicz2013, + doi = {10.3390/molecules18010735}, + url = {https://doi.org/10.3390/molecules18010735}, + year = {2013}, + month = jan, + publisher = {{MDPI} {AG}}, + volume = {18}, + number = {1}, + pages = {735--756}, + author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and + Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens + Meiler}, + title = {Benchmarking Ligand-Based Virtual High-Throughput + Screening with the {PubChem} Database}, + journal = {Molecules}} + - |- + @article{Kim2018, + doi = {10.1093/nar/gky1033}, + url = {https://doi.org/10.1093/nar/gky1033}, + year = {2018}, + month = oct, + publisher = {Oxford University Press ({OUP})}, + volume = {47}, + number = {D1}, + pages = {D1102--D1109}, + author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and + Asta Gindulyte and Jia He and Siqian He and Qingliang Li and + Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid + Zaslavsky and Jian Zhang and Evan E Bolton}, + title = {{PubChem} 2019 update: improved access to chemical data}, + journal = {Nucleic Acids Research}} + - |- + @article{Butkiewicz2017, + doi = {}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, + year = {2017}, + publisher = {Chem Inform}, + volume = {3}, + number = {1}, + author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, + E. W. and Weaver, D. C. 
and Meiler, J.}, + title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from + the {P}ub{C}hem {D}atabase}}, + journal = {Chemical Science}} diff --git a/data/choline_transporter_butkiewicz/transform.py b/data/choline_transporter_butkiewicz/transform.py new file mode 100644 index 000000000..ebc899ea5 --- /dev/null +++ b/data/choline_transporter_butkiewicz/transform.py @@ -0,0 +1,168 @@ +import pandas as pd +import yaml +from tdc.single_pred import HTS + + +def get_and_transform_data(): + # get raw data + label = "choline_transporter_butkiewicz" + splits = HTS(name=label).get_split() + df_train = splits["train"] + df_valid = splits["valid"] + df_test = splits["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" + + df = pd.concat([df_train, df_valid, df_test], axis=0) + # check if fields are the same + fields_orig = df.columns.tolist() + assert fields_orig == [ + "Drug_ID", + "Drug", + "Y", + "split", + ] + + # overwrite column names = fields + fields_clean = [ + "compound_id", + "SMILES", + "activity_choline_transporter", + "split", + ] + df.columns = fields_clean + + assert not df.duplicated().sum() + + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + # create meta yaml + meta = { + "name": "choline_transporter_butkiewicz", + "description": """This dataset was originally curated from HTS data at +the PubChem database. The primary screen AID 488975 identified +inhibitors of CHT. The counter screen AID 493221 was used as a +validation screen to confirm the active compounds that inhibit CHT. +AID504840 and AID588401 experiments were used as additional validation +experiments. The screen AID 493222 evaluated remaining active compounds +for non-specific activity in parental HEK293 cells. AID602208 tested a +selected set of compounds for 3H choline uptake. 
The final set of 254 +active compounds was determined by the overlap of active compounds in +screens AID 493221, AID504840, and AID588401 subtracting any +non-specific hits from AID 49322 and all inactive compounds in the +re-confirmation screen AID602208.""", + "targets": [ + { + "id": "activity_choline_transporter", # name of the column in a tabular dataset + "description": "inhibition of choline transporter receptor (1) or not (0).", + "units": None, # units of the values in this column (leave empty if unitless) + "type": "boolean", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property (to sample from for building the prompts) + "a inhibitor of choline transporter activity", + "inhibitor of choline transporter activity", + ], + "pubchem_aids": [488975, 493221, 504840, 588401, 493222, 602208], + }, + ], + "benchmarks": [ + { + "name": "TDC", + "link": "https://tdcommons.ai/", + "split_column": "split", + } + ], + "identifiers": [ + { + "id": "SMILES", # column name + "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "SMILES", # description (optional, except for "Other") + }, + ], + "license": "CC BY 4.0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) 
+ { + "url": "https://doi.org/10.3390/molecules18010735", + "description": "corresponding publication", + }, + { + "url": "https://doi.org/10.1093/nar/gky1033", + "description": "corresponding publication", + }, + { + "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/", + "description": "corresponding publication", + }, + ], + "num_points": len(df), # number of datapoints in this dataset + "url": "https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al", + "bibtex": [ + """@article{Butkiewicz2013, +doi = {10.3390/molecules18010735}, +url = {https://doi.org/10.3390/molecules18010735}, +year = {2013}, +month = jan, +publisher = {{MDPI} {AG}}, +volume = {18}, +number = {1}, +pages = {735--756}, +author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and +Jeffrey Mendenhall and Pedro Teixeira and C. Weaver and Jens +Meiler}, +title = {Benchmarking Ligand-Based Virtual High-Throughput +Screening with the {PubChem} Database}, +journal = {Molecules}}""", + """@article{Kim2018, +doi = {10.1093/nar/gky1033}, +url = {https://doi.org/10.1093/nar/gky1033}, +year = {2018}, +month = oct, +publisher = {Oxford University Press ({OUP})}, +volume = {47}, +number = {D1}, +pages = {D1102--D1109}, +author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and +Asta Gindulyte and Jia He and Siqian He and Qingliang Li and +Benjamin A Shoemaker and Paul A Thiessen and Bo Yu and Leonid +Zaslavsky and Jian Zhang and Evan E Bolton}, +title = {{PubChem} 2019 update: improved access to chemical data}, +journal = {Nucleic Acids Research}}""", + """@article{Butkiewicz2017, +doi = {}, +url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, +year = {2017}, +publisher = {Chem Inform}, +volume = {3}, +number = {1}, +author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, +E. W. and Weaver, D. C. 
and Meiler, J.}, +title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from +the {P}ub{C}hem {D}atabase}}, +journal = {Chemical Science}}""", + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: + https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dump + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + + +if __name__ == "__main__": + get_and_transform_data() diff --git a/data/clintox/meta.yaml b/data/clintox/meta.yaml new file mode 100644 index 000000000..75de05f05 --- /dev/null +++ b/data/clintox/meta.yaml @@ -0,0 +1,47 @@ +--- +name: clintox +description: |- + The ClinTox dataset includes drugs that have failed + clinical trials for toxicity reasons and also drugs that are associated + with successful trials. +targets: + - id: clinical_toxicity + description: whether it can cause clinical toxicity (1) or not (0). 
+ units: + type: boolean + names: + - clinically toxic + - displaying clinical toxicity + - toxic + uris: + - http://purl.bioontology.org/ontology/MESH/Q000633 + - https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&ns=ncit&code=C27990 + - https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&ns=ncit&code=C27955 +benchmarks: + - name: TDC + link: https://tdcommons.ai/ + split_column: split +identifiers: + - id: SMILES + type: SMILES + description: SMILES +license: CC BY 4.0 +links: + - url: https://doi.org/10.1016/j.chembiol.2016.07.023 + description: corresponding publication +num_points: 1478 +url: https://tdcommons.ai/single_pred_tasks/tox/#clintox +bibtex: + - |- + @article{Gayvert2016, + doi = {10.1016/j.chembiol.2016.07.023}, + url = {https://doi.org/10.1016/j.chembiol.2016.07.023}, + year = {2016}, + month = oct, + publisher = {Elsevier {BV}}, + volume = {23}, + number = {10}, + pages = {1294--1301}, + author = {Kaitlyn~M. Gayvert and Neel~S. 
Madhukar and Olivier Elemento}, + title = {A Data-Driven Approach to Predicting Successes and Failures of Clinical Trials}, + journal = {Cell Chemical Biology}} diff --git a/data/ClinTox/transform.py b/data/clintox/transform.py similarity index 56% rename from data/ClinTox/transform.py rename to data/clintox/transform.py index d83b5d66f..8487f1212 100644 --- a/data/ClinTox/transform.py +++ b/data/clintox/transform.py @@ -5,30 +5,22 @@ def get_and_transform_data(): # get raw data - data = Tox(name="ClinTox") - fn_data_original = "data_original.csv" - data.get_data().to_csv(fn_data_original, index=False) + splits = Tox(name="ClinTox").get_split() + df_train = splits["train"] + df_valid = splits["valid"] + df_test = splits["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" - # create dataframe - df = pd.read_csv( - fn_data_original, - delimiter=",", - ) # not necessary but ensure we can load the saved data + df = pd.concat([df_train, df_valid, df_test], axis=0) # check if fields are the same fields_orig = df.columns.tolist() - assert fields_orig == [ - "Drug_ID", - "Drug", - "Y", - ] + assert fields_orig == ["Drug_ID", "Drug", "Y", "split"] # overwrite column names = fields - fields_clean = [ - "compound_id", - "SMILES", - "clinical_toxicity", - ] + fields_clean = ["compound_id", "SMILES", "clinical_toxicity", "split"] df.columns = fields_clean # data cleaning @@ -44,22 +36,35 @@ def get_and_transform_data(): # create meta yaml meta = { - "name": "ClinTox", # unique identifier, we will also use this for directory names - "description": """The ClinTox dataset includes drugs that have failed clinical trials for toxicity reasons and also drugs that are associated with successful trials.""", + "name": "clintox", # unique identifier, we will also use this for directory names + "description": """The ClinTox dataset includes drugs that have failed +clinical trials for toxicity reasons and also drugs that are associated +with 
successful trials.""", "targets": [ { "id": "clinical_toxicity", # name of the column in a tabular dataset - "description": "whether it can cause clinical toxicity (1) or not (0).", # description of what this column means - "units": "clinical_toxicity", # units of the values in this column (leave empty if unitless) - "type": "categorical", # can be "categorical", "ordinal", "continuous" + "description": "whether it can cause clinical toxicity (1) or not (0).", + "units": None, # units of the values in this column (leave empty if unitless) + "type": "boolean", "names": [ # names for the property (to sample from for building the prompts) - "clinical toxicity", - "toxicity", - "drug Induced clinical toxicity", - "drug failed in clinical trials", + "clinically toxic", + "displaying clinical toxicity", + "toxic", + ], + "uris": [ + "http://purl.bioontology.org/ontology/MESH/Q000633", + "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&ns=ncit&code=C27990", # noqa: E501 + "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&ns=ncit&code=C27955", # noqa: E501 ], }, ], + "benchmarks": [ + { + "name": "TDC", + "link": "https://tdcommons.ai/", + "split_column": "split", + } + ], "identifiers": [ { "id": "SMILES", # column name @@ -78,23 +83,25 @@ def get_and_transform_data(): "url": "https://tdcommons.ai/single_pred_tasks/tox/#clintox", "bibtex": [ """@article{Gayvert2016, - doi = {10.1016/j.chembiol.2016.07.023}, - url = {https://doi.org/10.1016/j.chembiol.2016.07.023}, - year = {2016}, - month = oct, - publisher = {Elsevier {BV}}, - volume = {23}, - number = {10}, - pages = {1294--1301}, - author = {Kaitlyn~M. Gayvert and Neel~S. 
Madhukar and Olivier Elemento}, - title = {A Data-Driven Approach to Predicting Successes and Failures of Clinical Trials}, - journal = {Cell Chemical Biology}}""", +doi = {10.1016/j.chembiol.2016.07.023}, +url = {https://doi.org/10.1016/j.chembiol.2016.07.023}, +year = {2016}, +month = oct, +publisher = {Elsevier {BV}}, +volume = {23}, +number = {10}, +pages = {1294--1301}, +author = {Kaitlyn~M. Gayvert and Neel~S. Madhukar and Olivier Elemento}, +title = {A Data-Driven Approach to Predicting Successes and Failures of Clinical Trials}, +journal = {Cell Chemical Biology}}""", ], } def str_presenter(dumper, data): """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + + Ref: + https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data """ if data.count("\n") > 0: # check for multiline string return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") diff --git a/data/drug_target_interaction/example_processing_and_templates.ipynb b/data/drug_target_interaction/example_processing_and_templates.ipynb new file mode 100644 index 000000000..2dbb03d19 --- /dev/null +++ b/data/drug_target_interaction/example_processing_and_templates.ipynb @@ -0,0 +1,1389 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "27c08f91-3fa0-4800-8f6a-96a96f665cad", + "metadata": {}, + "source": [ + "# Drug-Target Interaction, Liu et al." 
+ ] + }, + { + "cell_type": "markdown", + "id": "6ef172b9-aad2-47da-bf4c-844a2a07ee8c", + "metadata": {}, + "source": [ + "Original data repository: https://tdcommons.ai/multi_pred_tasks/dti/" + ] + }, + { + "cell_type": "markdown", + "id": "7d18c95d-2ec6-45e1-addc-54a890097b8e", + "metadata": {}, + "source": [ + "# Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "cf59e3e9-8061-4022-9eae-e978311b4155", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import yaml\n", + "from tdc.multi_pred import DTI" + ] + }, + { + "cell_type": "markdown", + "id": "a6751ff9-2e3e-4d01-8395-7a5ae0c200d7", + "metadata": {}, + "source": [ + "# Data processing" + ] + }, + { + "cell_type": "markdown", + "id": "a1169ad2-e4bb-41c6-9625-6d1644c44a5b", + "metadata": {}, + "source": [ + "## Download data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7bb8eb5e-f513-40d2-a68c-7cda1a51ad31", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fn_data_original = \"data_original.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b39a142e-ccbc-49d2-98b0-a5f9bde9fd27", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading...\n", + "100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 54.4M/54.4M [00:15<00:00, 3.41MiB/s]\n", + "Loading...\n", + "Done!\n" + ] + } + ], + "source": [ + "data = 
DTI(name = 'BindingDB_Kd')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "26d9f62a-07f5-4113-8161-d5dfcf0bfb71", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "data.get_data().to_csv(fn_data_original, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "43873fc3-20a8-487d-a7c5-33bd58414159", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 82856\r\n", + "drwxr-xr-x 3 cody staff 96B Mar 18 19:00 \u001b[34mdata\u001b[m\u001b[m\r\n", + "-rw-r--r-- 1 cody staff 39M Mar 18 19:00 data_original.csv\r\n", + "-rw-r--r-- 1 cody staff 46K Mar 18 18:58 example_processing_and_templates.ipynb\r\n", + "-rw-r--r--@ 1 cody staff 1.8K Mar 15 22:47 meta.yaml\r\n", + "-rw-r--r--@ 1 cody staff 5.1K Mar 18 19:00 transform.py\r\n" + ] + } + ], + "source": [ + "!ls -lh" + ] + }, + { + "cell_type": "markdown", + "id": "d9cda29a-a133-4f0e-992b-e77c9070ee93", + "metadata": {}, + "source": [ + "## Load original data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "77f614e7-b133-40bc-8759-2d930e4c120e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Drug_ID,Drug,Target_ID,Target,Y\r\n", + "444607.0,Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSVSYDQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPPLLECVTWIVLKEPISVSSEQVLKFRKLNFNGEGEPEELMVDNWRPAQPLKNRQIKASFK,0.46\r\n", + "4316.0,COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSVSYDQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPPLLECVTWIVLKEPISVSSEQVLKFRKLNFNGEGEPEELMVDNWRPAQPLKNRQIKASFK,0.49\r\n", + 
"4293.0,NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSVSYDQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPPLLECVTWIVLKEPISVSSEQVLKFRKLNFNGEGEPEELMVDNWRPAQPLKNRQIKASFK,0.83\r\n", + "1611.0,NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2O,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSVSYDQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPPLLECVTWIVLKEPISVSSEQVLKFRKLNFNGEGEPEELMVDNWRPAQPLKNRQIKASFK,0.2\r\n" + ] + } + ], + "source": [ + "!head -n 5 {fn_data_original}" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8f5a0387-f9e3-4e1a-8d14-5df618195f70", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.read_csv(fn_data_original, delimiter=\",\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "55b0bd63-62a0-469e-9d8a-e9ada3fe01c4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Drug_IDDrugTarget_IDTargetY
0444607.0Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1P00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.46
14316.0COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1P00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.49
24293.0NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1P00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.83
31611.0NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2OP00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.20
41612.0COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1P00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.16
\n", + "
" + ], + "text/plain": [ + " Drug_ID Drug Target_ID \\\n", + "0 444607.0 Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1 P00918 \n", + "1 4316.0 COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1 P00918 \n", + "2 4293.0 NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1 P00918 \n", + "3 1611.0 NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2O P00918 \n", + "4 1612.0 COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1 P00918 \n", + "\n", + " Target Y \n", + "0 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.46 \n", + "1 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.49 \n", + "2 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.83 \n", + "3 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.20 \n", + "4 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.16 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "95158ac1-05d7-4a21-b8e4-7f720306d331", + "metadata": {}, + "source": [ + "## Add column = field names\n", + "Clean column names (`fields_clean`) and keep original names (`fields_orig`)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ec2458e5-455f-4f03-8ce9-c0d12e9ed371", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['Drug_ID', 'Drug', 'Target_ID', 'Target', 'Y']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fields_orig = df.columns.tolist()\n", + "fields_orig" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a46dd8ff-37b3-4894-8226-3bf98226dd09", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fields_clean = [\n", + " \"compound_name\",\n", + " \"SMILES\",\n", + " \"target_name\",\n", + " \"Target_aa\",\n", + " \"binding\",\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "785d37cb-1fb4-4a91-a923-d5a78a37f36a", + "metadata": { + "tags": [] + }, + "outputs": [], + 
"source": [ + "df.columns = fields_clean" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1bf212cb-1653-457b-9f5d-416d4dd14b53", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
compound_nameSMILEStarget_nameTarget_aabinding
0444607.0Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1P00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.46
14316.0COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1P00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.49
24293.0NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1P00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.83
31611.0NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2OP00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.20
41612.0COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1P00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.16
\n", + "
" + ], + "text/plain": [ + " compound_name SMILES target_name \\\n", + "0 444607.0 Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1 P00918 \n", + "1 4316.0 COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1 P00918 \n", + "2 4293.0 NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1 P00918 \n", + "3 1611.0 NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2O P00918 \n", + "4 1612.0 COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1 P00918 \n", + "\n", + " Target_aa binding \n", + "0 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.46 \n", + "1 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.49 \n", + "2 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.83 \n", + "3 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.20 \n", + "4 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.16 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "1bfaeb22-26fb-4964-a71f-cae8335e5372", + "metadata": {}, + "source": [ + "## Data cleaning" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "7e746003-cb1f-434f-bba6-00f0c439c4ac", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df.columns = fields_clean\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "d544fa60-343e-40e1-bd0c-4750f07a7145", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "assert not df.duplicated().sum()" + ] + }, + { + "cell_type": "markdown", + "id": "bc6f52c1-e0f6-48b3-95f4-e36d9a5ecde8", + "metadata": {}, + "source": [ + "## Save to csv" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d6d5efa5-b4b4-4a25-8626-e10f3d691e83", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fn_data_csv = \"data_clean.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "727f8d7b-cbb6-43c7-9eab-9d4d65be6b3f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df.to_csv(fn_data_csv, 
index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "63c8d4a4-906e-418d-be39-879365b4dfa0", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-rw-r--r-- 1 cody staff 39M Mar 18 19:00 data_clean.csv\r\n" + ] + } + ], + "source": [ + "!ls -lh {fn_data_csv}" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "a51b9001-25d7-4e0e-a607-477cfc4a9f1c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "compound_name,SMILES,target_name,Target_aa,binding\r\n", + "444607.0,Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSVSYDQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPPLLECVTWIVLKEPISVSSEQVLKFRKLNFNGEGEPEELMVDNWRPAQPLKNRQIKASFK,0.46\r\n", + "4316.0,COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSVSYDQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPPLLECVTWIVLKEPISVSSEQVLKFRKLNFNGEGEPEELMVDNWRPAQPLKNRQIKASFK,0.49\r\n", + "4293.0,NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSVSYDQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPPLLECVTWIVLKEPISVSSEQVLKFRKLNFNGEGEPEELMVDNWRPAQPLKNRQIKASFK,0.83\r\n", + "1611.0,NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2O,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSVSYDQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPPLLECVTWIVLKEPISVSSEQVLKFRKLNFNGEGEPEELMVDNWRPAQPLKNRQIKASFK,0.2\r\n" + ] + } + ], + "source": [ + "!head -n 5 
{fn_data_csv}" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "1a512943-4909-4d56-867d-50c151d8d607", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
compound_nameSMILEStarget_nameTarget_aabinding
0444607.0Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1P00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.46
14316.0COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1P00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.49
24293.0NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1P00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.83
31611.0NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2OP00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.20
41612.0COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1P00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.16
\n", + "
" + ], + "text/plain": [ + " compound_name SMILES target_name \\\n", + "0 444607.0 Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1 P00918 \n", + "1 4316.0 COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1 P00918 \n", + "2 4293.0 NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1 P00918 \n", + "3 1611.0 NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2O P00918 \n", + "4 1612.0 COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1 P00918 \n", + "\n", + " Target_aa binding \n", + "0 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.46 \n", + "1 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.49 \n", + "2 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.83 \n", + "3 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.20 \n", + "4 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.16 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "f3d730ce-fce0-49df-9eb8-b917e945fa9a", + "metadata": {}, + "source": [ + "## Load from csv" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "077b0c5f-8772-4879-9317-3fa28799689b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fn_data_csv = \"data_clean.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "6eaef0e6-2115-4793-ac43-a196b25d47a0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.read_csv(fn_data_csv)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "43619e7c-9c82-4ff0-ae25-403861304635", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
compound_nameSMILEStarget_nameTarget_aabinding
0444607.0Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1P00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.46
14316.0COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1P00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.49
24293.0NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1P00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.83
31611.0NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2OP00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.20
41612.0COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1P00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.16
\n", + "
" + ], + "text/plain": [ + " compound_name SMILES target_name \\\n", + "0 444607.0 Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1 P00918 \n", + "1 4316.0 COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1 P00918 \n", + "2 4293.0 NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1 P00918 \n", + "3 1611.0 NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2O P00918 \n", + "4 1612.0 COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1 P00918 \n", + "\n", + " Target_aa binding \n", + "0 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.46 \n", + "1 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.49 \n", + "2 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.83 \n", + "3 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.20 \n", + "4 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.16 " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "2f06e57c-02c5-493b-af65-c8bb9ac59421", + "metadata": {}, + "source": [ + "# meta YAML" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "49771077-471d-4d71-a9a7-d6b094bbc4f3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
compound_nameSMILEStarget_nameTarget_aabinding
0444607.0Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1P00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.46
14316.0COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1P00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.49
24293.0NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1P00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.83
31611.0NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2OP00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.20
41612.0COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1P00918MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...0.16
\n", + "
" + ], + "text/plain": [ + " compound_name SMILES target_name \\\n", + "0 444607.0 Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1 P00918 \n", + "1 4316.0 COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1 P00918 \n", + "2 4293.0 NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1 P00918 \n", + "3 1611.0 NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2O P00918 \n", + "4 1612.0 COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1 P00918 \n", + "\n", + " Target_aa binding \n", + "0 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.46 \n", + "1 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.49 \n", + "2 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.83 \n", + "3 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.20 \n", + "4 MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP... 0.16 " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "d3890961-444e-4a26-b8fc-ed8c4e959af9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "meta = {\n", + " \"name\": \"Drug-Target Interaction\", # unique identifier, we will also use this for directory names\n", + " \"description\": \"\"\"The activity of a small-molecule drug is measured by its binding affinity with the target protein.\n", + " Given a new target protein, the very first step is to screen a set of potential compounds to find their activity.\n", + " Traditional method to gauge the affinities are through high-throughput screening wet-lab experiments.\n", + " However, they are very expensive and are thus restricted by their abilities to search over a large set of candidates\n", + " Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound structural information and protein amino acid sequence.\"\"\",\n", + " \"targets\": [\n", + " {\n", + " \"id\": \"binding\", # name of the column in a tabular dataset\n", + " 
\"description\": \"small-molecule protein interaction.\", # description of what this column means\n", + " \"units\": \"Kd\", # units of the values in this column (leave empty if unitless)\n", + " \"type\": \"continuous\", # can be \"categorical\", \"ordinal\", \"continuous\"\n", + " \"uris\" : [\"\thttp://purl.obolibrary.org/obo/NCIT_C20604\"],\n", + " \"names\": [ # names for the property (to sample from for building the prompts)\n", + " \"Drug-Target Interaction\"\n", + " \"small-molecule binding affinity\",\n", + " \"small-molecule binding\",\n", + " \"protein-ligand binding\",\n", + " \"protein-ligand\"\n", + " \"binding affinity\",\n", + " \"binding\",\n", + "\n", + " ],\n", + " },\n", + " ],\n", + " \"identifiers\": [\n", + " {\n", + " \"id\": \"SMILES\", # column name\n", + " \"type\": \"SMILES\", # can be \"SMILES\", \"SELFIES\", \"IUPAC\", \"OTHER\"\n", + " \"description\": \"small-molecule\", # description (optional, except for \"OTHER\")\n", + " },\n", + " {\n", + " \"id\": \"Target\",\n", + " \"type\": \"Other\",\n", + " \"description\": \"Target amino acid sequence\",\n", + "\n", + " },\n", + " ],\n", + " \"license\": \"CC BY 4.0\", # license under which the original dataset was published\n", + " \"links\": [ # list of relevant links (original dataset, other uses, etc.)\n", + " {\n", + " \"url\": \"https://tdcommons.ai/multi_pred_tasks/dti/\",\n", + " \"description\": \"original data set link\",\n", + " },\n", + " {\n", + " \"url\": \"https://doi.org/10.1093/nar/gkl999\",\n", + " \"description\": \"corresponding publication\",\n", + " },\n", + " ],\n", + " \"benchmarks\": [\n", + " {\n", + " \"name\": \"TDC\",\n", + " \"link\": \"https://tdcommons.ai/\",\n", + " \"split_column\": \"split\",\n", + " },\n", + " ],\n", + " \"num_points\": len(df), # number of datapoints in this dataset\n", + " \"bibtex\": [\n", + " \"\"\"@article{Liu2006bindingdb,\n", + " title={BindingDB: a web-accessible database of experimentally determined protein-ligand binding 
affinities},\n", + " author={Tiqing Liu, Yuhmei Lin, Xin Wen, Robert N. Jorissen, Micahel, K. Gilson},\n", + " journal={Journal of Chemical Information and Modeling},\n", + " volume={35},\n", + " number={4},\n", + " pages={D198-D201},\n", + " year={2006},\n", + " publisher={Oxford Academic}\n", + " }\"\"\",\n", + " ],\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "ec455cf0-962a-4c0d-bb3e-066e415ffd9b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def str_presenter(dumper, data):\n", + " \"\"\"configures yaml for dumping multiline strings\n", + " Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data\n", + " \"\"\"\n", + " if data.count(\"\\n\") > 0: # check for multiline string\n", + " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data, style=\"|\")\n", + " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data)\n", + "\n", + "\n", + "yaml.add_representer(str, str_presenter)\n", + "yaml.representer.SafeRepresenter.add_representer(\n", + " str, str_presenter\n", + ") # to use with safe_dum" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "580bbd79-4845-4515-be94-3e4a9815d048", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fn_meta = \"meta.yaml\"" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "873fa5dd-9b60-40f5-b537-4d7a206414ea", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "with open(fn_meta, \"w\") as f:\n", + " yaml.dump(meta, f, sort_keys=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "d01686c0-6746-4fc4-b019-350270dfc26f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-rw-r--r--@ 1 cody staff 1.8K Mar 18 19:00 meta.yaml\r\n" + ] + } + ], + "source": [ + "!ls -lh {fn_meta}" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": 
"ef6063c5-7a8b-4344-bccf-a073443feebf", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "name: Drug-Target Interaction\r\n", + "description: |-\r\n", + " The activity of a small-molecule drug is measured by its binding affinity with the target protein.\r\n", + " Given a new target protein, the very first step is to screen a set of potential compounds to find their activity.\r\n", + " Traditional method to gauge the affinities are through high-throughput screening wet-lab experiments.\r\n", + " However, they are very expensive and are thus restricted by their abilities to search over a large set of candidates\r\n", + " Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound structural information and protein amino acid sequence.\r\n", + "targets:\r\n", + "- id: binding\r\n", + " description: small-molecule protein interaction.\r\n", + " units: Kd\r\n", + " type: continuous\r\n", + " uris:\r\n", + " - \"\\thttp://purl.obolibrary.org/obo/NCIT_C20604\"\r\n", + " names:\r\n", + " - Drug-Target Interactionsmall-molecule binding affinity\r\n", + " - small-molecule binding\r\n", + " - protein-ligand binding\r\n", + " - protein-ligandbinding affinity\r\n", + " - binding\r\n", + "identifiers:\r\n", + "- id: SMILES\r\n", + " type: SMILES\r\n", + " description: small-molecule\r\n", + "- id: Target\r\n", + " type: Other\r\n", + " description: Target amino acid sequence\r\n", + "license: CC BY 4.0\r\n", + "links:\r\n", + "- url: https://tdcommons.ai/multi_pred_tasks/dti/\r\n", + " description: original data set link\r\n", + "- url: https://doi.org/10.1093/nar/gkl999\r\n", + " description: corresponding publication\r\n", + "split_col: split\r\n", + "num_points: 52274\r\n", + "bibtex:\r\n", + "- |-\r\n", + " @article{Liu2006bindingdb,\r\n", + " title={BindingDB: a web-accessible database of experimentally determined protein-ligand binding 
affinities},\r\n", + " author={Tiqing Liu, Yuhmei Lin, Xin Wen, Robert N. Jorissen, Micahel, K. Gilson},\r\n", + " journal={Journal of Chemical Information and Modeling},\r\n", + " volume={35},\r\n", + " number={4},\r\n", + " pages={D198-D201},\r\n", + " year={2006},\r\n", + " publisher={Oxford Academic}\r\n", + " }\r\n" + ] + } + ], + "source": [ + "!cat {fn_meta}" + ] + }, + { + "cell_type": "markdown", + "id": "bd3f930a-638b-4bb7-a1d2-80688f2f6891", + "metadata": {}, + "source": [ + "# create transform.py" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "9aab00fd-58a8-40b0-be30-1e269e0d323b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "path_file = \"transform.py\"" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "8368bb20-8e1c-4b7d-b0e2-b39da36b5972", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting transform.py\n" + ] + } + ], + "source": [ + "%%writefile $path_file\n", + "import pandas as pd\n", + "import yaml\n", + "from tdc.multi_pred import DTI\n", + "\n", + "def get_and_transform_data():\n", + " # get raw data\n", + " data = DTI(name=\"BindingDB_Kd\")\n", + " splits = data.get_split()\n", + " df_train = splits[\"train\"]\n", + " df_valid = splits[\"valid\"]\n", + " df_test = splits[\"test\"]\n", + " df_train[\"split\"] = \"train\"\n", + " df_valid[\"split\"] = \"valid\"\n", + " df_test[\"split\"] = \"test\"\n", + "\n", + " df = pd.concat([df_train, df_valid, df_test], axis=0)\n", + "\n", + " # check if fields are the same\n", + " fields_orig = df.columns.tolist()\n", + " assert fields_orig == [\n", + " \"Drug_ID\",\n", + " \"Drug\",\n", + " \"Target_ID\",\n", + " \"Target\",\n", + " \"Y\",\n", + " \"split\",\n", + " ]\n", + "\n", + " # overwrite column names = fields\n", + " fields_clean = [\n", + " \"compound_name\",\n", + " \"SMILES\",\n", + " \"target_name\",\n", + " \"Target_aa\",\n", + " \"binding\",\n", + " 
\"split\"\n", + " ]\n", + " df.columns = fields_clean\n", + "\n", + " # data cleaning\n", + " '''\n", + " df.compound_name = (\n", + " df.compound_name.str.strip()\n", + " ) # remove leading and trailing white space characters\n", + " '''\n", + " assert not df.duplicated().sum()\n", + "\n", + " # save to csv\n", + " fn_data_csv = \"data_clean.csv\"\n", + " df.to_csv(fn_data_csv, index=False)\n", + "\n", + " # create meta yaml\n", + " meta = {\n", + " \"name\": \"Drug-Target Interaction\", # unique identifier, we will also use this for directory names\n", + " \"description\": \"\"\"The activity of a small-molecule drug is measured by its binding affinity with the target protein.\n", + " Given a new target protein, the very first step is to screen a set of potential compounds to find their activity.\n", + " Traditional method to gauge the affinities are through high-throughput screening wet-lab experiments.\n", + " However, they are very expensive and are thus restricted by their abilities to search over a large set of candidates\n", + " Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound structural information and protein amino acid sequence.\"\"\",\n", + " \"targets\": [\n", + " {\n", + " \"id\": \"binding\", # name of the column in a tabular dataset\n", + " \"description\": \"small-molecule protein interaction.\", # description of what this column means\n", + " \"units\": \"Kd\", # units of the values in this column (leave empty if unitless)\n", + " \"type\": \"regression\", # can be \"categorical\", \"ordinal\", \"continuous\"\n", + " \"names\": [ # names for the property (to sample from for building the prompts)\n", + " \"Drug-Target Interaction\"\n", + " \"small-molecule binding affinity\",\n", + " \"small-molecule binding\",\n", + " \"protein-ligand binding\",\n", + " \"protein-ligand\"\n", + " \"binding affinity\",\n", + " \"binding\",\n", + "\n", + " ],\n", + " },\n", + " ],\n", 
+ " \"identifiers\": [\n", + " {\n", + " \"id\": \"SMILES\", # column name\n", + " \"type\": \"SMILES\", # can be \"SMILES\", \"SELFIES\", \"IUPAC\", \"OTHER\"\n", + " \"description\": \"small-molecule\", # description (optional, except for \"OTHER\")\n", + " },\n", + " {\n", + " \"id\": \"Target\",\n", + " \"type\": \"Other\",\n", + " \"description\": \"Target amino acid sequence\",\n", + " \n", + " },\n", + " ],\n", + " \"license\": \"CC BY 4.0\", # license under which the original dataset was published\n", + " \"links\": [ # list of relevant links (original dataset, other uses, etc.)\n", + " {\n", + " \"url\": \"https://tdcommons.ai/multi_pred_tasks/dti/\",\n", + " \"description\": \"original data set link\",\n", + " },\n", + " {\n", + " \"url\": \"https://doi.org/10.1093/nar/gkl999\",\n", + " \"description\": \"corresponding publication\",\n", + " },\n", + " ],\n", + " \"num_points\": len(df), # number of datapoints in this dataset\n", + " \"bibtex\": [\n", + " \"\"\"@article{Liu2006bindingdb,\n", + " title={BindingDB: a web-accessible database of experimentally determined protein-ligand binding affinities},\n", + " author={Tiqing Liu, Yuhmei Lin, Xin Wen, Robert N. Jorissen, Micahel, K. 
Gilson},\n", + " journal={Journal of Chemical Information and Modeling},\n", + " volume={35},\n", + " number={4},\n", + " pages={D198-D201},\n", + " year={2006},\n", + " publisher={Oxford Academic}\n", + " }\"\"\",\n", + " ],\n", + " }\n", + "\n", + " def str_presenter(dumper, data):\n", + " \"\"\"configures yaml for dumping multiline strings\n", + " Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data\n", + " \"\"\"\n", + "\n", + " if data.count(\"\\n\") > 0: # check for multiline string\n", + " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data, style=\"|\")\n", + " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data)\n", + "\n", + " yaml.add_representer(str, str_presenter)\n", + " yaml.representer.SafeRepresenter.add_representer(\n", + " str, str_presenter\n", + " ) # to use with safe_dum\n", + " fn_meta = \"meta.yaml\"\n", + " with open(fn_meta, \"w\") as f:\n", + " yaml.dump(meta, f, sort_keys=False)\n", + "\n", + " print(f\"Finished processing {meta['name']} dataset!\")\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " get_and_transform_data()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "d0474f26-70f3-4655-b81a-df4ada90e7a6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found local copy...\n", + "Loading...\n", + "Done!\n", + "Finished processing Drug-Target Interaction dataset!\n" + ] + } + ], + "source": [ + "!python3 transform.py" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "953e7bee-bd5e-41d0-a2be-506e0bc97727", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 163984\r\n", + "drwxr-xr-x 3 cody staff 96B Mar 11 11:42 \u001b[34mdata\u001b[m\u001b[m/\r\n", + "-rw-r--r-- 1 cody staff 39M Mar 11 11:43 data_clean.csv\r\n", + "-rw-r--r-- 1 cody staff 39M Mar 11 11:43 
data_original.csv\r\n", + "-rw-r--r-- 1 cody staff 36K Mar 11 11:42 example_processing_and_templates.ipynb\r\n", + "-rw-r--r--@ 1 cody staff 1.8K Mar 11 11:43 meta.yaml\r\n", + "-rw-r--r-- 1 cody staff 1.7K Mar 11 11:15 meta_.yaml\r\n", + "-rw-r--r--@ 1 cody staff 5.0K Mar 11 11:43 transform.py\r\n" + ] + } + ], + "source": [ + "ls -lh # fmt: skip" + ] + }, + { + "cell_type": "markdown", + "id": "0b08ed06-ba66-4f76-bde1-368ea77d1739", + "metadata": {}, + "source": [ + "# End" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data/drug_target_interaction/meta.yaml b/data/drug_target_interaction/meta.yaml new file mode 100644 index 000000000..a16496210 --- /dev/null +++ b/data/drug_target_interaction/meta.yaml @@ -0,0 +1,59 @@ +--- +name: drug-target interaction +description: |- + The activity of a small-molecule drug is measured by + its binding affinity with the target protein. Given a new target + protein, the very first step is to screen a set of potential compounds + to find their activity.Traditional method to gauge the affinities are + through high-throughput screening wet-lab experiments. However, they + are very expensive and are thus restricted by their abilities to search + over a large set of candidates. Drug-target interaction prediction task + aims to predict the interaction activity score in silico given only the + accessible compound structural information and protein amino acid + sequence. 
+targets: + - id: binding + description: small-molecule protein interaction + units: Kd + type: continuous + names: + - drug-target interaction + - small-molecule binding affinity + - protein-ligand binding affinity +identifiers: + - id: SMILES + type: SMILES + description: small-molecule + - id: target_name + type: Other + names: + - target name + description: target amino acid name + - id: target_aa + type: Other + names: + - target amino acid sequence + description: target amino acid sequence +license: CC BY 4.0 +links: + - url: https://tdcommons.ai/multi_pred_tasks/dti/ + description: original data set link + - url: https://doi.org/10.1093/nar/gkl999 + description: corresponding publication +benchmarks: + - name: TDC + link: https://tdcommons.ai/ + split_column: split +num_points: 52274 +bibtex: + - |- + @article{Liu2006bindingdb, + title={BindingDB: a web-accessible database of experimentally determined protein-ligand binding affinities}, + author={Tiqing Liu, Yuhmei Lin, Xin Wen, Robert N. Jorissen, Micahel, K. 
Gilson}, + journal={Journal of Chemical Information and Modeling}, + volume={35}, + number={4}, + pages={D198-D201}, + year={2006}, + publisher={Oxford Academic} + } diff --git a/data/drug_target_interaction/transform.py b/data/drug_target_interaction/transform.py new file mode 100644 index 000000000..2401609f6 --- /dev/null +++ b/data/drug_target_interaction/transform.py @@ -0,0 +1,151 @@ +import pandas as pd +import yaml +from tdc.multi_pred import DTI + + +def get_and_transform_data(): + # get raw data + data = DTI(name="BindingDB_Kd") + splits = data.get_split() + df_train = splits["train"] + df_valid = splits["valid"] + df_test = splits["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" + + df = pd.concat([df_train, df_valid, df_test], axis=0) + + # check if fields are the same + fields_orig = df.columns.tolist() + assert fields_orig == [ + "Drug_ID", + "Drug", + "Target_ID", + "Target", + "Y", + "split", + ] + + # overwrite column names = fields + fields_clean = [ + "compound_name", + "SMILES", + "target_name", + "target_aa", + "binding", + "split", + ] + df.columns = fields_clean + + # data cleaning + assert not df.duplicated().sum() + + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + # create meta yaml + meta = { + "name": "drug-target interaction", # unique identifier, we will also use this for directory names + "description": """The activity of a small-molecule drug is measured by +its binding affinity with the target protein. Given a new target +protein, the very first step is to screen a set of potential compounds +to find their activity.Traditional method to gauge the affinities are +through high-throughput screening wet-lab experiments. However, they +are very expensive and are thus restricted by their abilities to search +over a large set of candidates. 
Drug-target interaction prediction task +aims to predict the interaction activity score in silico given only the +accessible compound structural information and protein amino acid +sequence.""", + "targets": [ + { + "id": "binding", # name of the column in a tabular dataset + "description": "small-molecule protein interaction", # description of what this column means + "units": "Kd", # units of the values in this column (leave empty if unitless) + "type": "continuous", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property (to sample from for building the prompts) + "drug-target interaction", + "small-molecule binding affinity", + "protein-ligand binding affinity", + ], + }, + ], + "identifiers": [ + { + "id": "SMILES", # column name + "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "OTHER" + "description": "small-molecule", # description (optional, except for "OTHER") + }, + { + "id": "target_name", + "type": "Other", + "names": [ + "target name", + ], + "description": "target amino acid name", + }, + { + "id": "target_aa", + "type": "Other", + "names": [ + "target amino acid sequence", + ], + "description": "target amino acid sequence", + }, + ], + "license": "CC BY 4.0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://tdcommons.ai/multi_pred_tasks/dti/", + "description": "original data set link", + }, + { + "url": "https://doi.org/10.1093/nar/gkl999", + "description": "corresponding publication", + }, + ], + "benchmarks": [ + { + "name": "TDC", + "link": "https://tdcommons.ai/", + "split_column": "split", + }, + ], + "num_points": len(df), # number of datapoints in this dataset + "bibtex": [ + """@article{Liu2006bindingdb, +title={BindingDB: a web-accessible database of experimentally determined protein-ligand binding affinities}, +author={Tiqing Liu, Yuhmei Lin, Xin Wen, Robert N. Jorissen, Micahel, K. 
Gilson}, +journal={Journal of Chemical Information and Modeling}, +volume={35}, +number={4}, +pages={D198-D201}, +year={2006}, +publisher={Oxford Academic} +}""", + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + + +if __name__ == "__main__": + get_and_transform_data() diff --git a/data/flashpoint/meta.yaml b/data/flashpoint/meta.yaml index 52c65a6ba..eeaada16c 100644 --- a/data/flashpoint/meta.yaml +++ b/data/flashpoint/meta.yaml @@ -14,8 +14,10 @@ targets: type: continuous names: - flash point + uris: + - http://semanticscience.org/resource/CHEMINF_000417 identifiers: - - id: smiles + - id: SMILES type: SMILES license: CC BY 4.0 num_points: 9878 # downloaded dataset has 14696 datapoints, but there are duplicate smiles diff --git a/data/flashpoint/transform.py b/data/flashpoint/transform.py index 6b6fc4497..98ff6dfe9 100644 --- a/data/flashpoint/transform.py +++ b/data/flashpoint/transform.py @@ -80,7 +80,10 @@ def get_and_transform_data( # These are the only output columns cols_to_write = ["smiles", "flashpoint"] - processed_df[cols_to_write].to_csv(output_data_path, index=False) + processed_df = processed_df[cols_to_write] + processed_df.columns = ["SMILES", "flashpoint"] + + processed_df.to_csv(output_data_path, index=False) print( f"Finished processing {dataset_name} dataset! 
({len(processed_df)} datapoints)" ) diff --git a/data/freesolv/example_processing_and_templates.ipynb b/data/freesolv/example_processing_and_templates.ipynb deleted file mode 100644 index 67dbba735..000000000 --- a/data/freesolv/example_processing_and_templates.ipynb +++ /dev/null @@ -1,1461 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "27c08f91-3fa0-4800-8f6a-96a96f665cad", - "metadata": {}, - "source": [ - "# FreeSolv dataset preparation" - ] - }, - { - "cell_type": "markdown", - "id": "6ef172b9-aad2-47da-bf4c-844a2a07ee8c", - "metadata": {}, - "source": [ - "Original data repository: https://github.com/MobleyLab/FreeSolv" - ] - }, - { - "cell_type": "markdown", - "id": "7d18c95d-2ec6-45e1-addc-54a890097b8e", - "metadata": {}, - "source": [ - "# Imports" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "cf59e3e9-8061-4022-9eae-e978311b4155", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import requests\n", - "import yaml" - ] - }, - { - "cell_type": "markdown", - "id": "a6751ff9-2e3e-4d01-8395-7a5ae0c200d7", - "metadata": {}, - "source": [ - "# Data processing" - ] - }, - { - "cell_type": "markdown", - "id": "a1169ad2-e4bb-41c6-9625-6d1644c44a5b", - "metadata": {}, - "source": [ - "## Download data" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "775f60af-1d44-49da-b082-50c5ad77e649", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "data_path = \"https://raw.githubusercontent.com/MobleyLab/FreeSolv/master/database.txt\"" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "7bb8eb5e-f513-40d2-a68c-7cda1a51ad31", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fn_data_original = \"data_original.txt\"" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "413f72f2-9f5e-4ba8-919d-7ff3908f745a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "data = 
requests.get(data_path)\n", - "with open(fn_data_original, \"wb\") as f:\n", - " f.write(data.content)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "43873fc3-20a8-487d-a7c5-33bd58414159", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!ls -lh" - ] - }, - { - "cell_type": "markdown", - "id": "d9cda29a-a133-4f0e-992b-e77c9070ee93", - "metadata": {}, - "source": [ - "## Load original data" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "77f614e7-b133-40bc-8759-2d930e4c120e", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "#Hydration free energy datbase v0.52, 6/11/17.\n", - "#Semicolon-delimited text file with fields in the following format:\n", - "# compound id (and file prefix); SMILES; iupac name (or alternative if IUPAC is unavailable or not parseable by OEChem); experimental value (kcal/mol); experimental uncertainty (kcal/mol); Mobley group calculated value (GAFF) (kcal/mol); calculated uncertainty (kcal/mol); experimental reference (original or paper this value was taken from); calculated reference; text notes.\n", - "mobley_1017962; CCCCCC(=O)OC; methyl hexanoate; -2.49; 0.60; -3.30; 0.03; 10.1021/ct050097l; 10.1021/acs.jced.7b00104; Experimental uncertainty not presently available, so assigned a default value. \n", - "mobley_1019269; CCCCO; butan-1-ol; -4.72; 0.60; -3.23; 0.03; 10.1021/ct050097l; 10.1021/acs.jced.7b00104; Experimental uncertainty not presently available, so assigned a default value. 
\n" - ] - } - ], - "source": [ - "!head -n 5 {fn_data_original}" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "8f5a0387-f9e3-4e1a-8d14-5df618195f70", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df = pd.read_csv(fn_data_original, delimiter=\";\", skiprows=2)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "55b0bd63-62a0-469e-9d8a-e9ada3fe01c4", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
# compound id (and file prefix)SMILESiupac name (or alternative if IUPAC is unavailable or not parseable by OEChem)experimental value (kcal/mol)experimental uncertainty (kcal/mol)Mobley group calculated value (GAFF) (kcal/mol)calculated uncertainty (kcal/mol)experimental reference (original or paper this value was taken from)calculated referencetext notes.
0mobley_1017962CCCCCC(=O)OCmethyl hexanoate-2.490.6-3.300.0310.1021/ct050097l10.1021/acs.jced.7b00104Experimental uncertainty not presently availa...
1mobley_1019269CCCCObutan-1-ol-4.720.6-3.230.0310.1021/ct050097l10.1021/acs.jced.7b00104Experimental uncertainty not presently availa...
2mobley_1034539c1cc(c(cc1c2cc(c(c(c2Cl)Cl)Cl)Cl)Cl)Cl1,2,3,4-tetrachloro-5-(3,4-dichlorophenyl)ben...-3.040.1-1.080.0410.1007/s10822-012-9568-810.1021/acs.jced.7b00104Experimental uncertainty not presently availa...
3mobley_1036761C1CCC(CC1)Ncyclohexanamine-4.590.6-3.950.0310.1021/ct050097l10.1021/acs.jced.7b00104Experimental uncertainty not presently availa...
4mobley_1046331c1ccc(cc1)OC=Ophenyl formate-3.820.6-5.440.03J. Peter Guthrie, unpublished data, as provid...10.1021/acs.jced.7b00104Experimental uncertainty not presently availa...
\n", - "
" - ], - "text/plain": [ - " # compound id (and file prefix) SMILES \\\n", - "0 mobley_1017962 CCCCCC(=O)OC \n", - "1 mobley_1019269 CCCCO \n", - "2 mobley_1034539 c1cc(c(cc1c2cc(c(c(c2Cl)Cl)Cl)Cl)Cl)Cl \n", - "3 mobley_1036761 C1CCC(CC1)N \n", - "4 mobley_1046331 c1ccc(cc1)OC=O \n", - "\n", - " iupac name (or alternative if IUPAC is unavailable or not parseable by OEChem) \\\n", - "0 methyl hexanoate \n", - "1 butan-1-ol \n", - "2 1,2,3,4-tetrachloro-5-(3,4-dichlorophenyl)ben... \n", - "3 cyclohexanamine \n", - "4 phenyl formate \n", - "\n", - " experimental value (kcal/mol) experimental uncertainty (kcal/mol) \\\n", - "0 -2.49 0.6 \n", - "1 -4.72 0.6 \n", - "2 -3.04 0.1 \n", - "3 -4.59 0.6 \n", - "4 -3.82 0.6 \n", - "\n", - " Mobley group calculated value (GAFF) (kcal/mol) \\\n", - "0 -3.30 \n", - "1 -3.23 \n", - "2 -1.08 \n", - "3 -3.95 \n", - "4 -5.44 \n", - "\n", - " calculated uncertainty (kcal/mol) \\\n", - "0 0.03 \n", - "1 0.03 \n", - "2 0.04 \n", - "3 0.03 \n", - "4 0.03 \n", - "\n", - " experimental reference (original or paper this value was taken from) \\\n", - "0 10.1021/ct050097l \n", - "1 10.1021/ct050097l \n", - "2 10.1007/s10822-012-9568-8 \n", - "3 10.1021/ct050097l \n", - "4 J. Peter Guthrie, unpublished data, as provid... \n", - "\n", - " calculated reference \\\n", - "0 10.1021/acs.jced.7b00104 \n", - "1 10.1021/acs.jced.7b00104 \n", - "2 10.1021/acs.jced.7b00104 \n", - "3 10.1021/acs.jced.7b00104 \n", - "4 10.1021/acs.jced.7b00104 \n", - "\n", - " text notes. \n", - "0 Experimental uncertainty not presently availa... \n", - "1 Experimental uncertainty not presently availa... \n", - "2 Experimental uncertainty not presently availa... \n", - "3 Experimental uncertainty not presently availa... \n", - "4 Experimental uncertainty not presently availa... 
" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "95158ac1-05d7-4a21-b8e4-7f720306d331", - "metadata": {}, - "source": [ - "## Add column = field names\n", - "Clean column names (`fields_clean`) and keep original names (`fields_orig`)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "ec2458e5-455f-4f03-8ce9-c0d12e9ed371", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['# compound id (and file prefix)',\n", - " ' SMILES',\n", - " ' iupac name (or alternative if IUPAC is unavailable or not parseable by OEChem)',\n", - " ' experimental value (kcal/mol)',\n", - " ' experimental uncertainty (kcal/mol)',\n", - " ' Mobley group calculated value (GAFF) (kcal/mol)',\n", - " ' calculated uncertainty (kcal/mol)',\n", - " ' experimental reference (original or paper this value was taken from)',\n", - " ' calculated reference',\n", - " ' text notes.']" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fields_orig = df.columns.tolist()\n", - "fields_orig" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "a46dd8ff-37b3-4894-8226-3bf98226dd09", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fields_clean = [\n", - " \"compound_id\",\n", - " \"SMILES\",\n", - " \"iupac_name\",\n", - " \"exp_value\",\n", - " \"exp_uncertainty\",\n", - " \"GAFF\",\n", - " \"calc_uncertainty\",\n", - " \"exp_ref\",\n", - " \"calc_reference\",\n", - " \"notes\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "785d37cb-1fb4-4a91-a923-d5a78a37f36a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df.columns = fields_clean" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "1bf212cb-1653-457b-9f5d-416d4dd14b53", - "metadata": { - "tags": [] - }, - "outputs": [ 
- { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
compound_idSMILESiupac_nameexp_valueexp_uncertaintyGAFFcalc_uncertaintyexp_refcalc_referencenotes
0mobley_1017962CCCCCC(=O)OCmethyl hexanoate-2.490.6-3.300.0310.1021/ct050097l10.1021/acs.jced.7b00104Experimental uncertainty not presently availa...
1mobley_1019269CCCCObutan-1-ol-4.720.6-3.230.0310.1021/ct050097l10.1021/acs.jced.7b00104Experimental uncertainty not presently availa...
2mobley_1034539c1cc(c(cc1c2cc(c(c(c2Cl)Cl)Cl)Cl)Cl)Cl1,2,3,4-tetrachloro-5-(3,4-dichlorophenyl)ben...-3.040.1-1.080.0410.1007/s10822-012-9568-810.1021/acs.jced.7b00104Experimental uncertainty not presently availa...
3mobley_1036761C1CCC(CC1)Ncyclohexanamine-4.590.6-3.950.0310.1021/ct050097l10.1021/acs.jced.7b00104Experimental uncertainty not presently availa...
4mobley_1046331c1ccc(cc1)OC=Ophenyl formate-3.820.6-5.440.03J. Peter Guthrie, unpublished data, as provid...10.1021/acs.jced.7b00104Experimental uncertainty not presently availa...
\n", - "
" - ], - "text/plain": [ - " compound_id SMILES \\\n", - "0 mobley_1017962 CCCCCC(=O)OC \n", - "1 mobley_1019269 CCCCO \n", - "2 mobley_1034539 c1cc(c(cc1c2cc(c(c(c2Cl)Cl)Cl)Cl)Cl)Cl \n", - "3 mobley_1036761 C1CCC(CC1)N \n", - "4 mobley_1046331 c1ccc(cc1)OC=O \n", - "\n", - " iupac_name exp_value \\\n", - "0 methyl hexanoate -2.49 \n", - "1 butan-1-ol -4.72 \n", - "2 1,2,3,4-tetrachloro-5-(3,4-dichlorophenyl)ben... -3.04 \n", - "3 cyclohexanamine -4.59 \n", - "4 phenyl formate -3.82 \n", - "\n", - " exp_uncertainty GAFF calc_uncertainty \\\n", - "0 0.6 -3.30 0.03 \n", - "1 0.6 -3.23 0.03 \n", - "2 0.1 -1.08 0.04 \n", - "3 0.6 -3.95 0.03 \n", - "4 0.6 -5.44 0.03 \n", - "\n", - " exp_ref \\\n", - "0 10.1021/ct050097l \n", - "1 10.1021/ct050097l \n", - "2 10.1007/s10822-012-9568-8 \n", - "3 10.1021/ct050097l \n", - "4 J. Peter Guthrie, unpublished data, as provid... \n", - "\n", - " calc_reference \\\n", - "0 10.1021/acs.jced.7b00104 \n", - "1 10.1021/acs.jced.7b00104 \n", - "2 10.1021/acs.jced.7b00104 \n", - "3 10.1021/acs.jced.7b00104 \n", - "4 10.1021/acs.jced.7b00104 \n", - "\n", - " notes \n", - "0 Experimental uncertainty not presently availa... \n", - "1 Experimental uncertainty not presently availa... \n", - "2 Experimental uncertainty not presently availa... \n", - "3 Experimental uncertainty not presently availa... \n", - "4 Experimental uncertainty not presently availa... 
" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "1bfaeb22-26fb-4964-a71f-cae8335e5372", - "metadata": {}, - "source": [ - "## Data cleaning" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "7e746003-cb1f-434f-bba6-00f0c439c4ac", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df.notes = df.notes.str.strip() # remove leading and trailing white space characters" - ] - }, - { - "cell_type": "markdown", - "id": "bc6f52c1-e0f6-48b3-95f4-e36d9a5ecde8", - "metadata": {}, - "source": [ - "## Save to csv" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "d6d5efa5-b4b4-4a25-8626-e10f3d691e83", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fn_data_csv = \"data_clean.csv\"" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "727f8d7b-cbb6-43c7-9eab-9d4d65be6b3f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df.to_csv(fn_data_csv, index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "63c8d4a4-906e-418d-be39-879365b4dfa0", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!ls -lh {fn_data_csv}" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "a51b9001-25d7-4e0e-a607-477cfc4a9f1c", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "compound_id,SMILES,iupac_name,exp_value,exp_uncertainty,GAFF,calc_uncertainty,exp_ref,calc_reference,notes\n", - "mobley_1017962, CCCCCC(=O)OC, methyl hexanoate,-2.49,0.6,-3.3,0.03, 10.1021/ct050097l, 10.1021/acs.jced.7b00104,\"Experimental uncertainty not presently available, so assigned a default value.\"\n", - "mobley_1019269, CCCCO, butan-1-ol,-4.72,0.6,-3.23,0.03, 10.1021/ct050097l, 10.1021/acs.jced.7b00104,\"Experimental uncertainty not presently available, so assigned 
a default value.\"\n", - "mobley_1034539, c1cc(c(cc1c2cc(c(c(c2Cl)Cl)Cl)Cl)Cl)Cl,\" 1,2,3,4-tetrachloro-5-(3,4-dichlorophenyl)benzene\",-3.04,0.1,-1.08,0.04, 10.1007/s10822-012-9568-8, 10.1021/acs.jced.7b00104,\"Experimental uncertainty not presently available, so assigned a default value.\"\n", - "mobley_1036761, C1CCC(CC1)N, cyclohexanamine,-4.59,0.6,-3.95,0.03, 10.1021/ct050097l, 10.1021/acs.jced.7b00104,\"Experimental uncertainty not presently available, so assigned a default value.\"\n" - ] - } - ], - "source": [ - "!head -n 5 {fn_data_csv}" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "1a512943-4909-4d56-867d-50c151d8d607", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
compound_idSMILESiupac_nameexp_valueexp_uncertaintyGAFFcalc_uncertaintyexp_refcalc_referencenotes
0mobley_1017962CCCCCC(=O)OCmethyl hexanoate-2.490.6-3.300.0310.1021/ct050097l10.1021/acs.jced.7b00104Experimental uncertainty not presently availab...
1mobley_1019269CCCCObutan-1-ol-4.720.6-3.230.0310.1021/ct050097l10.1021/acs.jced.7b00104Experimental uncertainty not presently availab...
2mobley_1034539c1cc(c(cc1c2cc(c(c(c2Cl)Cl)Cl)Cl)Cl)Cl1,2,3,4-tetrachloro-5-(3,4-dichlorophenyl)ben...-3.040.1-1.080.0410.1007/s10822-012-9568-810.1021/acs.jced.7b00104Experimental uncertainty not presently availab...
3mobley_1036761C1CCC(CC1)Ncyclohexanamine-4.590.6-3.950.0310.1021/ct050097l10.1021/acs.jced.7b00104Experimental uncertainty not presently availab...
4mobley_1046331c1ccc(cc1)OC=Ophenyl formate-3.820.6-5.440.03J. Peter Guthrie, unpublished data, as provid...10.1021/acs.jced.7b00104Experimental uncertainty not presently availab...
\n", - "
" - ], - "text/plain": [ - " compound_id SMILES \\\n", - "0 mobley_1017962 CCCCCC(=O)OC \n", - "1 mobley_1019269 CCCCO \n", - "2 mobley_1034539 c1cc(c(cc1c2cc(c(c(c2Cl)Cl)Cl)Cl)Cl)Cl \n", - "3 mobley_1036761 C1CCC(CC1)N \n", - "4 mobley_1046331 c1ccc(cc1)OC=O \n", - "\n", - " iupac_name exp_value \\\n", - "0 methyl hexanoate -2.49 \n", - "1 butan-1-ol -4.72 \n", - "2 1,2,3,4-tetrachloro-5-(3,4-dichlorophenyl)ben... -3.04 \n", - "3 cyclohexanamine -4.59 \n", - "4 phenyl formate -3.82 \n", - "\n", - " exp_uncertainty GAFF calc_uncertainty \\\n", - "0 0.6 -3.30 0.03 \n", - "1 0.6 -3.23 0.03 \n", - "2 0.1 -1.08 0.04 \n", - "3 0.6 -3.95 0.03 \n", - "4 0.6 -5.44 0.03 \n", - "\n", - " exp_ref \\\n", - "0 10.1021/ct050097l \n", - "1 10.1021/ct050097l \n", - "2 10.1007/s10822-012-9568-8 \n", - "3 10.1021/ct050097l \n", - "4 J. Peter Guthrie, unpublished data, as provid... \n", - "\n", - " calc_reference \\\n", - "0 10.1021/acs.jced.7b00104 \n", - "1 10.1021/acs.jced.7b00104 \n", - "2 10.1021/acs.jced.7b00104 \n", - "3 10.1021/acs.jced.7b00104 \n", - "4 10.1021/acs.jced.7b00104 \n", - "\n", - " notes \n", - "0 Experimental uncertainty not presently availab... \n", - "1 Experimental uncertainty not presently availab... \n", - "2 Experimental uncertainty not presently availab... \n", - "3 Experimental uncertainty not presently availab... \n", - "4 Experimental uncertainty not presently availab... 
" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "f3d730ce-fce0-49df-9eb8-b917e945fa9a", - "metadata": {}, - "source": [ - "## Load from csv" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "id": "077b0c5f-8772-4879-9317-3fa28799689b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fn_data_csv = \"data_clean.csv\"" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "id": "6eaef0e6-2115-4793-ac43-a196b25d47a0", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df = pd.read_csv(fn_data_csv)" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "id": "43619e7c-9c82-4ff0-ae25-403861304635", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
compound_idSMILESiupac_nameexp_valueexp_uncertaintyGAFFcalc_uncertaintyexp_refcalc_referencenotes
0mobley_1017962CCCCCC(=O)OCmethyl hexanoate-2.490.6-3.300.0310.1021/ct050097l10.1021/acs.jced.7b00104Experimental uncertainty not presently availab...
1mobley_1019269CCCCObutan-1-ol-4.720.6-3.230.0310.1021/ct050097l10.1021/acs.jced.7b00104Experimental uncertainty not presently availab...
2mobley_1034539c1cc(c(cc1c2cc(c(c(c2Cl)Cl)Cl)Cl)Cl)Cl1,2,3,4-tetrachloro-5-(3,4-dichlorophenyl)ben...-3.040.1-1.080.0410.1007/s10822-012-9568-810.1021/acs.jced.7b00104Experimental uncertainty not presently availab...
3mobley_1036761C1CCC(CC1)Ncyclohexanamine-4.590.6-3.950.0310.1021/ct050097l10.1021/acs.jced.7b00104Experimental uncertainty not presently availab...
4mobley_1046331c1ccc(cc1)OC=Ophenyl formate-3.820.6-5.440.03J. Peter Guthrie, unpublished data, as provid...10.1021/acs.jced.7b00104Experimental uncertainty not presently availab...
\n", - "
" - ], - "text/plain": [ - " compound_id SMILES \\\n", - "0 mobley_1017962 CCCCCC(=O)OC \n", - "1 mobley_1019269 CCCCO \n", - "2 mobley_1034539 c1cc(c(cc1c2cc(c(c(c2Cl)Cl)Cl)Cl)Cl)Cl \n", - "3 mobley_1036761 C1CCC(CC1)N \n", - "4 mobley_1046331 c1ccc(cc1)OC=O \n", - "\n", - " iupac_name exp_value \\\n", - "0 methyl hexanoate -2.49 \n", - "1 butan-1-ol -4.72 \n", - "2 1,2,3,4-tetrachloro-5-(3,4-dichlorophenyl)ben... -3.04 \n", - "3 cyclohexanamine -4.59 \n", - "4 phenyl formate -3.82 \n", - "\n", - " exp_uncertainty GAFF calc_uncertainty \\\n", - "0 0.6 -3.30 0.03 \n", - "1 0.6 -3.23 0.03 \n", - "2 0.1 -1.08 0.04 \n", - "3 0.6 -3.95 0.03 \n", - "4 0.6 -5.44 0.03 \n", - "\n", - " exp_ref \\\n", - "0 10.1021/ct050097l \n", - "1 10.1021/ct050097l \n", - "2 10.1007/s10822-012-9568-8 \n", - "3 10.1021/ct050097l \n", - "4 J. Peter Guthrie, unpublished data, as provid... \n", - "\n", - " calc_reference \\\n", - "0 10.1021/acs.jced.7b00104 \n", - "1 10.1021/acs.jced.7b00104 \n", - "2 10.1021/acs.jced.7b00104 \n", - "3 10.1021/acs.jced.7b00104 \n", - "4 10.1021/acs.jced.7b00104 \n", - "\n", - " notes \n", - "0 Experimental uncertainty not presently availab... \n", - "1 Experimental uncertainty not presently availab... \n", - "2 Experimental uncertainty not presently availab... \n", - "3 Experimental uncertainty not presently availab... \n", - "4 Experimental uncertainty not presently availab... 
" - ] - }, - "execution_count": 61, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "2f06e57c-02c5-493b-af65-c8bb9ac59421", - "metadata": {}, - "source": [ - "# meta YAML" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "id": "d3890961-444e-4a26-b8fc-ed8c4e959af9", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "meta = {\n", - " \"name\": \"freesolv\", # unique identifier, we will also use this for directory names\n", - " \"description\": \"Experimental and calculated small molecule hydration free energies\",\n", - " \"targets\": [\n", - " {\n", - " \"id\": \"exp_value\", # name of the column in a tabular dataset\n", - " \"description\": \"experimental hydration free energy value\", # description of what this column means\n", - " \"units\": \"kcal/mol\", # units of the values in this column (leave empty if unitless)\n", - " \"type\": \"continuous\", # can be \"categorical\", \"ordinal\", \"continuous\"\n", - " \"names\": [ # names for the property (to sample from for building the prompts)\n", - " \"hydration free energy\",\n", - " ],\n", - " },\n", - " {\n", - " \"id\": \"exp_uncertainty\",\n", - " \"description\": \"experimental hydration free energy uncertainty\",\n", - " \"units\": \"kcal/mol\",\n", - " \"type\": \"continuous\",\n", - " \"names\": [\n", - " \"hydration free energy uncertainty\",\n", - " ],\n", - " },\n", - " {\n", - " \"id\": \"GAFF\", # name of the column in a tabular dataset\n", - " \"description\": \"mobley group calculated value\", # description of what this column means\n", - " \"units\": \"kcal/mol\", # units of the values in this column (leave empty if unitless)\n", - " \"type\": \"continuous\", # can be \"categorical\", \"ordinal\", \"continuous\"\n", - " \"names\": [ # names for the property (to sample from for building the prompts)\n", - " \"GAFF\",\n", - " \"mobley group calculated value\",\n", - " ],\n", - " 
},\n", - " {\n", - " \"id\": \"calc_uncertainty\",\n", - " \"description\": \"mobley group calculated value calculated uncertainty\",\n", - " \"units\": \"kcal/mol\",\n", - " \"type\": \"continuous\",\n", - " \"names\": [\n", - " \"GAFF uncertainty\",\n", - " \"mobley group calculated value uncertainty\",\n", - " ],\n", - " },\n", - " ],\n", - " \"identifiers\": [\n", - " {\n", - " \"id\": \"SMILES\", # column name\n", - " \"type\": \"SMILES\", # can be \"SMILES\", \"SELFIES\", \"IUPAC\", \"OTHER\"\n", - " \"description\": \"SMILES\", # description (optional, except for \"OTHER\")\n", - " },\n", - " {\n", - " \"id\": \"iupac_name\",\n", - " \"type\": \"IUPAC\",\n", - " \"description\": \"IUPAC\",\n", - " },\n", - " ],\n", - " \"license\": \"CC BY-NC-SA 4.0\", # license under which the original dataset was published\n", - " \"links\": [ # list of relevant links (original dataset, other uses, etc.)\n", - " {\n", - " \"url\": \"https://github.com/MobleyLab/FreeSolv\",\n", - " \"description\": \"issue tracker and source data\",\n", - " },\n", - " {\n", - " \"url\": \"https://escholarship.org/uc/item/6sd403pz\",\n", - " \"description\": \"repository with data\",\n", - " },\n", - " ],\n", - " \"num_points\": len(df), # number of datapoints in this dataset\n", - "f\n", - " \"bibtex\": [\n", - " \"\"\"@article{mobley2013experimental,\n", - " title={Experimental and calculated small molecule hydration free energies},\n", - " author={Mobley, David L},\n", - " year={2013}\"\"\",\n", - " ],\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "580bbd79-4845-4515-be94-3e4a9815d048", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fn_meta = \"meta.yaml\"" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "873fa5dd-9b60-40f5-b537-4d7a206414ea", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "with open(fn_meta, \"w\") as f:\n", - " yaml.dump(meta, f, sort_keys=False)" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "id": "d01686c0-6746-4fc4-b019-350270dfc26f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!ls -lh {fn_meta}" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "id": "ef6063c5-7a8b-4344-bccf-a073443feebf", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "bibtex: \"@article{mobley2013experimental,\\n title={Experimental and calculated\\\n", - " \\ small molecule hydration free energies},\\n author={Mobley, David L},\\n year={2013}\\n\\\n", - " \\ \"\n", - "description: Experimental and calculated small molecule hydration free energies\n", - "identifiers:\n", - "- description: SMILES\n", - " id: SMILES\n", - " type: SMILES\n", - "- description: IUPAC\n", - " id: iupac_name\n", - " type: IUPAC\n", - "license: CC BY-NC-SA 4.0\n", - "links:\n", - "- https://github.com/MobleyLab/FreeSolv\n", - "- https://escholarship.org/uc/item/6sd403pz\n", - "name: freesolv\n", - "num_points: 642\n", - "targets:\n", - "- description: experimental hydration free energy value\n", - " id: exp_value\n", - " names:\n", - " - hydration free energy\n", - " type: continuous\n", - " units: kcal/mol\n", - "- description: experimental hydration free energy uncertainty\n", - " id: exp_uncertainty\n", - " names:\n", - " - hydration free energy uncertainty\n", - " type: continuos\n", - " units: kcal/mol\n", - "- description: mobley group calculated value\n", - " id: GAFF\n", - " names:\n", - " - GAFF\n", - " - mobley group calculated value\n", - " type: continuous\n", - " units: kcal/mol\n", - "- description: mobley group calculated value calculated uncertainty\n", - " id: calc_uncertainty\n", - " names:\n", - " - GAFF uncertainty\n", - " - mobley group calculated value uncertainty\n", - " type: continuos\n", - " units: kcal/mol\n", - "url: https://github.com/MobleyLab/FreeSolv\n" - ] - } - ], - "source": [ - "!cat {fn_meta}" - ] - }, - { - "cell_type": 
"markdown", - "id": "bd3f930a-638b-4bb7-a1d2-80688f2f6891", - "metadata": {}, - "source": [ - "# create transform.py" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "9aab00fd-58a8-40b0-be30-1e269e0d323b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "path_file = \"transform.py\"" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8368bb20-8e1c-4b7d-b0e2-b39da36b5972", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Overwriting transform.py\n" - ] - } - ], - "source": [ - "%%writefile $path_file\n", - "import pandas as pd\n", - "import requests\n", - "import yaml\n", - "\n", - "\n", - "def get_and_transform_data():\n", - " # get raw data\n", - " data_path = (\n", - " \"https://raw.githubusercontent.com/MobleyLab/FreeSolv/master/database.txt\"\n", - " )\n", - " fn_data_original = \"data_original.txt\"\n", - " data = requests.get(data_path)\n", - " with open(fn_data_original, \"wb\") as f:\n", - " f.write(data.content)\n", - "\n", - " # create dataframe\n", - " df = pd.read_csv(fn_data_original, delimiter=\";\", skiprows=2)\n", - "\n", - " # check if fields are the same\n", - " fields_orig = df.columns.tolist()\n", - " assert fields_orig == [\n", - " \"# compound id (and file prefix)\",\n", - " \" SMILES\",\n", - " \" iupac name (or alternative if IUPAC is unavailable or not parseable by OEChem)\",\n", - " \" experimental value (kcal/mol)\",\n", - " \" experimental uncertainty (kcal/mol)\",\n", - " \" Mobley group calculated value (GAFF) (kcal/mol)\",\n", - " \" calculated uncertainty (kcal/mol)\",\n", - " \" experimental reference (original or paper this value was taken from)\",\n", - " \" calculated reference\",\n", - " \" text notes.\",\n", - " ]\n", - "\n", - " # overwrite column names = fields\n", - " fields_clean = [\n", - " \"compound_id\",\n", - " \"SMILES\",\n", - " \"iupac_name\",\n", - " \"exp_value\",\n", - " 
\"exp_uncertainty\",\n", - " \"GAFF\",\n", - " \"calc_uncertainty\",\n", - " \"exp_ref\",\n", - " \"calc_reference\",\n", - " \"notes\",\n", - " ]\n", - " df.columns = fields_clean\n", - "\n", - " # data cleaning\n", - " df.notes = (\n", - " df.notes.str.strip()\n", - " ) # remove leading and trailing white space characters\n", - "\n", - " # save to csv\n", - " fn_data_csv = \"data_clean.csv\"\n", - " df.to_csv(fn_data_csv, index=False)\n", - "\n", - " # create meta yaml\n", - " meta = {\n", - " \"name\": \"freesolv\", # unique identifier, we will also use this for directory names\n", - " \"description\": \"Experimental and calculated small molecule hydration free energies\",\n", - " \"targets\": [\n", - " {\n", - " \"id\": \"exp_value\", # name of the column in a tabular dataset\n", - " \"description\": \"experimental hydration free energy value\", # description of what this column means\n", - " \"units\": \"kcal/mol\", # units of the values in this column (leave empty if unitless)\n", - " \"type\": \"continuous\", # can be \"categorical\", \"ordinal\", \"continuous\"\n", - " \"names\": [ # names for the property (to sample from for building the prompts)\n", - " \"hydration free energy\",\n", - " ],\n", - " },\n", - " {\n", - " \"id\": \"exp_uncertainty\",\n", - " \"description\": \"experimental hydration free energy uncertainty\",\n", - " \"units\": \"kcal/mol\",\n", - " \"type\": \"continuous\",\n", - " \"names\": [\n", - " \"hydration free energy uncertainty\",\n", - " ],\n", - " },\n", - " {\n", - " \"id\": \"GAFF\", # name of the column in a tabular dataset\n", - " \"description\": \"mobley group calculated value\", # description of what this column means\n", - " \"units\": \"kcal/mol\", # units of the values in this column (leave empty if unitless)\n", - " \"type\": \"continuous\", # can be \"categorical\", \"ordinal\", \"continuous\"\n", - " \"names\": [ # names for the property (to sample from for building the prompts)\n", - " \"GAFF\",\n", - " \"mobley 
group calculated value\",\n", - " ],\n", - " },\n", - " {\n", - " \"id\": \"calc_uncertainty\",\n", - " \"description\": \"mobley group calculated value calculated uncertainty\",\n", - " \"units\": \"kcal/mol\",\n", - " \"type\": \"continuous\",\n", - " \"names\": [\n", - " \"GAFF uncertainty\",\n", - " \"mobley group calculated value uncertainty\",\n", - " ],\n", - " },\n", - " ],\n", - " \"identifiers\": [\n", - " {\n", - " \"id\": \"SMILES\", # column name\n", - " \"type\": \"SMILES\", # can be \"SMILES\", \"SELFIES\", \"IUPAC\", \"OTHER\"\n", - " \"description\": \"SMILES\", # description (optional, except for \"OTHER\")\n", - " },\n", - " {\n", - " \"id\": \"iupac_name\",\n", - " \"type\": \"IUPAC\",\n", - " \"description\": \"IUPAC\",\n", - " },\n", - " ],\n", - " \"license\": \"CC BY-NC-SA 4.0\", # license under which the original dataset was published\n", - " \"links\": [ # list of relevant links (original dataset, other uses, etc.)\n", - " {\n", - " \"url\": \"https://github.com/MobleyLab/FreeSolv\",\n", - " \"description\": \"issue tracker and source data\",\n", - " },\n", - " {\n", - " \"url\": \"https://escholarship.org/uc/item/6sd403pz\",\n", - " \"description\": \"repository with data\",\n", - " },\n", - " ],\n", - " \"num_points\": len(df), # number of datapoints in this dataset\n", - " \"bibtex\": [\n", - " \"\"\"@article{mobley2013experimental,\n", - " title={Experimental and calculated small molecule hydration free energies},\n", - " author={Mobley, David L},\n", - " year={2013}\"\"\",\n", - " ],\n", - " }\n", - " fn_meta = \"meta.yaml\"\n", - " with open(fn_meta, \"w\") as f:\n", - " yaml.dump(meta, f, sort_keys=False)\n", - "\n", - " print(f\"Finished processing {meta['name']} dataset!\")\n", - "\n", - "\n", - "if __name__ == \"__main__\":\n", - " get_and_transform_data()" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "id": "d0474f26-70f3-4655-b81a-df4ada90e7a6", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "Finished processing freesolv dataset!\n" - ] - } - ], - "source": [ - "!python3 transform.py" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "953e7bee-bd5e-41d0-a2be-506e0bc97727", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "ls -lh # fmt: skip" - ] - }, - { - "cell_type": "markdown", - "id": "0b08ed06-ba66-4f76-bde1-368ea77d1739", - "metadata": {}, - "source": [ - "# End" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.16" - }, - "vscode": { - "interpreter": { - "hash": "99197aebba4a0a15729e9a2e093487ccaa59e252e46f9644f5ce2641cc66b01f" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/data/freesolv/meta.yaml b/data/freesolv/meta.yaml index 524270b04..1ddc326d9 100644 --- a/data/freesolv/meta.yaml +++ b/data/freesolv/meta.yaml @@ -19,15 +19,13 @@ targets: units: kcal/mol type: continuous names: - - GAFF - - mobley group calculated value + - hydration free energy computed using the GAFF force field - id: calc_uncertainty description: mobley group calculated value calculated uncertainty units: kcal/mol type: continuous names: - - GAFF uncertainty - - mobley group calculated value uncertainty + - uncertainty in hydration free energy computed using the GAFF force field identifiers: - id: SMILES type: SMILES @@ -42,7 +40,5 @@ links: - url: https://escholarship.org/uc/item/6sd403pz description: repository with data num_points: 642 -url: https://github.com/MobleyLab/FreeSolv bibtex: - - "@article{mobley2013experimental,\n title={Experimental and calculated small molecule hydration free energies},\n author={Mobley,\ - \ David 
L},\n year={2013}" + - "@article{mobley2013experimental,\ntitle={Experimental and calculated small molecule hydration free energies},\nauthor={Mobley, David L},\nyear={2013}" diff --git a/data/freesolv/transform.py b/data/freesolv/transform.py index 348f934a8..9fad5b0b8 100644 --- a/data/freesolv/transform.py +++ b/data/freesolv/transform.py @@ -84,8 +84,7 @@ def get_and_transform_data(): "units": "kcal/mol", # units of the values in this column (leave empty if unitless) "type": "continuous", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) - "GAFF", - "mobley group calculated value", + "hydration free energy computed using the GAFF force field", ], }, { @@ -94,8 +93,7 @@ def get_and_transform_data(): "units": "kcal/mol", "type": "continuous", "names": [ - "GAFF uncertainty", - "mobley group calculated value uncertainty", + "uncertainty in hydration free energy computed using the GAFF force field", ], }, ], @@ -125,9 +123,9 @@ def get_and_transform_data(): "num_points": len(df), # number of datapoints in this dataset "bibtex": [ """@article{mobley2013experimental, - title={Experimental and calculated small molecule hydration free energies}, - author={Mobley, David L}, - year={2013}""", +title={Experimental and calculated small molecule hydration free energies}, +author={Mobley, David L}, +year={2013}""", ], } fn_meta = "meta.yaml" diff --git a/data/kcnq2_potassium_channel_butkiewicz/meta.yaml b/data/kcnq2_potassium_channel_butkiewicz/meta.yaml new file mode 100644 index 000000000..572359a62 --- /dev/null +++ b/data/kcnq2_potassium_channel_butkiewicz/meta.yaml @@ -0,0 +1,89 @@ +--- +name: kcnq2_potassium_channel_butkiewicz +description: |- + This dataset was initially curated from HTS data at + the PubChem database. Details are reported by Butkiewicz et al. (2013). + Primary screen AID 2239, AID 2287 validated active compounds to be + potentiators. 
Counter screens are AID 2282, AID 2283, and AID 2558. + Final set of 213 active compounds was acquired by removing the active + compounds of AID 2282, AID 2283 and AID 2558 from the confirmatory + screen active set of compounds (AID 2287). +targets: + - id: activity_kcnq2_potassium_channel + description: whether it is active against kcnq2 potassium channel receptor (1) or not (0). + units: + type: boolean + names: + - inhibitor of the kcnq2 potassium channel activity + - displaying activity against the kcnq2 potassium channel + pubchem_aids: + - 2239 + - 2287 + - 2282 + - 2283 + - 2558 + uris: [] +identifiers: + - id: SMILES + type: SMILES + description: SMILES +license: CC BY 4.0 +links: + - url: https://doi.org/10.3390/molecules18010735 + description: corresponding publication + - url: https://doi.org/10.1093/nar/gky1033 + description: corresponding publication + - url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/ + description: corresponding publication +benchmarks: + - name: TDC + link: https://tdcommons.ai/ + split_column: split +num_points: 302405 +url: https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al +bibtex: + - |- + @article{Butkiewicz2013, + doi = {10.3390/molecules18010735}, + url = {https://doi.org/10.3390/molecules18010735}, + year = {2013}, + month = jan, + publisher = {{MDPI} {AG}}, + volume = {18}, + number = {1}, + pages = {735--756}, + author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and + Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens + Meiler}, + title = {Benchmarking Ligand-Based Virtual High-Throughput + Screening with the {PubChem} Database}, + journal = {Molecules}} + - |- + @article{Kim2018, + doi = {10.1093/nar/gky1033}, + url = {https://doi.org/10.1093/nar/gky1033}, + year = {2018}, + month = oct, + publisher = {Oxford University Press ({OUP})}, + volume = {47}, + number = {D1}, + pages = {D1102--D1109}, + author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta + Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin + A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky + and Jian Zhang and Evan E Bolton}, + title = {{PubChem} 2019 update: improved access to chemical data}, + journal = {Nucleic Acids Research}} + - |- + @article{Butkiewicz2017, + doi = {}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, + year = {2017}, + publisher = {Chem Inform}, + volume = {3}, + number = {1}, + author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, + E. W. and Weaver, D. C. 
and Meiler, J.}, + title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from + the {P}ub{C}hem {D}atabase}}, + journal = {Chemical Science}} diff --git a/data/kcnq2_potassium_channel_butkiewicz/transform.py b/data/kcnq2_potassium_channel_butkiewicz/transform.py new file mode 100644 index 000000000..d27ba4264 --- /dev/null +++ b/data/kcnq2_potassium_channel_butkiewicz/transform.py @@ -0,0 +1,165 @@ +import pandas as pd +import yaml +from tdc.single_pred import HTS + + +def get_and_transform_data(): + # get raw data + label = "kcnq2_potassium_channel_butkiewicz" + splits = HTS(name=label).get_split() + df_train = splits["train"] + df_valid = splits["valid"] + df_test = splits["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" + + df = pd.concat([df_train, df_valid, df_test], axis=0) + + # create dataframe + # check if fields are the same + fields_orig = df.columns.tolist() + assert fields_orig == [ + "Drug_ID", + "Drug", + "Y", + "split", + ] + + # overwrite column names = fields + fields_clean = [ + "compound_id", + "SMILES", + "activity_kcnq2_potassium_channel", + "split", + ] + df.columns = fields_clean + + assert not df.duplicated().sum() + + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + # create meta yaml + meta = { + "name": "kcnq2_potassium_channel_butkiewicz", # unique identifier, we will also use this for directory names + "description": """This dataset was initially curated from HTS data at +the PubChem database. Details are reported by Butkiewicz et al. (2013). +Primary screen AID 2239, AID 2287 validated active compounds to be +potentiators. Counter screens are AID 2282, AID 2283, and AID 2558. 
+Final set of 213 active compounds was acquired by removing the active +compounds of AID 2282, AID 2283 and AID 2558 from the confirmatory +screen active set of compounds (AID 2287).""", + "targets": [ + { + "id": "activity_kcnq2_potassium_channel", + "description": "whether it is active against kcnq2 potassium channel receptor (1) or not (0).", + "units": None, + "type": "boolean", + "names": [ + "inhibitor of the kcnq2 potassium channel activity", + "displaying activity against the kcnq2 potassium channel", + ], + "pubchem_aids": [2239, 2287, 2282, 2283, 2558], + "uris": [], + }, + ], + "identifiers": [ + { + "id": "SMILES", # column name + "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "SMILES", # description (optional, except for "Other") + }, + ], + "license": "CC BY 4.0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://doi.org/10.3390/molecules18010735", + "description": "corresponding publication", + }, + { + "url": "https://doi.org/10.1093/nar/gky1033", + "description": "corresponding publication", + }, + { + "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/", + "description": "corresponding publication", + }, + ], + "benchmarks": [ + { + "name": "TDC", + "link": "https://tdcommons.ai/", + "split_column": "split", + } + ], + "num_points": len(df), # number of datapoints in this dataset + "url": "https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al", + "bibtex": [ + """@article{Butkiewicz2013, +doi = {10.3390/molecules18010735}, +url = {https://doi.org/10.3390/molecules18010735}, +year = {2013}, +month = jan, +publisher = {{MDPI} {AG}}, +volume = {18}, +number = {1}, +pages = {735--756}, +author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and +Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens +Meiler}, +title = {Benchmarking Ligand-Based Virtual High-Throughput +Screening with the {PubChem} Database}, +journal = {Molecules}}""", + """@article{Kim2018, +doi = {10.1093/nar/gky1033}, +url = {https://doi.org/10.1093/nar/gky1033}, +year = {2018}, +month = oct, +publisher = {Oxford University Press ({OUP})}, +volume = {47}, +number = {D1}, +pages = {D1102--D1109}, +author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta +Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin +A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky +and Jian Zhang and Evan E Bolton}, +title = {{PubChem} 2019 update: improved access to chemical data}, +journal = {Nucleic Acids Research}}""", + """@article{Butkiewicz2017, +doi = {}, +url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, +year = {2017}, +publisher = {Chem Inform}, +volume = {3}, +number = {1}, +author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, +E. W. and Weaver, D. C. and Meiler, J.}, +title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from +the {P}ub{C}hem {D}atabase}}, +journal = {Chemical Science}}""", + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + + +if __name__ == "__main__": + get_and_transform_data() diff --git a/data/ld50_zhu/meta.yaml b/data/ld50_zhu/meta.yaml index 
ae4d7f70f..aa1d98bb2 100644 --- a/data/ld50_zhu/meta.yaml +++ b/data/ld50_zhu/meta.yaml @@ -1,18 +1,19 @@ --- name: ld50_zhu -description: Acute toxicity LD50 measures the most conservative dose that can lead to lethal adverse effects. The higher the dose, the more lethal of a - drug. +description: |- + Acute toxicity LD50 measures + the most conservative dose that can lead to lethal adverse effects. + The higher the dose, the more lethal of a drug. targets: - id: acute_toxicity description: Acute Toxicity LD50. - units: ld50 + units: log(1/(mol/kg)) type: continuous names: - - Acute Toxicity LD50 - - ld50 - - conservative dose that can lead to lethal adverse effects. - - Rat Acute Toxicity by Oral Exposure - - Toxicity + - acute toxicity rat LD50 + - rat ld50 + uri: + - http://www.bioassayontology.org/bao#BAO_0002117 identifiers: - id: SMILES type: SMILES @@ -28,19 +29,25 @@ license: CC BY 4.0 links: - url: https://doi.org/10.1021/tx900189p description: corresponding publication +benchmarks: + - name: TDC + link: https://tdcommons.ai/ + split_column: split num_points: 7385 url: https://tdcommons.ai/single_pred_tasks/tox/#acute-toxicity-ld50 bibtex: - |- @article{Zhu2009, - doi = {10.1021/tx900189p}, - url = {https://doi.org/10.1021/tx900189p}, - year = {2009}, - month = oct, - publisher = {American Chemical Society ({ACS})}, - volume = {22}, - number = {12}, - pages = {1913--1921}, - author = {Hao Zhu and Todd M. Martin and Lin Ye and Alexander Sedykh and Douglas M. Young and Alexander Tropsha}, - title = {Quantitative Structure-Activity Relationship Modeling of Rat Acute Toxicity by Oral Exposure}, - journal = {Chemical Research in Toxicology}} + doi = {10.1021/tx900189p}, + url = {https://doi.org/10.1021/tx900189p}, + year = {2009}, + month = oct, + publisher = {American Chemical Society ({ACS})}, + volume = {22}, + number = {12}, + pages = {1913--1921}, + author = {Hao Zhu and Todd M. Martin and Lin Ye and Alexander + Sedykh and Douglas M. 
Young and Alexander Tropsha}, + title = {Quantitative Structure-Activity Relationship Modeling + of Rat Acute Toxicity by Oral Exposure}, + journal = {Chemical Research in Toxicology}} diff --git a/data/ld50_zhu/transform.py b/data/ld50_zhu/transform.py index 99f79e8e2..d9f3e23c2 100644 --- a/data/ld50_zhu/transform.py +++ b/data/ld50_zhu/transform.py @@ -5,15 +5,15 @@ def get_and_transform_data(): # get raw data - data = Tox(name="LD50_Zhu") - fn_data_original = "data_original.csv" - data.get_data().to_csv(fn_data_original, index=False) + splits = Tox(name="LD50_Zhu").get_split() + df_train = splits["train"] + df_valid = splits["valid"] + df_test = splits["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" - # create dataframe - df = pd.read_csv( - fn_data_original, - delimiter=",", - ) # not necessary but ensure we can load the saved data + df = pd.concat([df_train, df_valid, df_test], axis=0) # check if fields are the same fields_orig = df.columns.tolist() @@ -21,6 +21,7 @@ def get_and_transform_data(): "Drug_ID", "Drug", "Y", + "split", ] # overwrite column names = fields @@ -28,6 +29,7 @@ def get_and_transform_data(): "compound_name", "SMILES", "acute_toxicity", + "split", ] df.columns = fields_clean @@ -45,20 +47,20 @@ def get_and_transform_data(): # create meta yaml meta = { "name": "ld50_zhu", # unique identifier, we will also use this for directory names - "description": """Acute toxicity LD50 measures the most conservative dose that can lead to lethal adverse effects. The higher the dose, the more lethal of a drug.""", + "description": """Acute toxicity LD50 measures +the most conservative dose that can lead to lethal adverse effects. 
+The higher the dose, the more lethal of a drug.""", "targets": [ { "id": "acute_toxicity", # name of the column in a tabular dataset "description": "Acute Toxicity LD50.", # description of what this column means - "units": "ld50", # units of the values in this column (leave empty if unitless) + "units": "log(1/(mol/kg))", # units of the values in this column (leave empty if unitless) "type": "continuous", # can be "categorical", "ordinal", "continuous" - "names": [ # names for the property (to sample from for building the prompts) - "Acute Toxicity LD50", - "ld50", - "conservative dose that can lead to lethal adverse effects.", - "Rat Acute Toxicity by Oral Exposure", - "Toxicity", + "names": [ + "acute toxicity rat LD50", + "rat ld50", ], + "uri": ["http://www.bioassayontology.org/bao#BAO_0002117"], }, ], "identifiers": [ @@ -69,7 +71,7 @@ def get_and_transform_data(): }, { "id": "compound_name", - "type": "Synonyms", + "type": "Other", "description": "compound name", "names": [ "compound", @@ -85,27 +87,37 @@ def get_and_transform_data(): "description": "corresponding publication", }, ], + "benchmarks": [ + { + "name": "TDC", + "link": "https://tdcommons.ai/", + "split_column": "split", + } + ], "num_points": len(df), # number of datapoints in this dataset "url": "https://tdcommons.ai/single_pred_tasks/tox/#acute-toxicity-ld50", "bibtex": [ """@article{Zhu2009, - doi = {10.1021/tx900189p}, - url = {https://doi.org/10.1021/tx900189p}, - year = {2009}, - month = oct, - publisher = {American Chemical Society ({ACS})}, - volume = {22}, - number = {12}, - pages = {1913--1921}, - author = {Hao Zhu and Todd M. Martin and Lin Ye and Alexander Sedykh and Douglas M. 
Young and Alexander Tropsha}, - title = {Quantitative Structure-Activity Relationship Modeling of Rat Acute Toxicity by Oral Exposure}, - journal = {Chemical Research in Toxicology}}""", +doi = {10.1021/tx900189p}, +url = {https://doi.org/10.1021/tx900189p}, +year = {2009}, +month = oct, +publisher = {American Chemical Society ({ACS})}, +volume = {22}, +number = {12}, +pages = {1913--1921}, +author = {Hao Zhu and Todd M. Martin and Lin Ye and Alexander +Sedykh and Douglas M. Young and Alexander Tropsha}, +title = {Quantitative Structure-Activity Relationship Modeling +of Rat Acute Toxicity by Oral Exposure}, +journal = {Chemical Research in Toxicology}}""", ], } def str_presenter(dumper, data): """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + Ref: + https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data """ if data.count("\n") > 0: # check for multiline string return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") diff --git a/data/lipophilicity/example_processing_and_templates.ipynb b/data/lipophilicity/example_processing_and_templates.ipynb deleted file mode 100644 index 85f716ac9..000000000 --- a/data/lipophilicity/example_processing_and_templates.ipynb +++ /dev/null @@ -1,1152 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "27c08f91-3fa0-4800-8f6a-96a96f665cad", - "metadata": {}, - "source": [ - "# Lipophilicity dataset preparation" - ] - }, - { - "cell_type": "markdown", - "id": "6ef172b9-aad2-47da-bf4c-844a2a07ee8c", - "metadata": {}, - "source": [ - "Original data repository: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv" - ] - }, - { - "cell_type": "markdown", - "id": "7d18c95d-2ec6-45e1-addc-54a890097b8e", - "metadata": {}, - "source": [ - "# Imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": 
"cf59e3e9-8061-4022-9eae-e978311b4155", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import requests\n", - "import pandas as pd\n", - "import yaml" - ] - }, - { - "cell_type": "markdown", - "id": "a6751ff9-2e3e-4d01-8395-7a5ae0c200d7", - "metadata": {}, - "source": [ - "# Data processing" - ] - }, - { - "cell_type": "markdown", - "id": "a1169ad2-e4bb-41c6-9625-6d1644c44a5b", - "metadata": {}, - "source": [ - "## Download data" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "775f60af-1d44-49da-b082-50c5ad77e649", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "data_path = \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "7bb8eb5e-f513-40d2-a68c-7cda1a51ad31", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fn_data_original = \"data_original.csv\"" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "413f72f2-9f5e-4ba8-919d-7ff3908f745a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "data = requests.get(data_path)\n", - "with open(fn_data_original, \"wb\") as f:\n", - " f.write(data.content)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "43873fc3-20a8-487d-a7c5-33bd58414159", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!ls -lh" - ] - }, - { - "cell_type": "markdown", - "id": "d9cda29a-a133-4f0e-992b-e77c9070ee93", - "metadata": {}, - "source": [ - "## Load original data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "77f614e7-b133-40bc-8759-2d930e4c120e", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CMPD_CHEMBLID,exp,smiles\n", - "CHEMBL596271,3.54,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14\n", - "CHEMBL1951080,-1.18,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)CCc3ccccc23\n", - 
"CHEMBL1771,3.69,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl\n", - "CHEMBL234951,3.37,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(Cl)sc4[nH]3\n" - ] - } - ], - "source": [ - "!head -n 5 {fn_data_original}" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "8f5a0387-f9e3-4e1a-8d14-5df618195f70", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df = pd.read_csv(fn_data_original, delimiter=\",\")" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "id": "28096592-ed31-4fa1-b400-3a48e6a84ec1", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['CMPD_CHEMBLID', 'exp', 'SMILES']" - ] - }, - "execution_count": 80, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.columns.tolist()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "61efe8bb-7505-4d63-936f-94e3deecf29a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "assert df.columns.tolist() == [\"CMPD_CHEMBLID\", \"exp\", \"smiles\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "55b0bd63-62a0-469e-9d8a-e9ada3fe01c4", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CMPD_CHEMBLIDexpsmiles
0CHEMBL5962713.54Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
1CHEMBL1951080-1.18COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
2CHEMBL17713.69COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
3CHEMBL2349513.37OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...
4CHEMBL5650793.10Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...
\n", - "
" - ], - "text/plain": [ - " CMPD_CHEMBLID exp smiles\n", - "0 CHEMBL596271 3.54 Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14\n", - "1 CHEMBL1951080 -1.18 COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...\n", - "2 CHEMBL1771 3.69 COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl\n", - "3 CHEMBL234951 3.37 OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...\n", - "4 CHEMBL565079 3.10 Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N..." - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "95158ac1-05d7-4a21-b8e4-7f720306d331", - "metadata": {}, - "source": [ - "## Add column = field names\n", - "Clean column names (`fields_clean`) and keep original names (`fields_orig`)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "ec2458e5-455f-4f03-8ce9-c0d12e9ed371", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['CMPD_CHEMBLID', 'exp', 'smiles']" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fields_orig = df.columns.tolist()\n", - "fields_orig" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "a46dd8ff-37b3-4894-8226-3bf98226dd09", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fields_clean = [\n", - " \"CMPD_CHEMBLID\",\n", - " \"exp\",\n", - " \"SMILES\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "785d37cb-1fb4-4a91-a923-d5a78a37f36a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df.columns = fields_clean" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "1bf212cb-1653-457b-9f5d-416d4dd14b53", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CMPD_CHEMBLIDexpSMILES
0CHEMBL5962713.54Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
1CHEMBL1951080-1.18COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
2CHEMBL17713.69COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
3CHEMBL2349513.37OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...
4CHEMBL5650793.10Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...
\n", - "
" - ], - "text/plain": [ - " CMPD_CHEMBLID exp SMILES\n", - "0 CHEMBL596271 3.54 Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14\n", - "1 CHEMBL1951080 -1.18 COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...\n", - "2 CHEMBL1771 3.69 COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl\n", - "3 CHEMBL234951 3.37 OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...\n", - "4 CHEMBL565079 3.10 Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N..." - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "1bfaeb22-26fb-4964-a71f-cae8335e5372", - "metadata": {}, - "source": [ - "## Data cleaning" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "100d1357-229f-45b9-8063-f89aeb4c5f08", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.duplicated().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "5e93ec35-9d04-49e4-ae6d-dcd5605cae23", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "assert not df.duplicated().sum()" - ] - }, - { - "cell_type": "markdown", - "id": "bc6f52c1-e0f6-48b3-95f4-e36d9a5ecde8", - "metadata": {}, - "source": [ - "## Save to csv" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "d6d5efa5-b4b4-4a25-8626-e10f3d691e83", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fn_data_csv = \"data_clean.csv\"" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "727f8d7b-cbb6-43c7-9eab-9d4d65be6b3f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df.to_csv(fn_data_csv, index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "63c8d4a4-906e-418d-be39-879365b4dfa0", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!ls -lh {fn_data_csv}" - ] - 
}, - { - "cell_type": "code", - "execution_count": 22, - "id": "a51b9001-25d7-4e0e-a607-477cfc4a9f1c", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CMPD_CHEMBLID,exp,SMILES\n", - "CHEMBL596271,3.54,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14\n", - "CHEMBL1951080,-1.18,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)CCc3ccccc23\n", - "CHEMBL1771,3.69,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl\n", - "CHEMBL234951,3.37,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(Cl)sc4[nH]3\n" - ] - } - ], - "source": [ - "!head -n 5 {fn_data_csv}" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "1a512943-4909-4d56-867d-50c151d8d607", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CMPD_CHEMBLIDexpSMILES
0CHEMBL5962713.54Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
1CHEMBL1951080-1.18COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
2CHEMBL17713.69COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
3CHEMBL2349513.37OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...
4CHEMBL5650793.10Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...
\n", - "
" - ], - "text/plain": [ - " CMPD_CHEMBLID exp SMILES\n", - "0 CHEMBL596271 3.54 Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14\n", - "1 CHEMBL1951080 -1.18 COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...\n", - "2 CHEMBL1771 3.69 COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl\n", - "3 CHEMBL234951 3.37 OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...\n", - "4 CHEMBL565079 3.10 Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N..." - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "f3d730ce-fce0-49df-9eb8-b917e945fa9a", - "metadata": {}, - "source": [ - "## Load from csv" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "077b0c5f-8772-4879-9317-3fa28799689b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fn_data_csv = \"data_clean.csv\"" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "6eaef0e6-2115-4793-ac43-a196b25d47a0", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df = pd.read_csv(fn_data_csv)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "43619e7c-9c82-4ff0-ae25-403861304635", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CMPD_CHEMBLIDexpSMILES
0CHEMBL5962713.54Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
1CHEMBL1951080-1.18COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
2CHEMBL17713.69COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
3CHEMBL2349513.37OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...
4CHEMBL5650793.10Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...
\n", - "
" - ], - "text/plain": [ - " CMPD_CHEMBLID exp SMILES\n", - "0 CHEMBL596271 3.54 Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14\n", - "1 CHEMBL1951080 -1.18 COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...\n", - "2 CHEMBL1771 3.69 COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl\n", - "3 CHEMBL234951 3.37 OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...\n", - "4 CHEMBL565079 3.10 Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N..." - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "2f06e57c-02c5-493b-af65-c8bb9ac59421", - "metadata": {}, - "source": [ - "# meta YAML" - ] - }, - { - "cell_type": "code", - "execution_count": 93, - "id": "d3890961-444e-4a26-b8fc-ed8c4e959af9", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "meta = {\n", - " \"name\": \"lipophilicity\", # unique identifier, we will also use this for directory names\n", - " \"description\": \"Experimental results of octanol/water distribution coefficient (logD at pH 7.4).\",\n", - " \"targets\": [\n", - " {\n", - " \"id\": \"exp\", # name of the column in a tabular dataset\n", - " \"description\": \"experimental results of octanol/water distribution coefficient (logD at pH 7.4)\",\n", - " \"units\": \"logD\", # units of the values in this column (leave empty if unitless)\n", - " \"type\": \"continuous\", # can be \"categorical\", \"ordinal\", \"continuous\"\n", - " \"names\": [ # names for the property (to sample from for building the prompts)\n", - " \"octanol/water distribution coefficient (logD at pH 7.4)\",\n", - " \"octanol/water distribution coefficient\",\n", - " ],\n", - " },\n", - " ],\n", - " \"identifiers\": [\n", - " {\n", - " \"id\": \"SMILES\", # column name\n", - " \"type\": \"SMILES\", # can be \"SMILES\", \"SELFIES\", \"IUPAC\", \"OTHER\"\n", - " \"description\": \"SMILES\", # description (optional, except for \"OTHER\")\n", - " },\n", - " ],\n", - " \"license\": 
\"CC BY-SA 3.0\", # license under which the original dataset was published\n", - " \"links\": [ # list of relevant links (original dataset, other uses, etc.)\n", - " {\n", - " \"url\": \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv\",\n", - " \"description\": \"original dataset link\",\n", - " },\n", - " {\n", - " \"url\": \"https://github.com/cheminfo/molecule-features/blob/main/data/lipophilicity/meta.yaml\",\n", - " \"description\": \"original meta data\",\n", - " },\n", - " {\n", - " \"url\": \"https://deepchem.readthedocs.io/en/latest/api_reference/moleculenet.html#lipo-datasets\",\n", - " \"description\": \"original dataset link from moleculenet\",\n", - " },\n", - " {\n", - " \"url\": \"https://www.ebi.ac.uk/chembl/document_report_card/CHEMBL3301361/\",\n", - " \"description\": \"original report card\",\n", - " },\n", - " {\n", - " \"url\": \"https://chembl.gitbook.io/chembl-interface-documentation/about#data-licensing\",\n", - " \"description\": \"original dataset license from chembl\",\n", - " },\n", - " {\n", - " \"url\": \"https://creativecommons.org/licenses/by-sa/3.0/\",\n", - " \"description\": \"used dataset license\",\n", - " },\n", - " ],\n", - " \"num_points\": len(df), # number of datapoints in this dataset\n", - " \"bibtex\": [\n", - " \"\"\"@techreport{hersey2015chembl,\n", - " title={ChEMBL Deposited Data Set-AZ dataset},\n", - " author={Hersey, Anne},\n", - " year={2015},\n", - " institution={Technical Report, Technical report, EMBL-EBI, 2015. https://www. ebi. ac. 
uk\n", - " }}\"\"\"\n", - " ],\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 94, - "id": "580bbd79-4845-4515-be94-3e4a9815d048", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fn_meta = \"meta.yaml\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "646f2f57-1461-4473-a9ef-0c53e5b78dfe", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def str_presenter(dumper, data):\n", - " \"\"\"configures yaml for dumping multiline strings\n", - " Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data\n", - " \"\"\"\n", - " if data.count(\"\\n\") > 0: # check for multiline string\n", - " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data, style=\"|\")\n", - " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data)\n", - "\n", - "\n", - "yaml.add_representer(str, str_presenter)\n", - "yaml.representer.SafeRepresenter.add_representer(\n", - " str, str_presenter\n", - ") # to use with safe_dum" - ] - }, - { - "cell_type": "code", - "execution_count": 95, - "id": "873fa5dd-9b60-40f5-b537-4d7a206414ea", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "with open(fn_meta, \"w\") as f:\n", - " yaml.dump(meta, f, sort_keys=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d01686c0-6746-4fc4-b019-350270dfc26f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!ls -lh {fn_meta}" - ] - }, - { - "cell_type": "code", - "execution_count": 96, - "id": "ef6063c5-7a8b-4344-bccf-a073443feebf", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "name: lipophilicity\n", - "description: Experimental results of octanol/water distribution coefficient (logD\n", - " at pH 7.4).\n", - "targets:\n", - "- id: exp\n", - " description: experimental results of octanol/water distribution coefficient (logD\n", - " at pH 7.4)\n", 
- " units: logD\n", - " type: continuous\n", - " names:\n", - " - octanol/water distribution coefficient (logD at pH 7.4)\n", - " - octanol/water distribution coefficient\n", - "identifiers:\n", - "- id: SMILES\n", - " type: SMILES\n", - " description: SMILES\n", - "license: CC BY-SA 3.0\n", - "links:\n", - "- url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv\n", - " description: original dataset\n", - "- url: https://github.com/cheminfo/molecule-features/blob/main/data/lipophilicity/meta.yaml\n", - " description: original meta data\n", - "- url: https://deepchem.readthedocs.io/en/latest/api_reference/moleculenet.html#lipo-datasets\n", - " description: original dataset link from moleculenet\n", - "- url: https://www.ebi.ac.uk/chembl/document_report_card/CHEMBL3301361/\n", - " description: original report card\n", - "- url: https://chembl.gitbook.io/chembl-interface-documentation/about#data-licensing\n", - " description: original dataset license from chembl\n", - "- url: https://creativecommons.org/licenses/by-sa/3.0/\n", - " description: used dataset license\n", - "num_points: 4200\n", - "url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv\n", - "bibtex:\n", - "- \"@techreport{hersey2015chembl,\\n title={ChEMBL Deposited Data Set-AZ dataset},\\n\\\n", - " \\ author={Hersey, Anne},\\n year={2015},\\n institution={Technical\\\n", - " \\ Report, Technical report, EMBL-EBI, 2015. https://www. ebi. ac. 
uk~\\u2026}}\"\n" - ] - } - ], - "source": [ - "!cat {fn_meta}" - ] - }, - { - "cell_type": "markdown", - "id": "bd3f930a-638b-4bb7-a1d2-80688f2f6891", - "metadata": {}, - "source": [ - "# create transform.py" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "9aab00fd-58a8-40b0-be30-1e269e0d323b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "path_file = \"transform.py\"" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8368bb20-8e1c-4b7d-b0e2-b39da36b5972", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Overwriting transform.py\n" - ] - } - ], - "source": [ - "%%writefile $path_file\n", - "import pandas as pd\n", - "import requests\n", - "import yaml\n", - "\n", - "\n", - "def get_and_transform_data():\n", - " # get raw data\n", - " data_path = (\n", - " \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv\"\n", - " )\n", - " fn_data_original = \"data_original.txt\"\n", - " data = requests.get(data_path)\n", - " with open(fn_data_original, \"wb\") as f:\n", - " f.write(data.content)\n", - "\n", - " # create dataframe\n", - " df = pd.read_csv(fn_data_original, delimiter=\",\")\n", - "\n", - " # check if fields are the same\n", - " assert df.columns.tolist() == [\"CMPD_CHEMBLID\", \"exp\", \"smiles\"]\n", - "\n", - " # check if no duplicated\n", - " assert not df.duplicated().sum()\n", - "\n", - " # overwrite column names = fields\n", - " fields_clean = [\n", - " \"CMPD_CHEMBLID\",\n", - " \"exp\",\n", - " \"SMILES\",\n", - " ]\n", - " df.columns = fields_clean\n", - "\n", - " # save to csv\n", - " fn_data_csv = \"data_clean.csv\"\n", - " df.to_csv(fn_data_csv, index=False)\n", - "\n", - " # create meta yaml\n", - " meta = {\n", - " \"name\": \"lipophilicity\", # unique identifier, we will also use this for directory names\n", - " \"description\": \"Experimental results of octanol/water distribution coefficient 
(logD at pH 7.4).\",\n", - " \"targets\": [\n", - " {\n", - " \"id\": \"exp\", # name of the column in a tabular dataset\n", - " \"description\": \"experimental results of octanol/water distribution coefficient (logD at pH 7.4)\",\n", - " \"units\": \"logD\", # units of the values in this column (leave empty if unitless)\n", - " \"type\": \"continuous\", # can be \"categorical\", \"ordinal\", \"continuous\"\n", - " \"names\": [ # names for the property (to sample from for building the prompts)\n", - " \"octanol/water distribution coefficient (logD at pH 7.4)\",\n", - " \"octanol/water distribution coefficient\",\n", - " ],\n", - " },\n", - " ],\n", - " \"identifiers\": [\n", - " {\n", - " \"id\": \"SMILES\", # column name\n", - " \"type\": \"SMILES\", # can be \"SMILES\", \"SELFIES\", \"IUPAC\", \"OTHER\"\n", - " \"description\": \"SMILES\", # description (optional, except for \"OTHER\")\n", - " },\n", - " ],\n", - " \"license\": \"CC BY-SA 3.0\", # license under which the original dataset was published\n", - " \"links\": [ # list of relevant links (original dataset, other uses, etc.)\n", - " {\n", - " \"url\": \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv\",\n", - " \"description\": \"original dataset link\",\n", - " },\n", - " {\n", - " \"url\": \"https://github.com/cheminfo/molecule-features/blob/main/data/lipophilicity/meta.yaml\",\n", - " \"description\": \"original meta data\",\n", - " },\n", - " {\n", - " \"url\": \"https://deepchem.readthedocs.io/en/latest/api_reference/moleculenet.html#lipo-datasets\",\n", - " \"description\": \"original dataset link from moleculenet\",\n", - " },\n", - " {\n", - " \"url\": \"https://www.ebi.ac.uk/chembl/document_report_card/CHEMBL3301361/\",\n", - " \"description\": \"original report card\",\n", - " },\n", - " {\n", - " \"url\": \"https://chembl.gitbook.io/chembl-interface-documentation/about#data-licensing\",\n", - " \"description\": \"original dataset license from chembl\",\n", - " },\n", 
- " {\n", - " \"url\": \"https://creativecommons.org/licenses/by-sa/3.0/\",\n", - " \"description\": \"used dataset license\",\n", - " },\n", - " ],\n", - " \"num_points\": len(df), # number of datapoints in this dataset\n", - " \"bibtex\": [\n", - " \"\"\"@techreport{hersey2015chembl,\n", - " title={ChEMBL Deposited Data Set-AZ dataset},\n", - " author={Hersey, Anne},\n", - " year={2015},\n", - " institution={Technical Report, Technical report, EMBL-EBI, 2015. https://www. ebi. ac. uk}\n", - " }\"\"\"\n", - " ],\n", - " }\n", - "\n", - " def str_presenter(dumper, data):\n", - " \"\"\"configures yaml for dumping multiline strings\n", - " Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data\n", - " \"\"\"\n", - " if data.count(\"\\n\") > 0: # check for multiline string\n", - " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data, style=\"|\")\n", - " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data)\n", - "\n", - " yaml.add_representer(str, str_presenter)\n", - " yaml.representer.SafeRepresenter.add_representer(\n", - " str, str_presenter\n", - " ) # to use with safe_dum\n", - " fn_meta = \"meta.yaml\"\n", - " with open(fn_meta, \"w\") as f:\n", - " yaml.dump(meta, f, sort_keys=False)\n", - "\n", - " print(f\"Finished processing {meta['name']} dataset!\")\n", - "\n", - "\n", - "if __name__ == \"__main__\":\n", - " get_and_transform_data()" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "id": "d0474f26-70f3-4655-b81a-df4ada90e7a6", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Finished processing lipophilicity dataset!\n" - ] - } - ], - "source": [ - "!python3 transform.py" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "953e7bee-bd5e-41d0-a2be-506e0bc97727", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "ls -lh # fmt: skip" - ] - }, - { - "cell_type": 
"markdown", - "id": "0b08ed06-ba66-4f76-bde1-368ea77d1739", - "metadata": {}, - "source": [ - "# End" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "hfdataset", - "language": "python", - "name": "hfdataset" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/data/lipophilicity/meta.yaml b/data/lipophilicity/meta.yaml index bae4ab96a..191994278 100644 --- a/data/lipophilicity/meta.yaml +++ b/data/lipophilicity/meta.yaml @@ -4,11 +4,14 @@ description: Experimental results of octanol/water distribution coefficient (log targets: - id: exp description: experimental results of octanol/water distribution coefficient (logD at pH 7.4) - units: logD + units: type: continuous names: - octanol/water distribution coefficient (logD at pH 7.4) - octanol/water distribution coefficient + uris: + - http://www.bioassayontology.org/bao#BAO_0002129 + - http://purl.obolibrary.org/obo/MI_2107 identifiers: - id: SMILES type: SMILES @@ -31,8 +34,8 @@ num_points: 4200 bibtex: - |- @techreport{hersey2015chembl, - title={ChEMBL Deposited Data Set-AZ dataset}, - author={Hersey, Anne}, - year={2015}, - institution={Technical Report, Technical report, EMBL-EBI, 2015. https://www. ebi. ac. uk} - } + title={ChEMBL Deposited Data Set-AZ dataset}, + author={Hersey, Anne}, + year={2015}, + institution={Technical Report, Technical report, EMBL-EBI, 2015. https://www. ebi. ac. 
uk} + } diff --git a/data/lipophilicity/transform.py b/data/lipophilicity/transform.py index ef88cb159..7a1b63f79 100644 --- a/data/lipophilicity/transform.py +++ b/data/lipophilicity/transform.py @@ -42,12 +42,16 @@ def get_and_transform_data(): { "id": "exp", # name of the column in a tabular dataset "description": "experimental results of octanol/water distribution coefficient (logD at pH 7.4)", - "units": "logD", # units of the values in this column (leave empty if unitless) + "units": None, "type": "continuous", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) "octanol/water distribution coefficient (logD at pH 7.4)", "octanol/water distribution coefficient", ], + "uris": [ + "http://www.bioassayontology.org/bao#BAO_0002129", + "http://purl.obolibrary.org/obo/MI_2107", + ], }, ], "identifiers": [ @@ -87,11 +91,11 @@ def get_and_transform_data(): "num_points": len(df), # number of datapoints in this dataset "bibtex": [ """@techreport{hersey2015chembl, - title={ChEMBL Deposited Data Set-AZ dataset}, - author={Hersey, Anne}, - year={2015}, - institution={Technical Report, Technical report, EMBL-EBI, 2015. https://www. ebi. ac. uk} - }""" +title={ChEMBL Deposited Data Set-AZ dataset}, +author={Hersey, Anne}, +year={2015}, +institution={Technical Report, Technical report, EMBL-EBI, 2015. https://www. ebi. ac. uk} +}""", ], } diff --git a/data/m1_muscarinic_receptor_agonists_butkiewicz/meta.yaml b/data/m1_muscarinic_receptor_agonists_butkiewicz/meta.yaml new file mode 100644 index 000000000..c0fd5f7bf --- /dev/null +++ b/data/m1_muscarinic_receptor_agonists_butkiewicz/meta.yaml @@ -0,0 +1,84 @@ +--- +name: m1_muscarinic_receptor_agonists_butkiewicz +description: |- + Positive allosteric modulation of the M1 Muscarinic + receptor screened with AID626. Confirmed by screen AID 1488. A second + counter screen AID 1741. 
The final set of selective positive + allosteric modulators of M1 was obtained by removing compounds active + in AID 1741 from the compounds active in AID 1488 resulting in 188 + compounds. +targets: + - id: m1_muscarinic_agonist + description: whether it agonist on m1 muscarinic receptor (1) or not (0). + units: + type: boolean + names: + - a positive allosteric modulator of m1 muscarinic activity + - displaying positive allosteric modulation of the m1 muscarinic receptor + pubchem_aids: + - 626 + - 1488 + - 1741 + uris: [] +identifiers: + - id: SMILES + type: SMILES + description: SMILES +license: CC BY 4.0 +links: + - url: https://doi.org/10.3390/molecules18010735 + description: corresponding publication + - url: https://doi.org/10.1093/nar/gky1033 + description: corresponding publication + - url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/ + description: corresponding publication +benchmarks: + - name: TDC + link: https://tdcommons.ai/ + split_column: split +num_points: 61833 +url: https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al +bibtex: + - |- + @article{Butkiewicz2013, + doi = {10.3390/molecules18010735}, + url = {https://doi.org/10.3390/molecules18010735}, + year = {2013}, + month = jan, + publisher = {{MDPI} {AG}}, + volume = {18}, + number = {1}, + pages = {735--756}, + author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller + and Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens Meiler}, + title = {Benchmarking Ligand-Based Virtual High-Throughput Screening + with the {PubChem} Database}, + journal = {Molecules}} + - |- + @article{Kim2018, + doi = {10.1093/nar/gky1033}, + url = {https://doi.org/10.1093/nar/gky1033}, + year = {2018}, + month = oct, + publisher = {Oxford University Press ({OUP})}, + volume = {47}, + number = {D1}, + pages = {D1102--D1109}, + author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte + and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker + and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, + title = {{PubChem} 2019 update: improved access to chemical data}, + journal = {Nucleic Acids Research}} + - |- + @article{Butkiewicz2017, + doi = {}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, + year = {2017}, + publisher = {Chem Inform}, + volume = {3}, + number = {1}, + author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. + and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, + title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets + from the {P}ub{C}hem {D}atabase}, + journal = {Chemical Science}} diff --git a/data/m1_muscarinic_receptor_agonists_butkiewicz/transform.py b/data/m1_muscarinic_receptor_agonists_butkiewicz/transform.py new file mode 100644 index 000000000..5885ecd24 --- /dev/null +++ b/data/m1_muscarinic_receptor_agonists_butkiewicz/transform.py @@ -0,0 +1,161 @@ +import pandas as pd +import yaml +from tdc.single_pred import HTS + + +def get_and_transform_data(): + # get raw data + label = "m1_muscarinic_receptor_agonists_butkiewicz" + splits = HTS(name=label).get_split() + df_train = splits["train"] + df_valid = splits["valid"] + df_test = splits["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" + + df = pd.concat([df_train, df_valid, df_test], axis=0) + + # check if fields are the same + fields_orig = df.columns.tolist() + assert fields_orig == [ + "Drug_ID", + "Drug", + "Y", + "split", + ] + + # overwrite column names = fields + fields_clean = [ + "compound_id", + "SMILES", + "m1_muscarinic_agonist", + "split", + ] + df.columns = fields_clean + + assert not df.duplicated().sum() + + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + # create meta yaml + meta = { + "name": "m1_muscarinic_receptor_agonists_butkiewicz", + "description": """Positive allosteric modulation of the M1 Muscarinic +receptor screened with AID626. Confirmed by screen AID 1488. A second +counter screen AID 1741. 
The final set of selective positive +allosteric modulators of M1 was obtained by removing compounds active +in AID 1741 from the compounds active in AID 1488 resulting in 188 +compounds.""", + "targets": [ + { + "id": "m1_muscarinic_agonist", + "description": "whether it agonist on m1 muscarinic receptor (1) or not (0).", + "units": None, + "type": "boolean", + "names": [ + "a positive allosteric modulator of m1 muscarinic activity", + "displaying positive allosteric modulation of the m1 muscarinic receptor", + ], + "pubchem_aids": [626, 1488, 1741], + "uris": [], + }, + ], + "identifiers": [ + { + "id": "SMILES", # column name + "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "SMILES", # description (optional, except for "Other") + }, + ], + "license": "CC BY 4.0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://doi.org/10.3390/molecules18010735", + "description": "corresponding publication", + }, + { + "url": "https://doi.org/10.1093/nar/gky1033", + "description": "corresponding publication", + }, + { + "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/", + "description": "corresponding publication", + }, + ], + "benchmarks": [ + { + "name": "TDC", + "link": "https://tdcommons.ai/", + "split_column": "split", + }, + ], + "num_points": len(df), # number of datapoints in this dataset + "url": "https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al", + "bibtex": [ + """@article{Butkiewicz2013, +doi = {10.3390/molecules18010735}, +url = {https://doi.org/10.3390/molecules18010735}, +year = {2013}, +month = jan, +publisher = {{MDPI} {AG}}, +volume = {18}, +number = {1}, +pages = {735--756}, +author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller +and Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens Meiler}, +title = {Benchmarking Ligand-Based Virtual High-Throughput Screening +with the {PubChem} Database}, +journal = {Molecules}}""", + """@article{Kim2018, +doi = {10.1093/nar/gky1033}, +url = {https://doi.org/10.1093/nar/gky1033}, +year = {2018}, +month = oct, +publisher = {Oxford University Press ({OUP})}, +volume = {47}, +number = {D1}, +pages = {D1102--D1109}, +author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte +and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker +and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, +title = {{PubChem} 2019 update: improved access to chemical data}, +journal = {Nucleic Acids Research}}""", + """@article{Butkiewicz2017, +doi = {}, +url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, +year = {2017}, +publisher = {Chem Inform}, +volume = {3}, +number = {1}, +author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. +and Lowe, E. W. and Weaver, D. C. and Meiler, J.}, +title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets +from the {P}ub{C}hem {D}atabase}}, +journal = {Chemical Science}}""", + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + + +if __name__ == "__main__": + get_and_transform_data() diff --git a/data/m1_muscarinic_receptor_antagonists_butkiewicz/meta.yaml 
b/data/m1_muscarinic_receptor_antagonists_butkiewicz/meta.yaml new file mode 100644 index 000000000..ca2caa4d5 --- /dev/null +++ b/data/m1_muscarinic_receptor_antagonists_butkiewicz/meta.yaml @@ -0,0 +1,86 @@ +--- +name: m1_muscarinic_receptor_antagonists_butkiewicz +description: |- + Primary screen AID628 confirmed by screen AID677. + AID859 confirmed activity on rat M1 receptor. + The counter screen AID860 removed non-selective compounds + being active also at the rat M4 receptor. + Final set of active compounds obtained by subtracting active compounds of AID860 + from those in AID677, resulting in 448 total active compounds. +targets: + - id: m1_muscarinic_antagonists + description: whether it negatively modulates the m1 muscarinic receptor (1) or not (0). + units: + type: boolean + names: + - a negative modulator of M1 muscarinic receptors + - negatively modulating M1 muscarinic receptors + pubchem_aids: + - 628 + - 677 + - 860 + uris: [] +identifiers: + - id: SMILES + type: SMILES + description: SMILES +license: CC BY 4.0 +links: + - url: https://doi.org/10.3390/molecules18010735 + description: corresponding publication + - url: https://doi.org/10.1093/nar/gky1033 + description: corresponding publication + - url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/ + description: corresponding publication +benchmarks: + - name: TDC + link: https://tdcommons.ai/ + split_column: split +num_points: 61756 +url: https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al +bibtex: + - |- + @article{Butkiewicz2013, + doi = {10.3390/molecules18010735}, + url = {https://doi.org/10.3390/molecules18010735}, + year = {2013}, + month = jan, + publisher = {{MDPI} {AG}}, + volume = {18}, + number = {1}, + pages = {735--756}, + author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and + Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens + Meiler}, + title = {Benchmarking Ligand-Based Virtual High-Throughput + Screening with the {PubChem} Database}, + journal = {Molecules}} + - |- + @article{Kim2018, + doi = {10.1093/nar/gky1033}, + url = {https://doi.org/10.1093/nar/gky1033}, + year = {2018}, + month = oct, + publisher = {Oxford University Press ({OUP})}, + volume = {47}, + number = {D1}, + pages = {D1102--D1109}, + author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta + Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin + A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky + and Jian Zhang and Evan E Bolton}, + title = {{PubChem} 2019 update: improved access to chemical data}, + journal = {Nucleic Acids Research}} + - |- + @article{Butkiewicz2017, + doi = {}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, + year = {2017}, + publisher = {Chem Inform}, + volume = {3}, + number = {1}, + author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, + E. W. and Weaver, D. C. 
and Meiler, J.}, + title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from + the {P}ub{C}hem {D}atabase}}, + journal = {Chemical Science}} diff --git a/data/m1_muscarinic_receptor_antagonists_butkiewicz/transform.py b/data/m1_muscarinic_receptor_antagonists_butkiewicz/transform.py new file mode 100644 index 000000000..91c6a51e0 --- /dev/null +++ b/data/m1_muscarinic_receptor_antagonists_butkiewicz/transform.py @@ -0,0 +1,153 @@ +import pandas as pd +import yaml +from tdc.single_pred import HTS + + +def get_and_transform_data(): + # get raw data + label = "m1_muscarinic_receptor_aantagonists_butkiewicz" + splits = HTS(name=label).get_split() + df_train = splits["train"] + df_valid = splits["valid"] + df_test = splits["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" + + df = pd.concat([df_train, df_valid, df_test], axis=0) + + # check if fields are the same + fields_orig = df.columns.tolist() + assert fields_orig == ["Drug_ID", "Drug", "Y", "split"] + + # overwrite column names = fields + fields_clean = ["compound_id", "SMILES", "m1_muscarinic_antagonists", "split"] + df.columns = fields_clean + + assert not df.duplicated().sum() + + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + # create meta yaml + meta = { + "name": "m1_muscarinic_receptor_antagonists_butkiewicz", + "description": """Primary screen AID628 confirmed by screen AID677. +AID859 confirmed activity on rat M1 receptor. +The counter screen AID860 removed non-selective compounds +being active also at the rat M4 receptor. 
+Final set of active compoundsobtained by subtracting active compounds of AID860 +from those in AID677, resulting in 448 total active compounds.""", + "targets": [ + { + "id": "m1_muscarinic_antagonists", + "description": "whether it negatively modulates the m1 muscarinic receptor (1) or not (0).", + "units": None, + "type": "boolean", + "names": [ + "a negative modulator of M1 muscarinic receptors", + "negatively modulating M1 muscarinic receptors", + ], + "pubchem_aids": [628, 677, 860], + "uris": [], + }, + ], + "identifiers": [ + { + "id": "SMILES", # column name + "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "SMILES", # description (optional, except for "Other") + }, + ], + "license": "CC BY 4.0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://doi.org/10.3390/molecules18010735", + "description": "corresponding publication", + }, + { + "url": "https://doi.org/10.1093/nar/gky1033", + "description": "corresponding publication", + }, + { + "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/", + "description": "corresponding publication", + }, + ], + "benchmarks": [ + { + "name": "TDC", + "link": "https://tdcommons.ai/", + "split_column": "split", + } + ], + "num_points": len(df), # number of datapoints in this dataset + "url": "https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al", + "bibtex": [ + """@article{Butkiewicz2013, +doi = {10.3390/molecules18010735}, +url = {https://doi.org/10.3390/molecules18010735}, +year = {2013}, +month = jan, +publisher = {{MDPI} {AG}}, +volume = {18}, +number = {1}, +pages = {735--756}, +author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and +Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens +Meiler}, +title = {Benchmarking Ligand-Based Virtual High-Throughput +Screening with the {PubChem} Database}, +journal = {Molecules}}""", + """@article{Kim2018, +doi = {10.1093/nar/gky1033}, +url = {https://doi.org/10.1093/nar/gky1033}, +year = {2018}, +month = oct, +publisher = {Oxford University Press ({OUP})}, +volume = {47}, +number = {D1}, +pages = {D1102--D1109}, +author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta +Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin +A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky +and Jian Zhang and Evan E Bolton}, +title = {{PubChem} 2019 update: improved access to chemical data}, +journal = {Nucleic Acids Research}}""", + """@article{Butkiewicz2017, +doi = {}, +url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, +year = {2017}, +publisher = {Chem Inform}, +volume = {3}, +number = {1}, +author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, +E. W. and Weaver, D. C. and Meiler, J.}, +title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from +the {P}ub{C}hem {D}atabase}}, +journal = {Chemical Science}}""", + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + + +if __name__ == "__main__": + get_and_transform_data() diff --git a/data/orexin1_receptor_butkiewicz/meta.yaml 
b/data/orexin1_receptor_butkiewicz/meta.yaml new file mode 100644 index 000000000..9a2fa8169 --- /dev/null +++ b/data/orexin1_receptor_butkiewicz/meta.yaml @@ -0,0 +1,91 @@ +--- +name: orexin1_receptor_butkiewicz +description: |- + GPCR Orexin 1 is relevant for behavioral plasticity, + the sleep-wake cycle, and gastric acid secretion. Three primary screens, + AID 485270, AID 463079, AID 434989, were performed. Validation assay + AID504701, AID492963. Counter screen 493232. More specific assay + AID504699. AID504701 and AID504699 were combined to identify 234 active + compounds excluding an overlap of 155 molecules. +targets: + - id: activity_orexin1 + description: whether it is active against orexin1 receptor (1) or not (0). + units: + type: boolean + names: + - is an orexin 1 inhibitor + - is an orexin 1 receptor antagonist + - inhibits orexin 1 receptor + pubchem_aids: + - 485270 + - 463079 + - 434989 + - 504701 + - 493232 + - 504699 + uris: + - http://purl.bioontology.org/ontology/SNOMEDCT/838464006 +identifiers: + - id: SMILES + type: SMILES + description: SMILES +benchmarks: + - name: TDC + link: https://tdcommons.ai/ + split_column: split +license: CC BY 4.0 +links: + - url: https://doi.org/10.3390/molecules18010735 + description: corresponding publication + - url: https://doi.org/10.1093/nar/gky1033 + description: corresponding publication + - url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/ + description: corresponding publication +num_points: 218158 +url: https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al +bibtex: + - |- + @article{Butkiewicz2013, + doi = {10.3390/molecules18010735}, + url = {https://doi.org/10.3390/molecules18010735}, + year = {2013}, + month = jan, + publisher = {{MDPI} {AG}}, + volume = {18}, + number = {1}, + pages = {735--756}, + author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and + Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens + Meiler}, + title = {Benchmarking Ligand-Based Virtual High-Throughput + Screening with the {PubChem} Database}, + journal = {Molecules}} + - |- + @article{Kim2018, + doi = {10.1093/nar/gky1033}, + url = {https://doi.org/10.1093/nar/gky1033}, + year = {2018}, + month = oct, + publisher = {Oxford University Press ({OUP})}, + volume = {47}, + number = {D1}, + pages = {D1102--D1109}, + author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta + Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin + A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky + and Jian Zhang and Evan E Bolton}, + title = {{PubChem} 2019 update: improved access to chemical data}, + journal = {Nucleic Acids Research}} + - |- + @article{Butkiewicz2017, + doi = {}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, + year = {2017}, + publisher = {Chem Inform}, + volume = {3}, + number = {1}, + author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, + E. W. and Weaver, D. C. 
and Meiler, J.}, + title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from + the {P}ub{C}hem {D}atabase}}, + journal = {Chemical Science}} diff --git a/data/orexin1_receptor_butkiewicz/transform.py b/data/orexin1_receptor_butkiewicz/transform.py new file mode 100644 index 000000000..113e471ae --- /dev/null +++ b/data/orexin1_receptor_butkiewicz/transform.py @@ -0,0 +1,154 @@ +import pandas as pd +import yaml +from tdc.single_pred import HTS + + +def get_and_transform_data(): + # get raw data + label = "orexin1_receptor_butkiewicz" + splits = HTS(name=label).get_split() + df_train = splits["train"] + df_valid = splits["valid"] + df_test = splits["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" + + df = pd.concat([df_train, df_valid, df_test], axis=0) + + # check if fields are the same + fields_orig = df.columns.tolist() + assert fields_orig == ["Drug_ID", "Drug", "Y", "split"] + + # overwrite column names = fields + fields_clean = ["compound_id", "SMILES", "activity_orexin1", "split"] + df.columns = fields_clean + + assert not df.duplicated().sum() + + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + # create meta yaml + meta = { + "name": "orexin1_receptor_butkiewicz", + "description": """"GPCR Orexin 1 is relevant for behavioral plasticity, +the sleep-wake cycle, and gastric acid secretion.Three primary screens, +AID 485270, AID 463079, AID 434989, were performed. Validation assay +AID504701, AD492963. Counter screen 493232. More specific assay +AID504699. 
AID504701 and AID504699 were combined to identify 234 active +compounds excluding an overlap of 155 molecules.""", + "targets": [ + { + "id": "activity_orexin1", # name of the column in a tabular dataset + "description": "whether it is active against orexin1 receptor (1) or not (0).", + "units": None, + "type": "boolean", + "names": [ + "is a orexin 1 inhibitor", + "is a orexin 1 receptor antagonist", + "inhibits orexin 1 receptor", + ], + "pubchem_aids": [485270, 463079, 434989, 504701, 493232, 504699], + "uris": ["http://purl.bioontology.org/ontology/SNOMEDCT/838464006"], + }, + ], + "identifiers": [ + { + "id": "SMILES", # column name + "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "SMILES", # description (optional, except for "Other") + }, + ], + "benchmarks": [ + { + "name": "TDC", + "link": "https://tdcommons.ai/", + "split_column": "split", + } + ], + "license": "CC BY 4.0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://doi.org/10.3390/molecules18010735", + "description": "corresponding publication", + }, + { + "url": "https://doi.org/10.1093/nar/gky1033", + "description": "corresponding publication", + }, + { + "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/", + "description": "corresponding publication", + }, + ], + "num_points": len(df), # number of datapoints in this dataset + "url": "https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al", + "bibtex": [ + """@article{Butkiewicz2013, +doi = {10.3390/molecules18010735}, +url = {https://doi.org/10.3390/molecules18010735}, +year = {2013}, +month = jan, +publisher = {{MDPI} {AG}}, +volume = {18}, +number = {1}, +pages = {735--756}, +author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and +Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens +Meiler}, +title = {Benchmarking Ligand-Based Virtual High-Throughput +Screening with the {PubChem} Database}, +journal = {Molecules}}""", + """@article{Kim2018, +doi = {10.1093/nar/gky1033}, +url = {https://doi.org/10.1093/nar/gky1033}, +year = {2018}, +month = oct, +publisher = {Oxford University Press ({OUP})}, +volume = {47}, +number = {D1}, +pages = {D1102--D1109}, +author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta +Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin +A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky +and Jian Zhang and Evan E Bolton}, +title = {{PubChem} 2019 update: improved access to chemical data}, +journal = {Nucleic Acids Research}}""", + """@article{Butkiewicz2017, +doi = {}, +url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, +year = {2017}, +publisher = {Chem Inform}, +volume = {3}, +number = {1}, +author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, +E. W. and Weaver, D. C. and Meiler, J.}, +title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from +the {P}ub{C}hem {D}atabase}}, +journal = {Chemical Science}}""", + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + + +if __name__ == "__main__": + get_and_transform_data() diff --git a/data/pampa_ncats/meta.yaml b/data/pampa_ncats/meta.yaml index 
9c0369332..55571b8a2 100644 --- a/data/pampa_ncats/meta.yaml +++ b/data/pampa_ncats/meta.yaml @@ -2,20 +2,23 @@ name: pampa_ncats description: |- PAMPA (parallel artificial membrane permeability assay) is a commonly - employed assay to evaluate drug permeability across the cellular membrane. PAMPA is a - non-cell-based, low-cost and high-throughput alternative to cellular models. Although - PAMPA does not model active and efflux transporters, it still provides permeability values - that are useful for absorption prediction because the majority of drugs are absorbed by - passive diffusion through the membrane. + employed assay to evaluate drug permeability across the cellular membrane. + PAMPA is a non-cell-based, low-cost and high-throughput alternative to cellular models. + Although PAMPA does not model active and efflux transporters, it still provides permeability values + that are useful for absorption prediction because the majority of drugs are absorbed + by passive diffusion through the membrane. targets: - id: permeability description: Binary permeability in PAMPA assay. 
- units: Bool - type: categorical + units: + type: boolean names: - - binary permeability in PAMPA assay - - permeability in PAMPA assay - - PAMPA permeability + - is permeable in the PAMPA assay + - shows permeability in parallel artificial membrane permeability assay (PAMPA) assay + pubchem_aids: + - 1508612 + uris: + - http://purl.bioontology.org/ontology/MESH/D002463 identifiers: - id: SMILES type: SMILES @@ -26,17 +29,21 @@ links: description: original dataset link - url: https://journals.sagepub.com/doi/full/10.1177/24725552211017520 description: corresponding publication +benchmarks: + - name: TDC + link: https://tdcommons.ai/ + split_column: split num_points: 2034 bibtex: - |- @article{siramshetty2021validating, - title={Validating ADME QSAR Models Using Marketed Drugs}, - author={Siramshetty, Vishal and Williams, Jordan and Nguyen, DHac-Trung and Neyra, Jorge and Southall, - Noel and Math'e, Ewy and Xu, Xin and Shah, Pranav}, - journal={SLAS DISCOVERY: Advancing the Science of Drug Discovery}, - volume={26}, - number={10}, - pages={1326--1336}, - year={2021}, - publisher={SAGE Publications Sage CA: Los Angeles, CA} - } + title={Validating ADME QSAR Models Using Marketed Drugs}, + author={Siramshetty, Vishal and Williams, Jordan and Nguyen, DHac-Trung and Neyra, Jorge and Southall, + Noel and Math'e, Ewy and Xu, Xin and Shah, Pranav}, + journal={SLAS DISCOVERY: Advancing the Science of Drug Discovery}, + volume={26}, + number={10}, + pages={1326--1336}, + year={2021}, + publisher={SAGE Publications Sage CA: Los Angeles, CA} + } diff --git a/data/pampa_ncats/transform.py b/data/pampa_ncats/transform.py index 0fa7208c0..1ab35e124 100644 --- a/data/pampa_ncats/transform.py +++ b/data/pampa_ncats/transform.py @@ -5,30 +5,22 @@ def get_and_transform_data(): # get raw data - data = ADME(name="PAMPA_NCATS") - fn_data_original = "data_original.csv" - data.get_data().to_csv(fn_data_original, index=False) + splits = ADME(name="PAMPA_NCATS").get_split() + df_train = 
splits["train"] + df_valid = splits["valid"] + df_test = splits["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" - # create dataframe - df = pd.read_csv( - fn_data_original, - delimiter=",", - ) # not necessary but ensure we can load the saved data + df = pd.concat([df_train, df_valid, df_test], axis=0) # check if fields are the same fields_orig = df.columns.tolist() - assert fields_orig == [ - "Drug_ID", - "Drug", - "Y", - ] + assert fields_orig == ["Drug_ID", "Drug", "Y", "split"] # overwrite column names = fields - fields_clean = [ - "compound_id", - "SMILES", - "permeability", - ] + fields_clean = ["compound_id", "SMILES", "permeability", "split"] df.columns = fields_clean # data cleaning @@ -43,22 +35,23 @@ def get_and_transform_data(): meta = { "name": "pampa_ncats", # unique identifier, we will also use this for directory names "description": """PAMPA (parallel artificial membrane permeability assay) is a commonly - employed assay to evaluate drug permeability across the cellular membrane. PAMPA is a - non-cell-based, low-cost and high-throughput alternative to cellular models. Although - PAMPA does not model active and efflux transporters, it still provides permeability values - that are useful for absorption prediction because the majority of drugs are absorbed by - passive diffusion through the membrane.""", +employed assay to evaluate drug permeability across the cellular membrane. +PAMPA is a non-cell-based, low-cost and high-throughput alternative to cellular models. 
+Although PAMPA does not model active and efflux transporters, it still provides permeability values +that are useful for absorption prediction because the majority of drugs are absorbed +by passive diffusion through the membrane.""", "targets": [ { "id": "permeability", # name of the column in a tabular dataset "description": "Binary permeability in PAMPA assay.", # description of what this column means - "units": "Bool", # units of the values in this column (leave empty if unitless) - "type": "categorical", # can be "categorical", "ordinal", "continuous" + "units": None, + "type": "boolean", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) - "binary permeability in PAMPA assay", - "permeability in PAMPA assay", - "PAMPA permeability", + "is permeable in the PAMPA assay", + "shows permeability in parallel artificial membrane permeability assay (PAMPA) assay", ], + "pubchem_aids": [1508612], + "uris": ["http://purl.bioontology.org/ontology/MESH/D002463"], }, ], "identifiers": [ @@ -79,25 +72,33 @@ def get_and_transform_data(): "description": "corresponding publication", }, ], + "benchmarks": [ + { + "name": "TDC", + "link": "https://tdcommons.ai/", + "split_column": "split", + } + ], "num_points": len(df), # number of datapoints in this dataset "bibtex": [ """@article{siramshetty2021validating, - title={Validating ADME QSAR Models Using Marketed Drugs}, - author={Siramshetty, Vishal and Williams, Jordan and Nguyen, DHac-Trung and Neyra, Jorge and Southall, - Noel and Math'e, Ewy and Xu, Xin and Shah, Pranav}, - journal={SLAS DISCOVERY: Advancing the Science of Drug Discovery}, - volume={26}, - number={10}, - pages={1326--1336}, - year={2021}, - publisher={SAGE Publications Sage CA: Los Angeles, CA} - }""", +title={Validating ADME QSAR Models Using Marketed Drugs}, +author={Siramshetty, Vishal and Williams, Jordan and Nguyen, DHac-Trung and Neyra, Jorge and Southall, +Noel and Math'e, Ewy 
and Xu, Xin and Shah, Pranav}, +journal={SLAS DISCOVERY: Advancing the Science of Drug Discovery}, +volume={26}, +number={10}, +pages={1326--1336}, +year={2021}, +publisher={SAGE Publications Sage CA: Los Angeles, CA} +}""", ], } def str_presenter(dumper, data): """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + Ref: + https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data """ if data.count("\n") > 0: # check for multiline string return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") diff --git a/data/potassium_ion_channel_kir2_1_butkiewicz/meta.yaml b/data/potassium_ion_channel_kir2_1_butkiewicz/meta.yaml new file mode 100644 index 000000000..db1d6da4e --- /dev/null +++ b/data/potassium_ion_channel_kir2_1_butkiewicz/meta.yaml @@ -0,0 +1,92 @@ +--- +name: potassium_ion_channel_kir2_1_butkiewicz +description: |- + The Kir2.1 inward-rectifier potassium ion channel is + a target in the treatment of cardiovascular, neurological, renal and + metabolic disorders. Primary assay AID 1672. Validation screens AID + 2032 and AID 463252. Counter screens AID 2105, AID 2345, AID 2236, and + AID 2329. The final set of 172 active compounds was constructed + subtracting the actives in AID 2105, AID 2345, AID 2236, and AID 2329 + from the molecules found active in both, AID 2032 and AID 463252 +targets: + - id: activity_potassium_ion_channel + description: whether it is active against potassium ion channel (1) or not (0). 
+ units: + type: boolean + names: + - is blocking potassium ion channel activity + - blocks potassium ion channel activity + - inhibits the Inward-Rectifying Potassium Ion Channel Kir2.1 + pubchem_aids: + - 1672 + - 2032 + - 463252 + - 2105 + - 2345 + - 2236 + - 2329 + uris: + - http://purl.obolibrary.org/obo/XCO_0000225 +identifiers: + - id: SMILES + type: SMILES + description: SMILES +benchmarks: + - name: TDC + link: https://tdcommons.ai/ + split_column: split +license: CC BY 4.0 +links: + - url: https://doi.org/10.3390/molecules18010735 + description: corresponding publication + - url: https://doi.org/10.1093/nar/gky1033 + description: corresponding publication + - url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/ + description: corresponding publication +num_points: 301493 +url: https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al +bibtex: + - |- + @article{Butkiewicz2013, + doi = {10.3390/molecules18010735}, + url = {https://doi.org/10.3390/molecules18010735}, + year = {2013}, + month = jan, + publisher = {{MDPI} {AG}}, + volume = {18}, + number = {1}, + pages = {735--756}, + author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller + and Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens Meiler}, + title = {Benchmarking Ligand-Based Virtual High-Throughput Screening + with the {PubChem} Database}, + journal = {Molecules}} + - |- + @article{Kim2018, + doi = {10.1093/nar/gky1033}, + url = {https://doi.org/10.1093/nar/gky1033}, + year = {2018}, + month = oct, + publisher = {Oxford University Press ({OUP})}, + volume = {47}, + number = {D1}, + pages = {D1102--D1109}, + author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte + and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker + and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang + and Evan E Bolton}, + title = {{PubChem} 2019 update: improved access to chemical data}, + journal = {Nucleic Acids Research}} + - |- + @article{Butkiewicz2017, + doi = {}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, + year = {2017}, + publisher = {Chem Inform}, + volume = {3}, + number = {1}, + author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. + and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, + title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets + from the {P}ub{C}hem {D}atabase}}, + journal = {Chemical Science}} diff --git a/data/potassium_ion_channel_kir2_1_butkiewicz/transform.py b/data/potassium_ion_channel_kir2_1_butkiewicz/transform.py new file mode 100644 index 000000000..e9289d7fb --- /dev/null +++ b/data/potassium_ion_channel_kir2_1_butkiewicz/transform.py @@ -0,0 +1,164 @@ +import pandas as pd +import yaml +from tdc.single_pred import HTS + + +def get_and_transform_data(): + # get raw data + label = "potassium_ion_channel_kir2.1_butkiewicz" + splits = HTS(name=label).get_split() + df_train = splits["train"] + df_valid = splits["valid"] + df_test = splits["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" + + df = pd.concat([df_train, df_valid, df_test], axis=0) + + # check if fields are the same + fields_orig = df.columns.tolist() + assert fields_orig == [ + "Drug_ID", + "Drug", + "Y", + "split", + ] + + # overwrite column names = fields + fields_clean = [ + "compound_id", + "SMILES", + "activity_potassium_ion_channel", + "split", + ] + df.columns = fields_clean + + assert not df.duplicated().sum() + + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + # create meta yaml + meta = { + "name": "potassium_ion_channel_kir2_1_butkiewicz", + "description": """The Kir2.1 inward-rectifier potassium ion channel is +a target in the treatment of cardiovascular, neurological, renal and +metabolic disorders. Primary assay AID 1672. Validation screens AID +2032 and AID 463252. Counter screens AID 2105, AID 2345, AID 2236, and +AID 2329. 
The final set of 172 active compounds was constructed +subtracting the actives in AID 2105, AID 2345, AID 2236, and AID 2329 +from the molecules found active in both, AID 2032 and AID 463252""", + "targets": [ + { + "id": "activity_potassium_ion_channel", + "description": "whether it is active against potassium ion channel (1) or not (0).", + "units": None, + "type": "boolean", + "names": [ + "is blocking potassium ion channel activity", + "blocks potassium ion channel activity", + "inhibts the I nward-Rectifying Potassium Ion Channel Kir2.1", + ], + "pubchem_aids": [1672, 2032, 463252, 2105, 2345, 2236, 2329], + "uris": ["http://purl.obolibrary.org/obo/XCO_0000225"], + }, + ], + "identifiers": [ + { + "id": "SMILES", # column name + "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "SMILES", # description (optional, except for "Other") + }, + ], + "benchmarks": [ + { + "name": "TDC", + "link": "https://tdcommons.ai/", + "split_column": "split", + } + ], + "license": "CC BY 4.0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://doi.org/10.3390/molecules18010735", + "description": "corresponding publication", + }, + { + "url": "https://doi.org/10.1093/nar/gky1033", + "description": "corresponding publication", + }, + { + "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/", + "description": "corresponding publication", + }, + ], + "num_points": len(df), # number of datapoints in this dataset + "url": "https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al", + "bibtex": [ + """@article{Butkiewicz2013, +doi = {10.3390/molecules18010735}, +url = {https://doi.org/10.3390/molecules18010735}, +year = {2013}, +month = jan, +publisher = {{MDPI} {AG}}, +volume = {18}, +number = {1}, +pages = {735--756}, +author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller +and Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens Meiler}, +title = {Benchmarking Ligand-Based Virtual High-Throughput Screening +with the {PubChem} Database}, +journal = {Molecules}}""", + """@article{Kim2018, +doi = {10.1093/nar/gky1033}, +url = {https://doi.org/10.1093/nar/gky1033}, +year = {2018}, +month = oct, +publisher = {Oxford University Press ({OUP})}, +volume = {47}, +number = {D1}, +pages = {D1102--D1109}, +author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte +and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker +and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang +and Evan E Bolton}, +title = {{PubChem} 2019 update: improved access to chemical data}, +journal = {Nucleic Acids Research}}""", + """@article{Butkiewicz2017, +doi = {}, +url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, +year = {2017}, +publisher = {Chem Inform}, +volume = {3}, +number = {1}, +author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. +and Lowe, E. W. and Weaver, D. C. and Meiler, J.}, +title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets +from the {P}ub{C}hem {D}atabase}}, +journal = {Chemical Science}}""", + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + + +if __name__ == "__main__": + get_and_transform_data() diff --git a/data/serine_threonine_kinase_33_butkiewicz/meta.yaml 
b/data/serine_threonine_kinase_33_butkiewicz/meta.yaml new file mode 100644 index 000000000..0032cbfd7 --- /dev/null +++ b/data/serine_threonine_kinase_33_butkiewicz/meta.yaml @@ -0,0 +1,86 @@ +--- +name: serine_threonine_kinase_33_butkiewicz +description: |- + The serine/threonine kinase, STK33, has been shown to + be relevant for proliferation of mutant KRAS-dependent cells involved + in cancer. Primary screen AID 2661. Counter screen AID 2821. AID504583 + as validation screen. Actives in AID 2821 subtracted by the actives + from screen AID504583 resulted in the final set of 172 active + compounds. +targets: + - id: activity_serine_threonine_kinase33 + description: whether it is active against serine threonine kinase 33 receptor (1) or not (0). + units: + type: boolean + names: + - inhibits the activity of the serine/threonine kinase, STK3 + - a serine/threonine kinase, STK3 inhibitor + pubchem_aids: + - 2661 + - 2821 + - 504583 + uris: [] +identifiers: + - id: SMILES + type: SMILES + description: SMILES +benchmarks: + - name: TDC + link: https://tdcommons.ai/ + split_column: split +license: CC BY 4.0 +links: + - url: https://doi.org/10.3390/molecules18010735 + description: corresponding publication + - url: https://doi.org/10.1093/nar/gky1033 + description: corresponding publication + - url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/ + description: corresponding publication +num_points: 319792 +url: https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al +bibtex: + - |- + @article{Butkiewicz2013, + doi = {10.3390/molecules18010735}, + url = {https://doi.org/10.3390/molecules18010735}, + year = {2013}, + month = jan, + publisher = {{MDPI} {AG}}, + volume = {18}, + number = {1}, + pages = {735--756}, + author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and + Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens + Meiler}, + title = {Benchmarking Ligand-Based Virtual High-Throughput + Screening with the {PubChem} Database}, + journal = {Molecules}} + - |- + @article{Kim2018, + doi = {10.1093/nar/gky1033}, + url = {https://doi.org/10.1093/nar/gky1033}, + year = {2018}, + month = oct, + publisher = {Oxford University Press ({OUP})}, + volume = {47}, + number = {D1}, + pages = {D1102--D1109}, + author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta + Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin + A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky + and Jian Zhang and Evan E Bolton}, + title = {{PubChem} 2019 update: improved access to chemical data}, + journal = {Nucleic Acids Research}} + - |- + @article{Butkiewicz2017, + doi = {}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, + year = {2017}, + publisher = {Chem Inform}, + volume = {3}, + number = {1}, + author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, + E. W. and Weaver, D. C. 
and Meiler, J.}, + title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from + the {P}ub{C}hem {D}atabase}}, + journal = {Chemical Science}} diff --git a/data/serine_threonine_kinase_33_butkiewicz/transform.py b/data/serine_threonine_kinase_33_butkiewicz/transform.py new file mode 100644 index 000000000..9baf3d54b --- /dev/null +++ b/data/serine_threonine_kinase_33_butkiewicz/transform.py @@ -0,0 +1,163 @@ +import pandas as pd +import yaml +from tdc.single_pred import HTS + + +def get_and_transform_data(): + # get raw data + label = "serine_threonine_kinase_33_butkiewicz" + splits = HTS(name=label).get_split() + df_train = splits["train"] + df_valid = splits["valid"] + df_test = splits["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" + + df = pd.concat([df_train, df_valid, df_test], axis=0) + + # check if fields are the same + fields_orig = df.columns.tolist() + assert fields_orig == [ + "Drug_ID", + "Drug", + "Y", + "split", + ] + + # overwrite column names = fields + fields_clean = [ + "compound_id", + "SMILES", + "activity_serine_threonine_kinase33", + "split", + ] + df.columns = fields_clean + + assert not df.duplicated().sum() + + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + # create meta yaml + meta = { + "name": "serine_threonine_kinase_33_butkiewicz", + "description": """The serine/threonine kinase, STK33, has been shown to +be relevant for proliferation of mutant KRAS-dependent cells involved +in cancer. Primary screen AID 2661. Counter screen AID 2821. AID504583 +as validation screen. 
Actives in AID 2821 subtracted by the actives +from screen AID504583 resulted in the final set of 172 active +compounds.""", + "targets": [ + { + "id": "activity_serine_threonine_kinase33", + "description": "whether it is active against serine threonine kinase 33 receptor (1) or not (0).", + "units": None, + "type": "boolean", + "names": [ # names for the property (to sample from for building the prompts) + "inhibits the activity of the serine/threonine kinase, STK3", + "a serine/threonine kinase, STK3 inhibitor", + ], + "pubchem_aids": [2661, 2821, 504583], + "uris": [], + }, + ], + "identifiers": [ + { + "id": "SMILES", # column name + "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "SMILES", # description (optional, except for "Other") + }, + ], + "benchmarks": [ + { + "name": "TDC", + "link": "https://tdcommons.ai/", + "split_column": "split", + } + ], + "license": "CC BY 4.0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://doi.org/10.3390/molecules18010735", + "description": "corresponding publication", + }, + { + "url": "https://doi.org/10.1093/nar/gky1033", + "description": "corresponding publication", + }, + { + "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/", + "description": "corresponding publication", + }, + ], + "num_points": len(df), # number of datapoints in this dataset + "url": "https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al", + "bibtex": [ + """@article{Butkiewicz2013, +doi = {10.3390/molecules18010735}, +url = {https://doi.org/10.3390/molecules18010735}, +year = {2013}, +month = jan, +publisher = {{MDPI} {AG}}, +volume = {18}, +number = {1}, +pages = {735--756}, +author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller and +Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens +Meiler}, +title = {Benchmarking Ligand-Based Virtual High-Throughput +Screening with the {PubChem} Database}, +journal = {Molecules}}""", + """@article{Kim2018, +doi = {10.1093/nar/gky1033}, +url = {https://doi.org/10.1093/nar/gky1033}, +year = {2018}, +month = oct, +publisher = {Oxford University Press ({OUP})}, +volume = {47}, +number = {D1}, +pages = {D1102--D1109}, +author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta +Gindulyte and Jia He and Siqian He and Qingliang Li and Benjamin +A Shoemaker and Paul A Thiessen and Bo Yu and Leonid Zaslavsky +and Jian Zhang and Evan E Bolton}, +title = {{PubChem} 2019 update: improved access to chemical data}, +journal = {Nucleic Acids Research}}""", + """@article{Butkiewicz2017, +doi = {}, +url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, +year = {2017}, +publisher = {Chem Inform}, +volume = {3}, +number = {1}, +author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. and Lowe, +E. W. and Weaver, D. C. and Meiler, J.}, +title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets from +the {P}ub{C}hem {D}atabase}}, +journal = {Chemical Science}}""", + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + + +if __name__ == "__main__": + get_and_transform_data() diff --git a/data/tyrosyl-dna_phosphodiesterase_butkiewicz/meta.yaml 
b/data/tyrosyl-dna_phosphodiesterase_butkiewicz/meta.yaml new file mode 100644 index 000000000..7530e6e52 --- /dev/null +++ b/data/tyrosyl-dna_phosphodiesterase_butkiewicz/meta.yaml @@ -0,0 +1,81 @@ +--- +name: tyrosyl-dna_phosphodiesterase_butkiewicz +description: | + Inhibition of Human tyrosyl-DNA phosphodiesterase 1 (TDP1) + potentially enhances anticancer activity of DNA topoisomerase I inhibitors. + Primary screen AID 485290. Counter screen AID 489007. + Final set contains all compounds active in the counter screen AID 489007. +targets: + - id: activity_tyrosyl_dna_phosphodiesterase + description: whether it active against tyrosyl-dna phosphodiesterase receptor (1) or not (0). + units: + type: boolean + names: + - inhibiting human tyrosyl-DNA phosphodiesterase 1 (TDP1) + - a tyrosyl-DNA phosphodiesterase 1 (TDP1) inhibitor + pubchem_aids: + - 485290 + - 489007 + uris: [] +identifiers: + - id: SMILES + type: SMILES + description: SMILES +license: CC BY 4.0 +links: + - url: https://doi.org/10.3390/molecules18010735 + description: corresponding publication + - url: https://doi.org/10.1093/nar/gky1033 + description: corresponding publication + - url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/ + description: corresponding publication +benchmarks: + - name: TDC + link: https://tdcommons.ai/ + split_column: split +num_points: 341365 +url: https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al +bibtex: + - |- + @article{Butkiewicz2013, + doi = {10.3390/molecules18010735}, + url = {https://doi.org/10.3390/molecules18010735}, + year = {2013}, + month = jan, + publisher = {{MDPI} {AG}}, + volume = {18}, + number = {1}, + pages = {735--756}, + author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller + and Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens Meiler}, + title = {Benchmarking Ligand-Based Virtual High-Throughput Screening + with the {PubChem} Database}, + journal = {Molecules}} + - |- + @article{Kim2018, + doi = {10.1093/nar/gky1033}, + url = {https://doi.org/10.1093/nar/gky1033}, + year = {2018}, + month = oct, + publisher = {Oxford University Press ({OUP})}, + volume = {47}, + number = {D1}, + pages = {D1102--D1109}, + author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte + and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker + and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, + title = {{PubChem} 2019 update: improved access to chemical data}, + journal = {Nucleic Acids Research}} + - |- + @article{Butkiewicz2017, + doi = {}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, + year = {2017}, + publisher = {Chem Inform}, + volume = {3}, + number = {1}, + author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. + and Lowe, E. W. and Weaver, D. C. 
and Meiler, J.}, + title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets + from the {P}ub{C}hem {D}atabase}}, + journal = {Chemical Science}} diff --git a/data/tyrosyl-dna_phosphodiesterase_butkiewicz/transform.py b/data/tyrosyl-dna_phosphodiesterase_butkiewicz/transform.py new file mode 100644 index 000000000..c458f4c34 --- /dev/null +++ b/data/tyrosyl-dna_phosphodiesterase_butkiewicz/transform.py @@ -0,0 +1,161 @@ +import pandas as pd +import yaml +from tdc.single_pred import HTS + + +def get_and_transform_data(): + # get raw data + label = "tyrosyl-dna_phosphodiesterase_butkiewicz" + splits = HTS(name=label).get_split() + df_train = splits["train"] + df_valid = splits["valid"] + df_test = splits["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" + + df = pd.concat([df_train, df_valid, df_test], axis=0) + + # check if fields are the same + fields_orig = df.columns.tolist() + assert fields_orig == [ + "Drug_ID", + "Drug", + "Y", + "split", + ] + + # overwrite column names = fields + fields_clean = [ + "compound_id", + "SMILES", + "activity_tyrosyl_dna_phosphodiesterase", + "split", + ] + df.columns = fields_clean + + assert not df.duplicated().sum() + + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + # create meta yaml + meta = { + "name": "tyrosyl-dna_phosphodiesterase_butkiewicz", + "description": """Inhibition of Human tyrosyl-DNA phosphodiesterase 1 (TDP1) +potentially enhances anticancer activity of DNA topoisomerase I inhibitors. +Primary screen AID 485290. Counter screen AID 489007. +Final set contains all compounds active in the counter screen AID 489007. 
+""", + "targets": [ + { + "id": "activity_tyrosyl_dna_phosphodiesterase", + "description": "whether it active against tyrosyl-dna phosphodiesterase receptor (1) or not (0).", + "units": None, + "type": "boolean", + "names": [ + "inhibiting human tyrosyl-DNA phosphodiesterase 1 (TDP1)", + "a tyrosyl-DNA phosphodiesterase 1 (TDP1) inhibitor", + ], + "pubchem_aids": [485290, 489007], + "uris": [], + }, + ], + "identifiers": [ + { + "id": "SMILES", # column name + "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "SMILES", # description (optional, except for "Other") + }, + ], + "license": "CC BY 4.0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://doi.org/10.3390/molecules18010735", + "description": "corresponding publication", + }, + { + "url": "https://doi.org/10.1093/nar/gky1033", + "description": "corresponding publication", + }, + { + "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/", + "description": "corresponding publication", + }, + ], + "benchmarks": [ + { + "name": "TDC", + "link": "https://tdcommons.ai/", + "split_column": "split", + } + ], + "num_points": len(df), # number of datapoints in this dataset + "url": "https://tdcommons.ai/single_pred_tasks/hts/#butkiewicz-et-al", + "bibtex": [ + """@article{Butkiewicz2013, +doi = {10.3390/molecules18010735}, +url = {https://doi.org/10.3390/molecules18010735}, +year = {2013}, +month = jan, +publisher = {{MDPI} {AG}}, +volume = {18}, +number = {1}, +pages = {735--756}, +author = {Mariusz Butkiewicz and Edward Lowe and Ralf Mueller +and Jeffrey Mendenhall and Pedro Teixeira and C. 
Weaver and Jens Meiler}, +title = {Benchmarking Ligand-Based Virtual High-Throughput Screening +with the {PubChem} Database}, +journal = {Molecules}}""", + """@article{Kim2018, +doi = {10.1093/nar/gky1033}, +url = {https://doi.org/10.1093/nar/gky1033}, +year = {2018}, +month = oct, +publisher = {Oxford University Press ({OUP})}, +volume = {47}, +number = {D1}, +pages = {D1102--D1109}, +author = {Sunghwan Kim and Jie Chen and Tiejun Cheng and Asta Gindulyte +and Jia He and Siqian He and Qingliang Li and Benjamin A Shoemaker +and Paul A Thiessen and Bo Yu and Leonid Zaslavsky and Jian Zhang and Evan E Bolton}, +title = {{PubChem} 2019 update: improved access to chemical data}, +journal = {Nucleic Acids Research}}""", + """@article{Butkiewicz2017, +doi = {}, +url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5962024/}, +year = {2017}, +publisher = {Chem Inform}, +volume = {3}, +number = {1}, +author = {Butkiewicz, M. and Wang, Y. and Bryant, S. H. +and Lowe, E. W. and Weaver, D. C. and Meiler, J.}, +title = {{H}igh-{T}hroughput {S}creening {A}ssay {D}atasets +from the {P}ub{C}hem {D}atabase}}, +journal = {Chemical Science}}""", + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: + https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + + +if __name__ == "__main__": + get_and_transform_data() diff --git a/experiments/README.md b/experiments/README.md new file mode 100644 index 
000000000..c6308eff6 --- /dev/null +++ b/experiments/README.md @@ -0,0 +1,49 @@ +# Working with the Stability cluster + +We currently run our large scale experiments on the Stability AI HPC cluster. +This subdirectory features a few helpful scripts that can help you get up and +running on the cluster. + +If you believe you need access to the cluster for your work please reach out +to the core team on Discord. + +1. [Install Miniconda](scripts/stability-cluster/miniconda_install.sh) - + installs miniconda for your cluster environment. + +2. [Create Environment](scripts/stability-cluster/env_creation.sh) - + creates a basic conda environment for experiments. + + - Creates a conda environment at the prefix `CONDA_ENV_PATH` path. + > Using the positional argument passed into the script + - Clones `chemnlp` into your personal cluster `USER` directory. + - Installs the current revision of the `chemnlp` repository and + dependencies that are in your personal directory into the conda environment. + + ```bash + # general case + source experiments/scripts/stability-cluster/env_creation.sh where/to/store/conda where/to/build/conda/from/ + + # for creating a personal environment + source experiments/scripts/stability-cluster/env_creation.sh jack/ jack/ + ``` + +3. [Running Experiment](scripts/stability-cluster/sbatch_run.sh) - + runs a GPT-NeoX training pipeline + + - creates a conda environment using the `env_creation.sh` script. 
+ - runs the GPT-NeoX `train.py` script using the user configuration + > as GPT-NeoX configurations can be combined, the PEFT configurations are held + > separately to the full model training and cluster configurations + + ```bash + # general case + sbatch experiments/scripts/stability-cluster/sbatch_run.sh where/to/store/conda where/to/build/conda/from/ + + # for typical small model finetuning experiments + sbatch experiments/scripts/stability-cluster/sbatch_run.sh experiments/my-experiment jack cluster_setup.yml 160M.yml + + # for typical small model soft-prompt experiments + sbatch experiments/scripts/stability-cluster/sbatch_run.sh experiments/my-experiment jack cluster_setup.yml 160M.yml soft_prompt.yml + ``` + + > To interact with WandB services you need to authenticate yourself as per the [Stability HPC guidelines](https://www.notion.so/stabilityai/Stability-HPC-Cluster-User-Guide-226c46436df94d24b682239472e36843) to append a username + password to your .netrc file. diff --git a/experiments/configs/160M.yml b/experiments/configs/160M.yml new file mode 100644 index 000000000..4a729f69d --- /dev/null +++ b/experiments/configs/160M.yml @@ -0,0 +1,106 @@ +# Model architecture found at https://huggingface.co/EleutherAI/pythia-160m +# Pretraining config at https://github.com/EleutherAI/pythia/blob/main/models/160M/pythia-160m.yml +# See other examples at https://github.com/EleutherAI/gpt-neox/tree/main/configs +{ + # parallelism settings + # you will want to change these based on your cluster setup, + # ideally scheduling pipeline stages across the node boundaries + "pipe-parallel-size": 1, + "model-parallel-size": 1, + + # model settings + "num-layers": 12, + "hidden-size": 768, + "num-attention-heads": 12, + "seq-length": 2048, + "max-position-embeddings": 2048, + "pos-emb": "rotary", + "rotary-pct": 0.25, + "no-weight-tying": true, + "gpt-j-residual": true, + "output-layer-parallelism": "column", + + # these should provide a speedup but take time to build + 
"scaled-upper-triang-masked-softmax-fusion": false, + "bias-gelu-fusion": false, + + # init methods + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0006, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.00006, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + "cpu_offload": False + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data-impl": "mmap", + "gas": 1, + "num_workers": 1, + + # activation checkpointing + "checkpoint-activations": true, + "checkpoint-num-layers": 1, + "partition-activations": true, + "synchronize-each-layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight-decay": 0.1, + "hidden-dropout": 0.0, + "attention-dropout": 0.0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 12, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train-iters": 320000, + "lr-decay-iters": 320000, + "distributed-backend": "nccl", + "lr-decay-style": "cosine", + "warmup": 0.01, + "checkpoint-factor": 10000, + "eval-interval": 1000, + "eval-iters": 10, + + # logging + "log-interval": 100, + "steps_per_print": 10, + "keep-last-n-checkpoints": 4, + "wall_clock_breakdown": true, + + # tokenisation + "tokenizer-type": "HFTokenizer", + "vocab-file": "/fsx/pile/20B_tokenizer.json", + + # deepspeed + "launcher": "openmpi", + "deepspeed_mpi": true, +} diff --git a/configs/tune.yaml b/experiments/configs/accelerate_tune.yaml similarity index 100% rename from configs/tune.yaml rename to experiments/configs/accelerate_tune.yaml diff --git a/experiments/configs/cluster_setup.yml b/experiments/configs/cluster_setup.yml new file mode 100644 index 000000000..9df0b00d0 --- /dev/null +++ b/experiments/configs/cluster_setup.yml @@ -0,0 +1,24 @@ +# Suggested data paths when using GPT-NeoX locally +{ + # see example configs for sampling options + "data-path": "/fsx/proj-chemnlp/data/marianna13/chemrxiv/data_text_document", + + "save": "/fsx/proj-chemnlp/experiments/checkpoints/finetuned/pythia-160M", + "load": "/fsx/proj-chemnlp/experiments/checkpoints/pretrained/pythia-160M", + "finetune": True, + "checkpoint_validation_with_forward_pass": False, + + "log-dir": "/fsx/proj-chemnlp/experiments/logs", + "log_interval": 100, + "log_grad_pct_zeros": False, + "log_param_norm": False, + "log_grad_norm": False, + + "use_wandb": True, + "wandb_host": "https://stability.wandb.io", + "wandb_project": "LLCheM", + "wandb_group": "Test Runs", + + "hostfile": "/mock_path", + "num_gpus": 1, +} diff --git a/experiments/configs/soft_prompt.yml b/experiments/configs/soft_prompt.yml new file mode 100644 index 000000000..4985fde4d --- /dev/null +++ b/experiments/configs/soft_prompt.yml @@ -0,0 +1,9 @@ +{ + # peft method settings + "soft_prompt_tuning": { + "enabled": True, # also freezes all other parameters + "n_tokens": 10, + 
"init_string": "", + "init_range": 0.5, + } +} diff --git a/experiments/data/prepare_chemrxiv.py b/experiments/data/prepare_chemrxiv.py new file mode 100644 index 000000000..5a1735756 --- /dev/null +++ b/experiments/data/prepare_chemrxiv.py @@ -0,0 +1,43 @@ +""" +Preparing chemrxiv dataset as per GPT-NeoX guidelines +NOTE this needs to be run from the root of this repository directory + +Example usage: + python experiments/data/prepare_chemrxiv.py /fsx/proj-chemnlp/data/ chemnlp/gpt-neox/ +""" +import argparse +import os + +import datasets +import jsonlines + +DATASET = "marianna13/chemrxiv" +GPT_NEOX_KEY = "text" + +if __name__ == "__main__": + # parse args + parser = argparse.ArgumentParser() + parser.add_argument( + "save_dir", help="Where you want to store the prepared dataset." + ) + parser.add_argument( + "gptneox_dir", help="Where you can find the GPT-NeoX repository." + ) + args = parser.parse_args() + + # save initial strings from chemrxiv articles as jsonlines + chem_data = datasets.load_dataset(DATASET) + all_full_text_samples = [ + {GPT_NEOX_KEY: paper["TEXT"]} for paper in chem_data["train"] + ] + save_path = f"{args.save_dir}/{DATASET}" + data_path = f"{save_path}/data.jsonl" + os.makedirs(save_path, exist_ok=True) + with jsonlines.open(data_path, "w") as writer: + writer.write_all(all_full_text_samples) + + # execute gpt-neox processing + gpt_tool_path = f"{args.gptneox_dir}/tools/preprocess_data.py" + os.system( + f"python {gpt_tool_path} --input {data_path} --output-prefix {save_path}/data --vocab /fsx/pile/20B_tokenizer.json --dataset-impl mmap --tokenizer-type HFTokenizer --append-eod" # noqa: E501 + ) diff --git a/scripts/run_tune.py b/experiments/scripts/run_tune.py similarity index 97% rename from scripts/run_tune.py rename to experiments/scripts/run_tune.py index 63a17d094..8381efc86 100644 --- a/scripts/run_tune.py +++ b/experiments/scripts/run_tune.py @@ -15,7 +15,7 @@ from chemnlp.utils import load_config HERE = Path(__file__).resolve() 
-CONFIG_PATH = HERE.parent.parent / "configs/tune.yaml" +CONFIG_PATH = HERE.parent.parent / "configs/accelerate_tune.yaml" def run(): diff --git a/experiments/scripts/stability-cluster/env_creation.sh b/experiments/scripts/stability-cluster/env_creation.sh new file mode 100644 index 000000000..bb10b1e25 --- /dev/null +++ b/experiments/scripts/stability-cluster/env_creation.sh @@ -0,0 +1,31 @@ +#! /bin/bash +### This script creates a conda environment for chemnlp +### The first arg ($1) is the prefix directory where the environment is saved +### The second arg ($2) is the directory to use when building the environment + +## Must already have miniconda installed! +export CONDA_ENV_PATH=/fsx/proj-chemnlp/$1/conda/env/chemnlp-standard +export PYTHON_VER=3.8 + +## ensure we can use activate syntax in slurm scripts +CONDA_BASE=$(conda info --base) +source $CONDA_BASE/etc/profile.d/conda.sh + +# Create Python environment through conda +conda create --force --prefix ${CONDA_ENV_PATH} python=${PYTHON_VER} -y +conda activate ${CONDA_ENV_PATH} + +# Python requirements +## cd into your directory inside of proj-chemnlp +cd /fsx/proj-chemnlp/$2 + +## clone + submodules (ok if exists) +[ ! -d 'chemnlp' ] && git clone --recurse-submodules --remote-submodules git@github.com:OpenBioML/chemnlp.git + +## install +pip install -r chemnlp/gpt-neox/requirements/requirements.txt # base gpt-neox reqs +pip install -r chemnlp/gpt-neox/requirements/requirements-wandb.txt # add wandb monitoring reqs + +## downgrades / pins +pip install protobuf=="3.20" +pip install numpy=="1.23" diff --git a/experiments/scripts/stability-cluster/miniconda_install.sh b/experiments/scripts/stability-cluster/miniconda_install.sh new file mode 100644 index 000000000..b0660ef07 --- /dev/null +++ b/experiments/scripts/stability-cluster/miniconda_install.sh @@ -0,0 +1,5 @@ +#! 
/bin/bash + +cd ~ +wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh +bash Miniconda3-latest-Linux-x86_64.sh # Follow instructions, accept all conditions blindly diff --git a/experiments/scripts/stability-cluster/sbatch_run.sh b/experiments/scripts/stability-cluster/sbatch_run.sh new file mode 100644 index 000000000..9ae422299 --- /dev/null +++ b/experiments/scripts/stability-cluster/sbatch_run.sh @@ -0,0 +1,32 @@ +#! /bin/bash +#SBATCH --job-name="chemtest" +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=12 +# #SBATCH --gres=gpu:2 +#SBATCH --output=/fsx/proj-chemnlp/experiments/logs/job_%j.out +#SBATCH --error=/fsx/proj-chemnlp/experiments/logs/job_%j.err +#SBATCH --open-mode=append +#SBATCH --account=chemnlp +#SBATCH --partition=g40 +#SBATCH --exclusive +# #SBATCH --nodelist=ip-26-0-128-[46,48,85,93-94,101,106,111,123,136,142-143,168-169,175,183,189,211,215,223,231,244],ip-26-0-129-[0-1,4,6,11,45,48,60,81-82,84-85,94,105],ip-26-0-130-[183,193],ip-26-0-131-[4-5,38,51,77,85,89,107-108,111-112,130,143,150-152,168,182-183,188],ip-26-0-132-[130,139,141-142,149,154,184],ip-26-0-133-[159-160,226,242],ip-26-0-134-[0,26-27,43,52,61],ip-26-0-137-[92,94,97,102,115-116,121,124,139,168,175],ip-26-0-139-[191,200,214,216,218,226,229,235,237,241,246],ip-26-0-142-[106,125,144,146,166,184,186,198,204,217,235,237,246,251,254],ip-26-0-143-[30,39,46,53,61,66,145,164,171,175,180,206,225,230,235,250],ip-26-0-129-122,ip-26-0-130-[12-13,19,116,127,132,134,147-148,150,163-164],ip-26-0-131-[239-240,244,247],ip-26-0-132-[7,10,21,37,93,98,107,118],ip-26-0-133-[67,76,81,89,111,115,126,131-133,140,145,148,151],ip-26-0-134-[66,76,83,90-91,105,120,134,141,157,201,219,226-227,248,254],ip-26-0-135-[1,4,22,49,55,64,67,110,118,163,173,184,186,190,192-193,204,208,219,242,255],ip-26-0-136-13,ip-26-0-137-[176,184,196,212,214,240],ip-26-0-138-[3,13,51,62,66,69,71,79,93,101,159,166,171,178,186,188,208,213],ip-26-0-141-[140,146,157,161,166,178,217,228,247],
ip-26-0-142-[3,13,21,24,29,33,36,38,41,45,49,67,71,103],ip-26-0-143-[111,121],ip-26-0-128-146,ip-26-0-137-76 + +### This script runs a GPT-NeoX experiment +### The first arg ($1) is the prefix directory where the environment is saved +### The second arg ($2) is the directory to use when building the environment +### The third arg ($3) is the name of the cluster config +### The fourth arg ($4) is the name of the training config +### The fifth arg ($5) is the name of any supplementary config (prompt tuning) + +set -ex # allow for exiting based on non-0 codes + +# set workdir +CHEMNLP_PATH=/fsx/proj-chemnlp/$2/chemnlp + +# create environment +source $CHEMNLP_PATH/experiments/scripts/stability-cluster/env_creation.sh $1 $2 + +# trigger run +cd $CHEMNLP_PATH/gpt-neox +python3 deepy.py train.py --conf_dir $CHEMNLP_PATH/experiments/configs $3 $4 $5 diff --git a/pyproject.toml b/pyproject.toml index fdef5d6d3..af0db38a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,11 +30,12 @@ dev = [ "pre-commit", "pydantic_yaml", "pytest", + "pubchempy" ] -# [project.optional-dependencies] -# dataset_creation = [ -# ] +dataset_creation = [ + "PyTDC", +] [tool.setuptools_scm] version_scheme = "post-release" diff --git a/src/chemnlp/data_val/model.py b/src/chemnlp/data_val/model.py index 5f0541a20..acb1aa372 100644 --- a/src/chemnlp/data_val/model.py +++ b/src/chemnlp/data_val/model.py @@ -1,5 +1,7 @@ from typing import Dict, List, Optional +import pubchempy as pcp +import requests from pydantic import root_validator, validator from pydantic_yaml import YamlModel, YamlStrEnum @@ -13,13 +15,21 @@ class IdentifierEnum(YamlStrEnum): inchi = "InChI" inchikey = "InChIKey" other = "Other" + # we distinguish two RXN-SMILES variants. + # the simple one only includes educt and product + # the other one (rxnsmilesWAdd) also includes solvents etc.
+    rxnsmiles = "RXNSMILES"
+    rxnsmilesWAdd = "RXNSMILESWAdd"
 
 
 class Identifier(YamlModel):
     """Identifier information."""
 
     id: str
+
     description: Optional[str]
+    """A description of the field"""
+
     type: IdentifierEnum
     names: Optional[List[str]]
 
@@ -43,16 +53,99 @@ class ColumnTypes(YamlStrEnum):
     continuous = "continuous"
     categorical = "categorical"
     ordinal = "ordinal"
+    boolean = "boolean"
 
 
 class Target(YamlModel):
     """Target information."""
 
     id: str
+
     description: str
-    units: str
+    """An English description of the field"""
+
+    units: Optional[str]
+    """The units of the field. None if unitless."""
+
     type: ColumnTypes
+    """The type of the field. Can be one of `continuous`, `categorical`, `ordinal`, `boolean`."""
+
     names: List[str]
+    """A list of names describing the field.
+
+    Note that this will be used in building the prompts. Some example prompts:
+
+    - Boolean variables
+
+        - `Is <identifier> <name>?`
+        - ```
+        What molecules in the list are <name>?
+        - <identifier_1>
+        - <identifier_2>
+        - <identifier_3>
+        ```
+
+
+    - Continuous variables
+
+        - `What is <name> of <identifier>?`
+        - ```
+        What is the molecule with largest <name> in the following list?
+        - <identifier_1>
+        - <identifier_2>
+        - <identifier_3>
+        ```
+    """
+
+    uris: Optional[List[str]]
+    """A URI or multiple (consistent) URIs for the field.
+
+    Ideally this would be a link to an entry in an ontology or controlled
+    vocabulary that can also provide a canonical description for the field.
+    """
+
+    pubchem_aids: Optional[List[int]]
+    """A PubChem assay ID or multiple (consistent) PubChem assay IDs.
+
+    Make sure that the first assay ID is the primary assay ID.
+    """
+
+    @validator("uris")
+    def uris_resolves(cls, values):
+        """Check that every URI can be fetched.
+
+        A 403 answer only prints a warning; any other non-200 answer, or a
+        failed request, raises ``ValueError``. Returns ``values`` unchanged.
+        """
+        if values is not None:
+            for uri in values:
+                # perform a request to the URI and check if it resolves
+                try:
+                    response = requests.get(uri, timeout=30)
+                except requests.RequestException as err:
+                    raise ValueError(f"URI {uri} does not resolve") from err
+                if response.status_code == 403:
+                    print(
+                        f"URI {uri} does not resolve (403) since forbidden, please check manually"
+                    )
+                elif response.status_code != 200:
+                    raise ValueError(f"URI {uri} does not resolve")
+        # pydantic assigns the validator's return value to the field
+        return values
+
+    @validator("pubchem_aids")
+    def pubchem_assay_ids_resolve(cls, values):
+        """Check that PubChem returns at least one assay for every ID.
+
+        Raises ``ValueError`` for an ID without assay records and returns
+        ``values`` unchanged.
+        """
+        if values is not None:
+            for aid in values:
+                assays = pcp.get_assays(aid)
+                if not assays:
+                    raise ValueError(f"PubChem assay ID {aid} does not resolve")
+        return values
 
 
 class Template(YamlModel):
@@ -79,6 +172,19 @@ class Link(YamlModel):
     description: str
 
 
+class Benchmark(YamlModel):
+    """Benchmark information."""
+
+    name: str
+    """The name of the benchmark, e.g. MoleculeNet."""
+
+    link: str
+    """The link to the benchmark."""
+
+    split_column: str
+    """The name of the column in the dataset that indicates the fold of the data point."""
+
+
 class Dataset(YamlModel):
     name: str
     description: str
@@ -91,7 +197,36 @@ class Dataset(YamlModel):
     fields: Optional[Dict[str, TemplateField]]
     links: List[Link]
 
+    benchmarks: Optional[List[Benchmark]]
+
     @validator("num_points")
     def num_points_must_be_positive(cls, v):
         if v < 0:
             raise ValueError("num_points must be positive")
+        # pydantic assigns the validator's return value to the field
+        return v
+
+    @validator("links")
+    def links_must_resolve(cls, v):
+        """Check that the URL of every link can be fetched.
+
+        A 403 answer only prints a warning. Any other non-200 answer raises
+        ``ValueError`` unless the body mentions "acs" or "sage"; a failed
+        request raises ``ValueError`` too. Returns ``v`` unchanged.
+        """
+        if v is not None:
+            for link in v:
+                try:
+                    response = requests.get(link.url, timeout=30)
+                except requests.RequestException as err:
+                    raise ValueError(f"Link {link.url} does not resolve") from err
+                if response.status_code == 403:
+                    print(
+                        f"Link {link.url} does not resolve (403) since forbidden, please check manually"
+                    )
+                elif response.status_code != 200:
+                    # NOTE(review): presumably exempts ACS/SAGE pages that reject
+                    # scripted requests - confirm before relying on this
+                    if not (("acs" in response.text) or ("sage" in response.text)):
+                        raise ValueError(f"Link {link.url} does not resolve")
+        return v