From 9bd4531ed2a6e62d840a39190de8a24c3fb6650d Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Sun, 15 Jan 2023 15:01:16 +0200 Subject: [PATCH 01/19] cleanup the makefile --- Makefile | 3 --- 1 file changed, 3 deletions(-) diff --git a/Makefile b/Makefile index 11965232..6034d8ad 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,5 @@ # https://makefiletutorial.com/ -run_stub: - bash ./data/mock_data/generate_mock_files.sh && nextflow run main.nf -entry TEST -profile dev -stub-run -process.cpus 1 -process.memory 1.GB -resume - run_dev: nextflow run main.nf -profile conda,dev -entry TEST -resume -with-tower From ecc9fe6a4cbe84f223195c9d6cfa900f3b2686c3 Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Sun, 15 Jan 2023 15:01:35 +0200 Subject: [PATCH 02/19] tweak comments --- workflows/merge_wf.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/merge_wf.nf b/workflows/merge_wf.nf index a6b47a71..39f4f40d 100644 --- a/workflows/merge_wf.nf +++ b/workflows/merge_wf.nf @@ -43,7 +43,6 @@ workflow MERGE_WF { - //FIXME: Refactor this to emit two different files and use only the approved samples //NOTE: Use the stats file for the entire cohort (from CALL_WF) // and filter out the samples which pass all thresholds approved_call_wf_samples_ch = cohort_stats_tsv From c46c8d6b14a99408fc3472b7c48edc1af3e66859 Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Sun, 15 Jan 2023 15:23:30 +0200 Subject: [PATCH 03/19] update readme --- README.md | 68 ++++++++++------------------------- docs/cloud_batch_execution.md | 0 docs/conda_execution.md | 38 ++++++++++++++++++++ docs/docker_execution.md | 0 docs/hpc_execution.md | 0 5 files changed, 56 insertions(+), 50 deletions(-) create mode 100644 docs/cloud_batch_execution.md create mode 100644 docs/conda_execution.md create mode 100644 docs/docker_execution.md create mode 100644 docs/hpc_execution.md diff --git a/README.md b/README.md index df5f1392..4cc0a36d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# XBS-nf +# 
MAGMA -XBS-nf (compleX Bacterial Samples) is a pipeline for comprehensive genomic analyses of Mycobacterium tuberculosis with a focus on clinical decision making as well as research. +MAGMA (**M**aximum **A**ccessible **G**enome for **M**tb **A**nalysis) is a pipeline for comprehensive genomic analyses of Mycobacterium tuberculosis with a focus on clinical decision making as well as research. # Salient features of the implementation @@ -9,25 +9,23 @@ XBS-nf (compleX Bacterial Samples) is a pipeline for comprehensive genomic analy - Ease of use on a range of infrastructure (cloud/on-prem HPC clusters/ servers (or local machines)) - Resumability for failed processes - Centralized locations for specifying analysis parameters and hardware requirements - - XBS-nf parameters (`default_parameters.config`) + - MAGMA parameters (`default_parameters.config`) - Hardware requirements (`conf/standard.config`) - Execution (software) requirements (`conf/docker.config` or `conf/conda.config`) -- A GVCF reference dataset for ~600 samples +- An (optional) GVCF reference dataset for ~600 samples is provided for augmenting smaller datasets + +> **Note** +> Downloading the reference EXIT_RIF GVCF files from FIXME # Usage and Tutorial -For the usage and tutorials please refer the XBS-nf website +For the usage and tutorials please refer the [docs](./docs) folder. ## Prerequisites -### Git tooling - -- `git` and `git-lfs` - -> NOTE: Without the `git-lfs` tool the optional bundled wouldn't be downloaded correctly. - ### Nextflow +- `git` : The version control in the pipeline. - `Java-11` or `Java-17` (preferred) **NOTE**: The `java` version should NOT be an `internal jdk` release! 
You can check the release via `java -version` @@ -44,7 +42,7 @@ $ curl -s https://get.nextflow.io | bash $ chmod +x nextflow ``` -- Add `nextflow` to your `path` (perhaps `/usr/local/bin/`) +- Add `nextflow` to your `path` (for example `/usr/local/bin/`) ```sh $ mv nextflow /usr/local/bin @@ -64,47 +62,17 @@ $ nextflow info ``` -### Local Conda environments for XBS-nf - -> **NOTE**: The conda environments are expected by the `conda_local` profile to be created within `xbs-nf/conda_envs` directory - -- Clone the pipeline locally and `cd` into it - -```sh -$ git clone https://github.com/TORCH-Consortium/xbs-nf - -$ cd xbs-nf - -``` - -- `cd` in the `conda_envs` folder and execute the following commands -```sh -$ conda env create -p xbs-nf-env-1 --file xbs-nf-env-1.yml - -$ conda env create -p xbs-nf-env-2 --file xbs-nf-env-2.yml -``` - -> TIP: For faster installation process, please download [mamba](https://github.com/mamba-org/mamba) tool and replace `conda` with `mamba` in the above commands. - -### Run the pipeline - -- Customize the pipeline and process level settings in the [default_params](./default_params.config) file +### Running MAGMA on different environments -- From inside the `xbs-nf` folder, invoke the pipeline - -```sh -$ nextflow run main.nf -profile conda -``` -- use the ```-resume``` flag to continue from previously generated output files, rather than starting from scratch. - -```sh -$ nextflow run main.nf -profile conda -resume -``` +1. Local Conda environments for MAGMA +2. Docker based execution for MAGMA +3. HPC based execution for MAGMA +4. Cloud batch (AWS/Google/Azure) based execution for MAGMA - +# Citation - +TODO: Update this section and add a citation.cff file # Contributions @@ -112,4 +80,4 @@ Contributions are warmly accepted! # License -Please refer the [LICENSE](./LICENSE) file. +Please refer the [GPL 3.0 LICENSE](./LICENSE) file. 
diff --git a/docs/cloud_batch_execution.md b/docs/cloud_batch_execution.md new file mode 100644 index 00000000..e69de29b diff --git a/docs/conda_execution.md b/docs/conda_execution.md new file mode 100644 index 00000000..bb339e67 --- /dev/null +++ b/docs/conda_execution.md @@ -0,0 +1,38 @@ + +> **NOTE**: The conda environments are expected by the `conda_local` profile of the pipeline, to be created within `MAGMA/conda_envs` directory + +- Clone the pipeline locally and `cd` into it + +```sh +$ git clone https://github.com/TORCH-Consortium/MAGMA + +$ cd MAGMA + +``` + +- `cd` in the `conda_envs` folder and execute the following commands + +```sh +$ conda env create -p magma-env-1 --file magma-env-1.yml + +$ conda env create -p magma-env-2 --file magma-env-2.yml +``` + +> TIP: For faster installation process, please download [mamba](https://github.com/mamba-org/mamba) tool and replace `conda` with `mamba` in the above commands. + +### Run the pipeline + +- Customize the pipeline and process level settings in the [default_params](./default_params.config) file + +- From inside the `magma` folder, invoke the pipeline + +```sh +$ nextflow run main.nf -profile conda +``` +- use the ```-resume``` flag to continue from previously generated output files, rather than starting from scratch. 
+ +```sh +$ nextflow run main.nf -profile conda -resume +``` + + diff --git a/docs/docker_execution.md b/docs/docker_execution.md new file mode 100644 index 00000000..e69de29b diff --git a/docs/hpc_execution.md b/docs/hpc_execution.md new file mode 100644 index 00000000..e69de29b From 74ce67be8fe0be8b14357a7ad13e66b971a6c967 Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Sun, 15 Jan 2023 15:24:03 +0200 Subject: [PATCH 04/19] update conda env names --- conda_envs/{xbs-nf-env-1.yml => magma-env-1.yml} | 0 conda_envs/{xbs-nf-env-2.yml => magma-env-2.yml} | 0 conda_envs/setup_conda_envs.sh | 12 ++++++------ 3 files changed, 6 insertions(+), 6 deletions(-) rename conda_envs/{xbs-nf-env-1.yml => magma-env-1.yml} (100%) rename conda_envs/{xbs-nf-env-2.yml => magma-env-2.yml} (100%) diff --git a/conda_envs/xbs-nf-env-1.yml b/conda_envs/magma-env-1.yml similarity index 100% rename from conda_envs/xbs-nf-env-1.yml rename to conda_envs/magma-env-1.yml diff --git a/conda_envs/xbs-nf-env-2.yml b/conda_envs/magma-env-2.yml similarity index 100% rename from conda_envs/xbs-nf-env-2.yml rename to conda_envs/magma-env-2.yml diff --git a/conda_envs/setup_conda_envs.sh b/conda_envs/setup_conda_envs.sh index 64d1c9f4..52f5b26f 100644 --- a/conda_envs/setup_conda_envs.sh +++ b/conda_envs/setup_conda_envs.sh @@ -5,15 +5,15 @@ set -e # NOTE: Please replace `conda` with `mamba` if it is installed for faster installs. 
resolverCondaBinary="conda" # pick either conda OR mamba -# NOTE: By default, the conda environments are expected by the `conda_local` profile to be created within `xbs-nf/conda_envs` directory +# NOTE: By default, the conda environments are expected by the `conda_local` profile to be created within `magma/conda_envs` directory -$resolverCondaBinary env create -p xbs-nf-env-1 --file xbs-nf-env-1.yml +$resolverCondaBinary env create -p magma-env-1 --file magma-env-1.yml -$resolverCondaBinary env create -p xbs-nf-env-2 --file xbs-nf-env-2.yml +$resolverCondaBinary env create -p magma-env-2 --file magma-env-2.yml -echo "INFO: Activate conda env with tb-profiler and setup the WHO database within the xbs-nf-env-1" +echo "INFO: Activate conda env with tb-profiler and setup the WHO database within the magma-env-1" eval "$(conda shell.bash hook)" -conda activate "./xbs-nf-env-1" +conda activate "./magma-env-1" echo "INFO: Make a local copy and cd inside it" cp -r ../resources/resistance_db_who ./ @@ -26,5 +26,5 @@ echo "INFO: Remove the local copy of the database folder" cd .. 
rm -rf resistance_db_who -echo "INFO: Deactivate the xbs-nf-env-1 env" +echo "INFO: Deactivate the magma-env-1 env" conda deactivate From 52c8d38842584e2104872cf6797577f364192df1 Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Sun, 15 Jan 2023 15:24:32 +0200 Subject: [PATCH 05/19] update docker container folders --- containers/build.sh | 8 ++++---- .../{xbs-nf-container-1 => magma-container-1}/Dockerfile | 0 .../{xbs-nf-container-2 => magma-container-2}/Dockerfile | 0 3 files changed, 4 insertions(+), 4 deletions(-) rename containers/{xbs-nf-container-1 => magma-container-1}/Dockerfile (100%) rename containers/{xbs-nf-container-2 => magma-container-2}/Dockerfile (100%) diff --git a/containers/build.sh b/containers/build.sh index aa2e5d71..aea21359 100755 --- a/containers/build.sh +++ b/containers/build.sh @@ -3,12 +3,12 @@ set -uex # NOTE: Make sure you've set the environment correctly and are logged in to the registry. -DOCKER_NAMESPACE="rg.nl-ams.scw.cloud/xbs-nf-containers" +DOCKER_NAMESPACE="rg.nl-ams.scw.cloud/magma-containers" -cp ../conda_envs/xbs-nf-env-1.yml ./xbs-nf-container-1 -cp -r ../resources/resistance_db_who ./xbs-nf-container-1 +cp ../conda_envs/magma-env-1.yml ./magma-container-1 +cp -r ../resources/resistance_db_who ./magma-container-1 -cp ../conda_envs/xbs-nf-env-2.yml ./xbs-nf-container-2 +cp ../conda_envs/magma-env-2.yml ./magma-container-2 for container_dir in $(find * -maxdepth 0 -type d); do echo "Building $container_dir ..." 
diff --git a/containers/xbs-nf-container-1/Dockerfile b/containers/magma-container-1/Dockerfile similarity index 100% rename from containers/xbs-nf-container-1/Dockerfile rename to containers/magma-container-1/Dockerfile diff --git a/containers/xbs-nf-container-2/Dockerfile b/containers/magma-container-2/Dockerfile similarity index 100% rename from containers/xbs-nf-container-2/Dockerfile rename to containers/magma-container-2/Dockerfile From d3c1ea660f1b9a16ad4cd34690cd6f299d089388 Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Sun, 15 Jan 2023 15:25:15 +0200 Subject: [PATCH 06/19] Update nextflow config name --- docs/cloud_batch_execution.md | 4 ++++ nextflow.config | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/cloud_batch_execution.md b/docs/cloud_batch_execution.md index e69de29b..d07f6c56 100644 --- a/docs/cloud_batch_execution.md +++ b/docs/cloud_batch_execution.md @@ -0,0 +1,4 @@ + +# Mention Tower Launch + + diff --git a/nextflow.config b/nextflow.config index 2ee3158a..87b7b93f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,10 +1,10 @@ manifest { - name = 'XBS-nf' - description = 'XBS-nf (compleX Bacterial Samples) is a pipeline for comprehensive genomic analyses of Mycobacterium tuberculosis with a focus on clinical decision making as well as research.' - version = '0.9.10' + name = 'MAGMA' + description = 'MAGMA is a pipeline for comprehensive genomic analyses of Mycobacterium tuberculosis with a focus on clinical decision making as well as research.' 
+ version = '1.0.0' author = 'TORCH-Consortium' defaultBranch = 'master' - homePage = 'https://github.com/TORCH-Consortium/xbs-nf' + homePage = 'https://github.com/TORCH-Consortium/MAGMA' } params { includeConfig 'default_params.config' } From bf59b58e4e37b31365769daef009dc76f6af63cb Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Sun, 15 Jan 2023 15:25:24 +0200 Subject: [PATCH 07/19] update sample params file --- params/params.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/params/params.yaml b/params/params.yaml index 8846901d..3fff2c15 100644 --- a/params/params.yaml +++ b/params/params.yaml @@ -3,15 +3,15 @@ # # Samplesheets -input_samplesheet: "/home/xbs-nf-runs/data/EarlyMGIT.10samples.slurm.csv" +input_samplesheet: "/home/magma-runs/data/EarlyMGIT.10samples.slurm.csv" # Main output dir -outdir : "/home/xbs-nf-results-earlymgit10samples" +outdir : "/home/magma-results-earlymgit10samples" # Location for manually created conda-envs -conda_envs_location : "/home/xbs-nf-conda_envs" +conda_envs_location : "/home/magma-conda_envs" # Some boolean options optimize_variant_recalibration : false From aaf877f8b142f6542c38ab563316bf1ff42baf4a Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Sun, 15 Jan 2023 15:27:22 +0200 Subject: [PATCH 08/19] update docker and conda folder for the new name --- conda_envs/magma-env-1.yml | 2 +- conda_envs/magma-env-2.yml | 2 +- containers/magma-container-1/Dockerfile | 4 ++-- containers/magma-container-2/Dockerfile | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conda_envs/magma-env-1.yml b/conda_envs/magma-env-1.yml index 0944ebeb..de3d7b6e 100644 --- a/conda_envs/magma-env-1.yml +++ b/conda_envs/magma-env-1.yml @@ -1,4 +1,4 @@ -name: xbs-nf-env-1 +name: magma-env-1 channels: - conda-forge - bioconda diff --git a/conda_envs/magma-env-2.yml b/conda_envs/magma-env-2.yml index 3433d126..08777bc5 100644 --- a/conda_envs/magma-env-2.yml +++ b/conda_envs/magma-env-2.yml @@ -1,4 +1,4 
@@ -name: xbs-nf-env-2 +name: magma-env-2 channels: - conda-forge - bioconda diff --git a/containers/magma-container-1/Dockerfile b/containers/magma-container-1/Dockerfile index 9aed46c7..ecde5d65 100644 --- a/containers/magma-container-1/Dockerfile +++ b/containers/magma-container-1/Dockerfile @@ -1,9 +1,9 @@ FROM mambaorg/micromamba:0.25.1 AS base #NOTE: The conda env file has been copied via the build script -COPY --chown=$MAMBA_USER:$MAMBA_USER xbs-nf-env-1.yml /tmp/xbs-nf-env-1.yml +COPY --chown=$MAMBA_USER:$MAMBA_USER magma-env-1.yml /tmp/magma-env-1.yml -RUN micromamba install -y -f /tmp/xbs-nf-env-1.yml -n base +RUN micromamba install -y -f /tmp/magma-env-1.yml -n base RUN micromamba install -y -n base conda-forge::procps-ng && micromamba clean -a -y diff --git a/containers/magma-container-2/Dockerfile b/containers/magma-container-2/Dockerfile index fdb4c64d..cd14b60a 100644 --- a/containers/magma-container-2/Dockerfile +++ b/containers/magma-container-2/Dockerfile @@ -1,9 +1,9 @@ FROM mambaorg/micromamba:0.25.1 AS base #NOTE: The conda env file has been copied via the build script -COPY --chown=$MAMBA_USER:$MAMBA_USER xbs-nf-env-2.yml /tmp/xbs-nf-env-2.yml +COPY --chown=$MAMBA_USER:$MAMBA_USER magma-env-2.yml /tmp/magma-env-2.yml -RUN micromamba install -y -f /tmp/xbs-nf-env-2.yml -n base +RUN micromamba install -y -f /tmp/magma-env-2.yml -n base RUN micromamba install -y -n base conda-forge::procps-ng && micromamba clean -a -y From d407e19fc78ac7b80b8bec938e0236ec6b3ee44f Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Sun, 15 Jan 2023 15:34:04 +0200 Subject: [PATCH 09/19] rename xbs to magma in configurations --- conf/conda_local.config | 12 ++++++------ conf/docker.config | 4 ++-- conf/template_noconda.config | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/conf/conda_local.config b/conf/conda_local.config index 3edb4388..af44392f 100644 --- a/conf/conda_local.config +++ b/conf/conda_local.config @@ -11,19 +11,19 @@ process 
{ withName: 'GATK.*|LOFREQ.*|DELLY.*|TBPROFILER.*|MULTIQC.*|FASTQC.*|UTILS.*|FASTQ.*|SAMPLESHEET.*' { //environment does exist: - conda = "${params.conda_envs_location}/xbs-nf-env-1" + conda = "${params.conda_envs_location}/magma-env-1" - //environment does NOT exist: the env file contains all the info to create the environment "xbs-nf-env-1" - //conda = "${params.conda_envs_location}/xbs-nf-env-1.yml" + //environment does NOT exist: the env file contains all the info to create the environment "magma-env-1" + //conda = "${params.conda_envs_location}/magma-env-1.yml" } withName: 'BWA.*|IQTREE.*|SNPDISTS.*|SNPSITES.*|BCFTOOLS.*|BGZIP.*|SAMTOOLS.*|SNPEFF.*|CLUSTERPICKER.*' { //environment does exist: - conda = "${params.conda_envs_location}/xbs-nf-env-2" + conda = "${params.conda_envs_location}/magma-env-2" - //environment does NOT exist: the env file contains all the info to create the environment "xbs-nf-env-2" - //conda = "${params.conda_envs_location}/xbs-nf-env-2.yml" + //environment does NOT exist: the env file contains all the info to create the environment "magma-env-2" + //conda = "${params.conda_envs_location}/magma-env-2.yml" } diff --git a/conf/docker.config b/conf/docker.config index e01588f2..55547ad3 100644 --- a/conf/docker.config +++ b/conf/docker.config @@ -6,12 +6,12 @@ process { withName: 'GATK.*|LOFREQ.*|DELLY.*|TBPROFILER.*|MULTIQC.*|FASTQC.*|UTILS.*|FASTQ.*|SAMPLESHEET.*' { - container = "rg.nl-ams.scw.cloud/xbs-nf-containers/xbs-nf-container-1:0.9.11" + container = "rg.nl-ams.scw.cloud/magma-containers/magma-container-1:1.0.0" } withName: 'BWA.*|IQTREE.*|SNPDISTS.*|SNPSITES.*|BCFTOOLS.*|BGZIP.*|SAMTOOLS.*|SNPEFF.*|CLUSTERPICKER.*' { - container = "rg.nl-ams.scw.cloud/xbs-nf-containers/xbs-nf-container-2:0.9.11" + container = "rg.nl-ams.scw.cloud/magma-containers/magma-container-2:1.0.0" } } diff --git a/conf/template_noconda.config b/conf/template_noconda.config index 3bee85a4..9191d030 100644 --- a/conf/template_noconda.config +++ 
b/conf/template_noconda.config @@ -1,4 +1,4 @@ -//NOTE: Result directories used in XBS_main.py +//NOTE: Result directories used in magma_main.py // os.path.join(args['output_dir'], 'mapped_singles') // os.path.join(args['output_dir'], 'mapped') // os.path.join(args['output_dir'], 'gvcf') @@ -16,13 +16,13 @@ // os.path.join(args['output_dir'], 'vqsr/{}'.format(args['vcf_name'])) // os.path.join(args['output_dir'], 'fasta/{}'.format(args['vcf_name'])) // os.path.join(args['output_dir'], 'phylogeny/{}'.format(args['vcf_name'])) -// os.path.join(args['output_dir'], 'resistance/{}/XBS'.format(args['vcf_name'])) +// os.path.join(args['output_dir'], 'resistance/{}/magma'.format(args['vcf_name'])) // os.path.join(args['output_dir'], 'logs/{}.summary'.format(args['vcf_name'])), 'w') params { - input_samplesheet = "${projectDir}/resources/reference_set/xbs-nf.pbs.test.csv" + input_samplesheet = "${projectDir}/resources/reference_set/magma.pbs.test.csv" outdir = "${projectDir}/results" //----------------------- From 39a888a396b6bbfd6de695bfff12442bd91e5486 Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Sun, 15 Jan 2023 15:36:54 +0200 Subject: [PATCH 10/19] implement renaming in the default params script --- default_params.config | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/default_params.config b/default_params.config index 9e97ee41..e366a279 100644 --- a/default_params.config +++ b/default_params.config @@ -7,14 +7,14 @@ input_samplesheet = "samplesheet.csv" // The directory to which all output files should be written -outdir = "xbs-nf-results" +outdir = "magma-results" // The name of the output VCF file // NOTE: This parameter is used to derive the JOINT_NAME in XBS_main.py vcf_name = "joint" // NOTE: Got little genetic diveristy in your dataset? (.e.g clonal or <20 samples) Switch this option to true to include additional samples. 
-// NOTE: Provide this file: /xbs-nf/resources/exit_rif/ +// NOTE: Provide this file: /magma/resources/exit_rif/ // TODO: Allow this to be sourced via file system location use_ref_exit_rif_gvcf = false @@ -59,7 +59,7 @@ only_validate_fastqs = false // OR true //Use this flag to skip the final merge skip_merge = false -// XBS-nf optimises VQSR, if this messes up use the default settings for VQSR +// MAGMA optimises VQSR, if this messes up use the default settings for VQSR optimize_variant_recalibration = true //FIXME: This might not be needed since we will rely upon present conda_envs and container envs From bd371ce2ba9decc6d1bdc7eeb8d8f5c6198a9ee2 Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Sun, 15 Jan 2023 15:37:12 +0200 Subject: [PATCH 11/19] implement renaming in the resistance summarization script --- bin/summarize_resistance.py | 62 ++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/bin/summarize_resistance.py b/bin/summarize_resistance.py index 0fd1c7cb..1d188e8a 100755 --- a/bin/summarize_resistance.py +++ b/bin/summarize_resistance.py @@ -36,7 +36,7 @@ def add_var_to_df(df, pt_id, drug, var, freq): df.loc[pt_id, drug] += ' & {} ({:.0%})'.format(var_repr, freq) if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Analyse resistance output from XBS Pipeline') + parser = argparse.ArgumentParser(description='Analyse resistance output from magma Pipeline') parser.add_argument('major_res_var_dir', metavar='major_res_var_dir', type=str, help='The directory containing the major variants TBProfiler output files') parser.add_argument('minor_res_var_dir', metavar='minor_res_var_dir', type=str, help='The directory containing the minor variants TBProfiler output files') parser.add_argument('summary_output_dir', metavar='summary_output_dir', type=str, help='The directory where the resulting excel sheets should be placed') @@ -51,7 +51,7 @@ def add_var_to_df(df, pt_id, drug, var, freq): keys = 
file_name.split('.') samples[keys[1]] = {} with open(os.path.join(os.path.join(args['major_res_var_dir'], 'results', file_name))) as json_file: - samples[keys[1]]['xbs'] = json.load(json_file) + samples[keys[1]]['magma'] = json.load(json_file) if os.path.exists(os.path.join(os.path.join(args['minor_res_var_dir'], 'results'))): for file_name in os.listdir(os.path.join(os.path.join(args['minor_res_var_dir'], 'results'))): @@ -69,26 +69,26 @@ def add_var_to_df(df, pt_id, drug, var, freq): for patient, sample in tqdm(samples_df.iterrows(), total=samples_df.shape[0]): sample_res = samples[patient] - pt_df_xbs = pd.DataFrame(columns=['Drug', 'Variant', 'Interpretation', 'Source'] + ['Conclusion {}'.format(patient)] + list([patient])).set_index(['Drug', 'Variant']) + pt_df_magma = pd.DataFrame(columns=['Drug', 'Variant', 'Interpretation', 'Source'] + ['Conclusion {}'.format(patient)] + list([patient])).set_index(['Drug', 'Variant']) pt_df_lof = pd.DataFrame(columns=['Drug', 'Variant', 'Interpretation', 'Source'] + ['Conclusion {}'.format(patient)] + list([patient])).set_index(['Drug', 'Variant']) """ - Add the DR variants from the XBS analysis to the xbs variant dataframe. - Do this after adding the lofreq dr variants to show the XBS variant frequencies. + Add the DR variants from the magma analysis to the magma variant dataframe. + Do this after adding the lofreq dr variants to show the magma variant frequencies. 
""" - for var in sample_res['xbs']['dr_variants']: + for var in sample_res['magma']['dr_variants']: gene = var['gene'] if gene == '.': gene = var['locus_tag'] var_repr = '{}_{}'.format(gene, var['change']) for drug in var['drugs']: drug = drug['drug'].lower().replace(' ', '_') - pt_df_xbs.loc[(drug, var_repr), (patient, 'Interpretation', 'Source')] = ['{:.0%}'.format(var['freq']), 0, 'WHO Catalogue'] + pt_df_magma.loc[(drug, var_repr), (patient, 'Interpretation', 'Source')] = ['{:.0%}'.format(var['freq']), 0, 'WHO Catalogue'] """ - Add the other variants from the XBS analysis to the xbs variant dataframe. + Add the other variants from the magma analysis to the magma variant dataframe. """ - for var in sample_res['xbs']['other_variants']: + for var in sample_res['magma']['other_variants']: gene = var['gene'] if gene == '.': gene = var['locus_tag'] @@ -96,19 +96,19 @@ def add_var_to_df(df, pt_id, drug, var, freq): # Add all the other variants as unknown classification and overwrite their classification later if necessary for drug in var['gene_associated_drugs']: drug = drug.lower().replace(' ', '_') - pt_df_xbs.loc[(drug, var_repr), (patient, 'Interpretation', 'Source')] = ['{:.0%}'.format(var['freq']), 1, 'Tier 1 or 2 gene'] + pt_df_magma.loc[(drug, var_repr), (patient, 'Interpretation', 'Source')] = ['{:.0%}'.format(var['freq']), 1, 'Tier 1 or 2 gene'] # Overwrite the variant classification for drugs which have a WHO sens classification last as to overrule all other classifications if 'annotation' in var: for annotation in var['annotation']: if annotation['type'] == 'resistance_association_confidence' and (int(annotation['confidence']) == 4 or int(annotation['confidence']) == 5): - pt_df_xbs.loc[(annotation['drug'].lower().replace(' ', '_'), var_repr), (patient, 'Interpretation', 'Source')] = ['{:.0%}'.format(var['freq']), 2, 'WHO Catalogue'] + pt_df_magma.loc[(annotation['drug'].lower().replace(' ', '_'), var_repr), (patient, 'Interpretation', 'Source')] = 
['{:.0%}'.format(var['freq']), 2, 'WHO Catalogue'] elif annotation['type'] == 'resistance_association_confidence' and (int(annotation['confidence']) == 3): - pt_df_xbs.loc[(annotation['drug'].lower().replace(' ', '_'), var_repr), (patient, 'Interpretation', 'Source')] = ['{:.0%}'.format(var['freq']), 1, 'WHO Catalogue'] + pt_df_magma.loc[(annotation['drug'].lower().replace(' ', '_'), var_repr), (patient, 'Interpretation', 'Source')] = ['{:.0%}'.format(var['freq']), 1, 'WHO Catalogue'] else: display(annotation) """ - Add the DR variants from the lofreq analysis to the xbs variant dataframe. + Add the DR variants from the lofreq analysis to the magma variant dataframe. """ if 'lofreq' in sample_res: for var in sample_res['lofreq']['dr_variants']: @@ -143,10 +143,10 @@ def add_var_to_df(df, pt_id, drug, var, freq): else: display(annotation) - # Remove all variants in the XBS summary from the lofreq summary - pt_df_lof = pt_df_lof.drop([i for i in pt_df_xbs.index if i in pt_df_lof.index]) - for drug in list(drugs - set([i[0] for i in pt_df_xbs.index.values])): - pt_df_xbs.loc[(drug, 'No variants found'), ('Interpretation')] = 2 + # Remove all variants in the magma summary from the lofreq summary + pt_df_lof = pt_df_lof.drop([i for i in pt_df_magma.index if i in pt_df_lof.index]) + for drug in list(drugs - set([i[0] for i in pt_df_magma.index.values])): + pt_df_magma.loc[(drug, 'No variants found'), ('Interpretation')] = 2 for drug in list(drugs - set([i[0] for i in pt_df_lof.index.values])): pt_df_lof.loc[(drug, 'No variants found'), ('Interpretation')] = 2 @@ -155,16 +155,16 @@ def add_var_to_df(df, pt_id, drug, var, freq): Calculate the conclusion for all samples """ for _, sample in samples_df.loc[[patient]].iterrows(): - work_df = pt_df_xbs[pd.notna(pt_df_xbs[patient])] - for drug in pt_df_xbs.index.levels[0]: + work_df = pt_df_magma[pd.notna(pt_df_magma[patient])] + for drug in pt_df_magma.index.levels[0]: if drug not in set([i[0] for i in work_df.index.values]): 
- pt_df_xbs.loc[drug, 'Conclusion {}'.format(patient)] = 2 + pt_df_magma.loc[drug, 'Conclusion {}'.format(patient)] = 2 elif 0 in work_df.loc[drug, 'Interpretation'].value_counts(): - pt_df_xbs.loc[drug, 'Conclusion {}'.format(patient)] = 0 + pt_df_magma.loc[drug, 'Conclusion {}'.format(patient)] = 0 elif 1 in work_df.loc[drug, 'Interpretation'].value_counts(): - pt_df_xbs.loc[drug, 'Conclusion {}'.format(patient)] = 1 + pt_df_magma.loc[drug, 'Conclusion {}'.format(patient)] = 1 else: - pt_df_xbs.loc[drug, 'Conclusion {}'.format(patient)] = 2 + pt_df_magma.loc[drug, 'Conclusion {}'.format(patient)] = 2 for _, sample in samples_df.loc[[patient]].iterrows(): work_df = pt_df_lof[pd.notna(pt_df_lof[patient])] for drug in pt_df_lof.index.levels[0]: @@ -178,9 +178,9 @@ def add_var_to_df(df, pt_id, drug, var, freq): pt_df_lof.loc[drug, 'Conclusion {}'.format(patient)] = 2 x2 = pt_df_lof.copy(deep=True) - pt_df_xbs = pt_df_xbs.reset_index().sort_values(['Conclusion {}'.format(patient)] + ['Drug', 'Interpretation', 'Variant']) - for column in [i for i in pt_df_xbs if 'Conclusion' in i or 'Interpretation' == i]: - pt_df_xbs[column] = pt_df_xbs[column].apply(lambda c: class_map[c]) + pt_df_magma = pt_df_magma.reset_index().sort_values(['Conclusion {}'.format(patient)] + ['Drug', 'Interpretation', 'Variant']) + for column in [i for i in pt_df_magma if 'Conclusion' in i or 'Interpretation' == i]: + pt_df_magma[column] = pt_df_magma[column].apply(lambda c: class_map[c]) pt_df_lof = pt_df_lof.reset_index().sort_values(['Conclusion {}'.format(patient)] + ['Drug', 'Interpretation', 'Variant']) for column in [i for i in pt_df_lof if 'Conclusion' in i or 'Interpretation' == i]: pt_df_lof[column] = pt_df_lof[column].apply(lambda c: class_map[c]) @@ -188,7 +188,7 @@ def add_var_to_df(df, pt_id, drug, var, freq): """ Write both sheets to excel with formatting""" with pd.ExcelWriter(os.path.join(summary_dir, '{}.xlsx'.format(patient)), engine='xlsxwriter') as writer: - 
pt_df_xbs.set_index(['Drug'] + ['Conclusion {}'.format(patient)] + ['Variant']).to_excel(writer, sheet_name=major_variants_sheet_name) + pt_df_magma.set_index(['Drug'] + ['Conclusion {}'.format(patient)] + ['Variant']).to_excel(writer, sheet_name=major_variants_sheet_name) pt_df_lof.set_index(['Drug'] + ['Conclusion {}'.format(patient)] + ['Variant']).to_excel(writer, sheet_name=minor_variants_sheet_name) #unclassified.reset_index().sort_values(by=['Conclusion', 'Drug', 'Interpretation', 'Variant']).set_index(['Drug', 'Conclusion', 'Variant']).to_excel(writer, sheet_name='Unclassified Variants') @@ -200,10 +200,10 @@ def add_var_to_df(df, pt_id, drug, var, freq): # Add formatting to Variants sheet for i in range(samples_df.loc[[patient]].shape[0]): - writer.sheets[major_variants_sheet_name].conditional_format('{}1:{}{}'.format(alphabet[1+i], alphabet[1+i], pt_df_xbs.shape[0]+1), cond_res) - writer.sheets[major_variants_sheet_name].conditional_format('{}1:{}{}'.format(alphabet[1+i], alphabet[1+i], pt_df_xbs.shape[0]+1), cond_sens) - writer.sheets[major_variants_sheet_name].conditional_format('{}1:{}{}'.format(alphabet[2+samples_df.loc[[patient]].shape[0]], alphabet[2+samples_df.loc[[patient]].shape[0]], pt_df_xbs.shape[0]+1), cond_res) - writer.sheets[major_variants_sheet_name].conditional_format('{}1:{}{}'.format(alphabet[2+samples_df.loc[[patient]].shape[0]], alphabet[2+samples_df.loc[[patient]].shape[0]], pt_df_xbs.shape[0]+1), cond_sens) + writer.sheets[major_variants_sheet_name].conditional_format('{}1:{}{}'.format(alphabet[1+i], alphabet[1+i], pt_df_magma.shape[0]+1), cond_res) + writer.sheets[major_variants_sheet_name].conditional_format('{}1:{}{}'.format(alphabet[1+i], alphabet[1+i], pt_df_magma.shape[0]+1), cond_sens) + writer.sheets[major_variants_sheet_name].conditional_format('{}1:{}{}'.format(alphabet[2+samples_df.loc[[patient]].shape[0]], alphabet[2+samples_df.loc[[patient]].shape[0]], pt_df_magma.shape[0]+1), cond_res) + 
writer.sheets[major_variants_sheet_name].conditional_format('{}1:{}{}'.format(alphabet[2+samples_df.loc[[patient]].shape[0]], alphabet[2+samples_df.loc[[patient]].shape[0]], pt_df_magma.shape[0]+1), cond_sens) # Add formatting to Lofreq variants sheet for i in range(samples_df.loc[[patient]].shape[0]): From 5c2b08ce7ec92ed4df1be3dcba51a761d2376a64 Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Sun, 15 Jan 2023 15:40:10 +0200 Subject: [PATCH 12/19] add pages for usage and presentations --- README.md | 4 ++-- docs/presentations.md | 0 docs/usage.md | 0 3 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 docs/presentations.md create mode 100644 docs/usage.md diff --git a/README.md b/README.md index 4cc0a36d..d63100a7 100644 --- a/README.md +++ b/README.md @@ -17,9 +17,9 @@ MAGMA (**M**aximum **A**ccessible **G**enome for **M**tb **A**nalysis) is a pipe > **Note** > Downloading the reference EXIT_RIF GVCF files from FIXME -# Usage and Tutorial +# Tutorials and Presentations -For the usage and tutorials please refer the [docs](./docs) folder. +For the [tutorials](./docs/tutorials.md) and [presentations](./docs/presentations.md) please refer to the [docs](./docs) folder. ## Prerequisites diff --git a/docs/presentations.md b/docs/presentations.md new file mode 100644 index 00000000..e69de29b diff --git a/docs/usage.md b/docs/usage.md new file mode 100644 index 00000000..e69de29b From 9d77c85996d6d11f9fa5e5e781cb0c68932588b2 Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Sun, 15 Jan 2023 16:26:55 +0200 Subject: [PATCH 13/19] update the build number --- containers/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/containers/build.sh b/containers/build.sh index aea21359..83e99b52 100755 --- a/containers/build.sh +++ b/containers/build.sh @@ -13,7 +13,7 @@ cp ../conda_envs/magma-env-2.yml ./magma-container-2 for container_dir in $(find * -maxdepth 0 -type d); do echo "Building $container_dir ..." 
cd $container_dir - CONTAINER_TAG=0.9.10 + CONTAINER_TAG=1.0.0 CONTAINER_NAME=$DOCKER_NAMESPACE/$container_dir:$CONTAINER_TAG echo "Container Name : $CONTAINER_NAME " docker build -t $CONTAINER_NAME . From 176a568e74f31311aeb0a1c7f0e73f4bf58044a3 Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Sun, 15 Jan 2023 17:28:20 +0200 Subject: [PATCH 14/19] update the region --- containers/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/containers/build.sh b/containers/build.sh index 83e99b52..72abe374 100755 --- a/containers/build.sh +++ b/containers/build.sh @@ -3,7 +3,7 @@ set -uex # NOTE: Make sure you've set the environment correctly and are logged in to the registry. -DOCKER_NAMESPACE="rg.nl-ams.scw.cloud/magma-containers" +DOCKER_NAMESPACE="rg.fr-par.scw.cloud/magma-containers" cp ../conda_envs/magma-env-1.yml ./magma-container-1 cp -r ../resources/resistance_db_who ./magma-container-1 From cb358e0632d82948f813ff189f6f5901c24b842a Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Sun, 15 Jan 2023 17:30:15 +0200 Subject: [PATCH 15/19] update the region --- conf/docker.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/docker.config b/conf/docker.config index 55547ad3..121dbc84 100644 --- a/conf/docker.config +++ b/conf/docker.config @@ -6,12 +6,12 @@ process { withName: 'GATK.*|LOFREQ.*|DELLY.*|TBPROFILER.*|MULTIQC.*|FASTQC.*|UTILS.*|FASTQ.*|SAMPLESHEET.*' { - container = "rg.nl-ams.scw.cloud/magma-containers/magma-container-1:1.0.0" + container = "rg.fr-par.scw.cloud/magma-containers/magma-container-1:1.0.0" } withName: 'BWA.*|IQTREE.*|SNPDISTS.*|SNPSITES.*|BCFTOOLS.*|BGZIP.*|SAMTOOLS.*|SNPEFF.*|CLUSTERPICKER.*' { - container = "rg.nl-ams.scw.cloud/magma-containers/magma-container-2:1.0.0" + container = "rg.fr-par.scw.cloud/magma-containers/magma-container-2:1.0.0" } } From 0cdf5cf2e7a64809af78efd741fe9c5446bd65ab Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Sun, 15 Jan 2023 19:23:10 +0200 Subject: 
[PATCH 16/19] add labels for the cpus and memory --- modules/bgzip/bgzip.nf | 1 + modules/bwa/mem.nf | 1 + modules/fastqc/fastqc.nf | 1 + modules/gatk/apply_bqsr.nf | 1 + modules/gatk/base_recalibrator.nf | 1 + modules/gatk/collect_wgs_metrics.nf | 1 + modules/gatk/flag_stat.nf | 1 + modules/gatk/haplotype_caller.nf | 1 + .../gatk/haplotype_caller__minor_variants.nf | 1 + modules/gatk/mark_duplicates.nf | 1 + modules/lofreq/call.nf | 1 + modules/lofreq/indelqual.nf | 1 + modules/multiqc/multiqc.nf | 1 + modules/samtools/index.nf | 1 + modules/samtools/merge.nf | 1 + modules/samtools/stats.nf | 1 + nextflow.config | 87 ++++--------------- 17 files changed, 32 insertions(+), 71 deletions(-) diff --git a/modules/bgzip/bgzip.nf b/modules/bgzip/bgzip.nf index 93d671ba..dd348e71 100644 --- a/modules/bgzip/bgzip.nf +++ b/modules/bgzip/bgzip.nf @@ -1,5 +1,6 @@ process BGZIP { tag "${name}" + label 'cpu_low_memory_low' publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish input: diff --git a/modules/bwa/mem.nf b/modules/bwa/mem.nf index abca4ae0..661dbabb 100644 --- a/modules/bwa/mem.nf +++ b/modules/bwa/mem.nf @@ -1,5 +1,6 @@ process BWA_MEM { tag "${sampleName}" + label 'cpu_high_memory_high' publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish input: diff --git a/modules/fastqc/fastqc.nf b/modules/fastqc/fastqc.nf index 400302b9..8a711b3f 100644 --- a/modules/fastqc/fastqc.nf +++ b/modules/fastqc/fastqc.nf @@ -1,5 +1,6 @@ process FASTQC { tag "${sampleName}" + label 'cpu_low_memory_low' publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish input: diff --git a/modules/gatk/apply_bqsr.nf b/modules/gatk/apply_bqsr.nf index c6fb798a..9099886d 100644 --- a/modules/gatk/apply_bqsr.nf +++ b/modules/gatk/apply_bqsr.nf @@ -1,5 +1,6 @@ process GATK_APPLY_BQSR { tag "$sampleName" + label 'cpu_low_memory_low' publishDir params.results_dir, mode: params.save_mode, enabled: 
params.should_publish input: diff --git a/modules/gatk/base_recalibrator.nf b/modules/gatk/base_recalibrator.nf index d374e186..20a18a9a 100644 --- a/modules/gatk/base_recalibrator.nf +++ b/modules/gatk/base_recalibrator.nf @@ -1,5 +1,6 @@ process GATK_BASE_RECALIBRATOR { tag "$sampleName" + label 'cpu_low_memory_low' publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish diff --git a/modules/gatk/collect_wgs_metrics.nf b/modules/gatk/collect_wgs_metrics.nf index 173f2acd..d4317570 100644 --- a/modules/gatk/collect_wgs_metrics.nf +++ b/modules/gatk/collect_wgs_metrics.nf @@ -1,5 +1,6 @@ process GATK_COLLECT_WGS_METRICS { tag "${sampleName}" + label 'cpu_low_memory_low' publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish input: diff --git a/modules/gatk/flag_stat.nf b/modules/gatk/flag_stat.nf index 2d692d55..9d7308cf 100644 --- a/modules/gatk/flag_stat.nf +++ b/modules/gatk/flag_stat.nf @@ -1,5 +1,6 @@ process GATK_FLAG_STAT { tag "${sampleName}" + label 'cpu_low_memory_low' publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish input: diff --git a/modules/gatk/haplotype_caller.nf b/modules/gatk/haplotype_caller.nf index 53f7c764..6ec6e57b 100644 --- a/modules/gatk/haplotype_caller.nf +++ b/modules/gatk/haplotype_caller.nf @@ -1,5 +1,6 @@ process GATK_HAPLOTYPE_CALLER { tag "$sampleName" + label 'cpu_medium_memory_medium' publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish diff --git a/modules/gatk/haplotype_caller__minor_variants.nf b/modules/gatk/haplotype_caller__minor_variants.nf index 6215b232..38388e03 100644 --- a/modules/gatk/haplotype_caller__minor_variants.nf +++ b/modules/gatk/haplotype_caller__minor_variants.nf @@ -1,5 +1,6 @@ process GATK_HAPLOTYPE_CALLER__MINOR_VARIANTS { tag "$sampleName" + label 'cpu_medium_memory_medium' publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish diff --git 
a/modules/gatk/mark_duplicates.nf b/modules/gatk/mark_duplicates.nf index 5cf9a907..3792d300 100644 --- a/modules/gatk/mark_duplicates.nf +++ b/modules/gatk/mark_duplicates.nf @@ -1,5 +1,6 @@ process GATK_MARK_DUPLICATES { tag "$sampleName" + label 'cpu_low_memory_high' publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish input: diff --git a/modules/lofreq/call.nf b/modules/lofreq/call.nf index 4d171566..57683ee0 100644 --- a/modules/lofreq/call.nf +++ b/modules/lofreq/call.nf @@ -1,5 +1,6 @@ process LOFREQ_CALL { tag "${sampleName}" + label 'cpu_high_memory_medium' publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish input: diff --git a/modules/lofreq/indelqual.nf b/modules/lofreq/indelqual.nf index aa490b47..c5383331 100644 --- a/modules/lofreq/indelqual.nf +++ b/modules/lofreq/indelqual.nf @@ -1,5 +1,6 @@ process LOFREQ_INDELQUAL { tag "${sampleName}" + label 'cpu_low_memory_low' publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish input: diff --git a/modules/multiqc/multiqc.nf b/modules/multiqc/multiqc.nf index cf44a179..9c278045 100644 --- a/modules/multiqc/multiqc.nf +++ b/modules/multiqc/multiqc.nf @@ -1,5 +1,6 @@ process MULTIQC { publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish + label 'cpu_low_memory_low' input: path("*") diff --git a/modules/samtools/index.nf b/modules/samtools/index.nf index 90ddceac..92afa8dd 100644 --- a/modules/samtools/index.nf +++ b/modules/samtools/index.nf @@ -1,5 +1,6 @@ process SAMTOOLS_INDEX { tag "${sampleName}" + label 'cpu_low_memory_low' publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish input: diff --git a/modules/samtools/merge.nf b/modules/samtools/merge.nf index 7937effc..514382ef 100644 --- a/modules/samtools/merge.nf +++ b/modules/samtools/merge.nf @@ -1,5 +1,6 @@ process SAMTOOLS_MERGE { tag "${sampleName}" + label 'cpu_medium_memory_medium' 
publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish input: diff --git a/modules/samtools/stats.nf b/modules/samtools/stats.nf index c3a9070c..5c8edc3f 100644 --- a/modules/samtools/stats.nf +++ b/modules/samtools/stats.nf @@ -1,5 +1,6 @@ process SAMTOOLS_STATS { tag "${sampleName}" + label 'cpu_low_memory_low' publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish input: diff --git a/nextflow.config b/nextflow.config index 87b7b93f..0da68f49 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,89 +14,34 @@ process { cpus = 4 memory = 4.GB - withName: 'SAMPLESHEET_VALIDATION' { - //NOTE: If this process fails, terminate the pipeline execution - errorStrategy = 'terminate' - } - - withName: 'FASTQC' { - cpus = 2 - memory = 2.GB - } - - withName: 'BWA_MEM' { - cpus = 8 - memory = 8.GB - } - - withName: 'SAMTOOLS_MERGE' { - cpus = 4 - memory = 2.GB - } - - withName: 'MULTIQC' { - cpus = 2 - memory = 2.GB - } - - withName: 'GATK_MARK_DUPLICATES' { + withLabel: 'cpu_low_memory_low' { cpus = 2 - memory = 16.GB + memory = 2.GB } - withName: 'GATK_BASE_RECALIBRATOR' { + withLabel: 'cpu_low_memory_high' { cpus = 2 - memory = 2.GB + memory = 16.GB } - withName: 'GATK_APPLY_BQSR' { - cpus = 2 - memory = 2.GB - } - - withName: 'LOFREQ_INDELQUAL' { - cpus = 2 - memory = 2.GB - } - - withName: 'GATK_FLAG_STAT' { - cpus = 2 - memory = 2.GB - } - - withName: 'GATK_COLLECT_WGS_METRICS' { - cpus = 2 - memory = 2.GB - } - - withName: 'SAMTOOLS_STATS' { - cpus = 2 - memory = 2.GB - } - - withName: 'SAMTOOLS_INDEX.*' { - cpus = 2 - memory = 2.GB - } - - withName: 'DELLY_CALL' { - cpus = 2 - memory = 2.GB + withLabel: 'cpu_medium_memory_medium' { + cpus = 4 + memory = 4.GB } - withName: 'GATK_HAPLOTYPE_CALLER.*' { - cpus = 2 - memory = 4.GB + withLabel: 'cpu_high_memory_high' { + cpus = 8 + memory = 8.GB } - - withName: 'LOFREQ_CALL.*' { + + withLabel: 'cpu_high_memory_medium' { cpus = 8 - memory = 2.GB + memory = 4.GB 
} - withName: 'BGZIP' { - cpus = 1 - memory = 2.GB + withName: 'SAMPLESHEET_VALIDATION' { + //NOTE: If this process fails, terminate the pipeline execution + errorStrategy = 'terminate' } } From 12b320a7dbd0f3316e464764e2c700fbe4cdc8f3 Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Sun, 15 Jan 2023 21:16:00 +0200 Subject: [PATCH 17/19] iterate on documentation --- default_params.config | 3 ++- docs/conda_execution.md | 40 ++++++++++++++++++++++++-------- docs/docker_execution.md | 49 ++++++++++++++++++++++++++++++++++++++++ docs/hpc_execution.md | 25 ++++++++++++++++++++ docs/presentations.md | 6 +++++ docs/usage.md | 7 ++++++ 6 files changed, 120 insertions(+), 10 deletions(-) diff --git a/default_params.config b/default_params.config index e366a279..4ef4eff1 100644 --- a/default_params.config +++ b/default_params.config @@ -76,6 +76,7 @@ compute_minor_variants = false // ##### SPECIFIC PATHS AND PARAMETERS ##### +//NOTE: It is best not to change these parameters and to rely upon the provided reference files ref_fasta_basename = "NC-000962-3-H37Rv" ref_fasta_dir = "${projectDir}/resources/genome" ref_fasta_dict = "${params.ref_fasta_dir}/${params.ref_fasta_basename}.dict" @@ -150,7 +151,7 @@ fastq_validator_path = "fastq_validator.sh" //FIXME Move all publishDir related content to config using the withName -// Control the global publishing behavior +//NOTE: Control the global publishing behavior, which is used as the default in case there is no process-specific config provided save_mode = 'symlink' should_publish = true diff --git a/docs/conda_execution.md b/docs/conda_execution.md index bb339e67..017b60ee 100644 --- a/docs/conda_execution.md +++ b/docs/conda_execution.md @@ -1,7 +1,25 @@ +## Conda based execution -> **NOTE**: The conda environments are expected by the `conda_local` profile of the pipeline, to be created within `MAGMA/conda_envs` directory +You can run the MAGMA pipeline using the Conda based package manager to install all the prerequisite
softwares. -- Clone the pipeline locally and `cd` into it +The `conda` environments are expected by the `conda_local` profile of the pipeline, to be created within `MAGMA/conda_envs` directory + +> **NOTE** +> If you do have access to Singularity or Podman, then owing to their compatibility with Docker, you can still use the MAGMA Docker containers mentioned in [docker.config](../conf/docker.config). + + +You can use the `conda` based setup for the pipeline for running MAGMA +- On a local linux machine (e.g. your laptop or university server) +- On an HPC cluster in case you don't have access to container systems like Singularity, Podman or Docker + + +### Steps to setup the pipeline locally + +> **NOTE** +> These steps are only necessary if you don't have access to any container system, in which case you need to install all the software using the `conda` package manager. + + +1. Copy the environment files from [conda_envs](../conda_envs) folder locally ```sh $ git clone https://github.com/TORCH-Consortium/MAGMA @@ -10,7 +28,12 @@ $ cd MAGMA ``` -- `cd` in the `conda_envs` folder and execute the following commands +2. `cd` into the `conda_envs` folder and execute the following commands to create the environments + +> **TIP** +> 1. For faster installation process, please download [mamba](https://github.com/mamba-org/mamba) tool and replace `conda` with `mamba` in the commands below. +> 2. The path `-p` should be customized as per your setup + ```sh $ conda env create -p magma-env-1 --file magma-env-1.yml @@ -18,21 +41,20 @@ $ conda env create -p magma-env-1 --file magma-env-1.yml $ conda env create -p magma-env-2 --file magma-env-2.yml ``` -> TIP: For faster installation process, please download [mamba](https://github.com/mamba-org/mamba) tool and replace `conda` with `mamba` in the above commands. ### Run the pipeline -- Customize the pipeline and process level settings in the [default_params](./default_params.config) file +1.
Customize the pipeline and process level settings in the [default_params](../default_params.config) file -- From inside the `magma` folder, invoke the pipeline +2. From inside the `MAGMA` folder, invoke the pipeline ```sh -$ nextflow run main.nf -profile conda +$ nextflow run main.nf -profile conda_local ``` -- use the ```-resume``` flag to continue from previously generated output files, rather than starting from scratch. +3. Use the `-resume` flag to continue from previously generated output files, rather than starting from scratch. ```sh -$ nextflow run main.nf -profile conda -resume +$ nextflow run main.nf -profile conda_local -resume ``` diff --git a/docs/docker_execution.md b/docs/docker_execution.md index e69de29b..9368295c 100644 --- a/docs/docker_execution.md +++ b/docs/docker_execution.md @@ -0,0 +1,49 @@ +NOTE: Since we want to test more from the user-perspective, I've started to use the `-params-file` to capture pipeline parameters and the `-c` for a custom config file that anyone could rely upon. + + +Here's the command which should be used + +```console +nextflow run 'https://github.com/TORCH-Consortium/MAGMA' \ + -name experiment-analysis-1 \ + -params-file params.yml \ + -profile conda_local \ + -c custom.config \ + -r v1.0.0 +``` + +You could use `-r` option of Nextflow for working with any specific version/branch of the pipeline. + +--------- + +And here are the contents of the following files + +- `experiment-name.yml` => You could name it as per your convenience etc + +```yaml +input_samplesheet: "/full/path/to/samplesheet.csv" +outdir : "/full/path/to/magma-results" +optimize_variant_recalibration : false +compute_minor_variants : true +dataset_is_not_contaminated : true +conda_envs_location : "/home/magma-runs/magma/conda_envs" +``` + +- `custom.config` => Ideally this file should only contain hardware level configurations + +```nextflow + +process { + errorStrategy = { task.attempt < 3 ? 
'retry' : 'ignore' } + + + time = '1h' + cpus = 8 + memory = 8.GB + + withName:FASTQ_VALIDATOR { + cpus = 2 + memory = 4.GB + } +} +``` diff --git a/docs/hpc_execution.md b/docs/hpc_execution.md index e69de29b..4609497f 100644 --- a/docs/hpc_execution.md +++ b/docs/hpc_execution.md @@ -0,0 +1,25 @@ + +- `custom.config` => Ideally should only contain hardware level configurations + +```nextflow +executor { + //queueSize = 1 + pollInterval = '5sec' +} + +process { + + executor = "slurm" + errorStrategy = { task.attempt < 3 ? 'retry' : 'ignore' } + + + time = '1h' + cpus = 8 + memory = 8.GB + + withName:FASTQ_VALIDATOR { + cpus = 2 + memory = 4.GB + } +} +``` diff --git a/docs/presentations.md b/docs/presentations.md index e69de29b..a3e9cf27 100644 --- a/docs/presentations.md +++ b/docs/presentations.md @@ -0,0 +1,6 @@ +1. TODO: Add Tim's explanation video for the pipeline + +2. Mention the poster of the pipeline in ESM conference, Italy + +3. + diff --git a/docs/usage.md b/docs/usage.md index e69de29b..0c151bf4 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -0,0 +1,7 @@ +A number of customizations are available + + +## Providing custom parameters + + +## Providing custom configurations From 72272f15355af6dd252c682dcf9a454d67c55cab Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Wed, 18 Jan 2023 12:16:44 +0200 Subject: [PATCH 18/19] iterate on documentation --- docs/conda_execution.md | 4 +--- docs/docker_execution.md | 18 ++++++++++++------ docs/hpc_execution.md | 5 +++++ 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/docs/conda_execution.md b/docs/conda_execution.md index 017b60ee..c4d16162 100644 --- a/docs/conda_execution.md +++ b/docs/conda_execution.md @@ -51,10 +51,8 @@ $ conda env create -p magma-env-2 --file magma-env-2.yml ```sh $ nextflow run main.nf -profile conda_local ``` -3. Use the `-resume` flag to continue from previously generated output files, rather than starting from scratch. +3. 
Use the `-resume` flag to continue from previously generated output files, rather than restarting the entire analysis. ```sh $ nextflow run main.nf -profile conda_local -resume ``` - - diff --git a/docs/docker_execution.md b/docs/docker_execution.md index 9368295c..e1cf808a 100644 --- a/docs/docker_execution.md +++ b/docs/docker_execution.md @@ -1,12 +1,19 @@ -NOTE: Since we want to test more from the user-perspective, I've started to use the `-params-file` to capture pipeline parameters and the `-c` for a custom config file that anyone could rely upon. +## Conda based execution + +You can run the MAGMA pipeline using the Conda based package manager to install all the prerequisite softwares. + +The `conda` environments are expected by the `conda_local` profile of the pipeline, to be created within `MAGMA/conda_envs` directory + +> **NOTE** +> If you do have access to Singularity or Podman, then owing to their compatibility with Docker, you can still use the MAGMA Docker containers mentioned [docker.config](../conf/docker.config). Here's the command which should be used ```console nextflow run 'https://github.com/TORCH-Consortium/MAGMA' \ - -name experiment-analysis-1 \ - -params-file params.yml \ + -name experiment-1 \ + -params-file experiment-1.yml \ -profile conda_local \ -c custom.config \ -r v1.0.0 @@ -18,7 +25,7 @@ You could use `-r` option of Nextflow for working with any specific version/bran And here are the contents of the following files -- `experiment-name.yml` => You could name it as per your convenience etc +- `experiment-1.yml` => You could name it as per your convenience. 
Here's a sample params yaml file ```yaml input_samplesheet: "/full/path/to/samplesheet.csv" @@ -29,14 +36,13 @@ dataset_is_not_contaminated : true conda_envs_location : "/home/magma-runs/magma/conda_envs" ``` -- `custom.config` => Ideally this file should only contain hardware level configurations +- `custom.config` => Ideally this file should only contain hardware level configurations such as ```nextflow process { errorStrategy = { task.attempt < 3 ? 'retry' : 'ignore' } - time = '1h' cpus = 8 memory = 8.GB diff --git a/docs/hpc_execution.md b/docs/hpc_execution.md index 4609497f..42d0bc78 100644 --- a/docs/hpc_execution.md +++ b/docs/hpc_execution.md @@ -1,3 +1,8 @@ +## HPC based execution + +You can run the MAGMA pipeline on your HPC cluster and install the prerequisite softwares using either +- `conda` +- Container system like `docker` - `custom.config` => Ideally should only contain hardware level configurations From 1f1bdf2ee2822ad86434dd7291bf6dffb07ea27c Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Wed, 18 Jan 2023 12:46:49 +0200 Subject: [PATCH 19/19] rename xbs -> magma in comments --- .gitignore | 6 +++--- bin/multiple_infection_filter.py | 2 +- bin/reformat_lofreq.py | 2 +- bin/samplesheet_validation.py | 2 +- nextflow_schema.template.json | 2 +- workflows/merge_wf.nf | 8 ++------ workflows/minor_variant_analysis_wf.nf | 2 +- 7 files changed, 10 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index 30066a33..cdc82531 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,7 @@ data/test_data/ resources/exit_rif/ -containers/xbs-nf-container-1/resistance_db_who +containers/magma-container-1/resistance_db_who */**/*fastq.gz */**/*fasta @@ -22,9 +22,9 @@ data/full_data results* *.nextflow* -conda_envs/xbs-nf-env* +conda_envs/magma-env* containers/**/*yml samplesheet.csv -xbs-nf.sh +magma.sh .Rproj.user diff --git a/bin/multiple_infection_filter.py b/bin/multiple_infection_filter.py index 60206b4e..2bf24476 100755 --- 
a/bin/multiple_infection_filter.py +++ b/bin/multiple_infection_filter.py @@ -7,7 +7,7 @@ import argparse if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Analyse resistance output from XBS Pipeline') + parser = argparse.ArgumentParser(description='Analyse resistance output from MAGMA pipeline') parser.add_argument('indir', metavar='indir', type=str, help='The directory containing the LoFreq TBProfiler output') parser.add_argument('relative_abundance_threshold', metavar='relative_abundance_threshold', type=float, help='Minimum relative abundance of the majority strain required to process the sample') diff --git a/bin/reformat_lofreq.py b/bin/reformat_lofreq.py index 3e12f80b..eb3b8c6d 100755 --- a/bin/reformat_lofreq.py +++ b/bin/reformat_lofreq.py @@ -32,7 +32,7 @@ def write_vcf(filename, df, header): df.to_csv(vcf, sep='\t', index=False) if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Analyse resistance output from XBS Pipeline') + parser = argparse.ArgumentParser(description='Analyse resistance output from the MAGMA pipeline') parser.add_argument('lofreq_vcf_file', metavar='lofreq_vcf_file', type=str, help='The input lofreq vcf file') parser.add_argument('lofreq_sample_name', metavar='lofreq_sample_name', type=str, help='The sample name') parser.add_argument('outfile', metavar='outfile', type=str, help='The name of the output VCF file') diff --git a/bin/samplesheet_validation.py b/bin/samplesheet_validation.py index bd5f5095..f613522c 100755 --- a/bin/samplesheet_validation.py +++ b/bin/samplesheet_validation.py @@ -7,7 +7,7 @@ from sys import exit -parser = argparse.ArgumentParser(description='Run the XBS Pipeline') +parser = argparse.ArgumentParser(description='Run the MAGMA pipeline samplesheet validation') parser.add_argument('input_file', metavar='input_file', type=str, help='The input sample file') args = vars(parser.parse_args()) diff --git a/nextflow_schema.template.json b/nextflow_schema.template.json 
index 3ca70e62..2c1352c7 100644 --- a/nextflow_schema.template.json +++ b/nextflow_schema.template.json @@ -1,7 +1,7 @@ { "$schema": "http://json-schema.org/draft-07/schema", "$id": "", - "title": "XBS-nf pipeline parameters", + "title": "MAGMA pipeline parameters", "description": "", "type": "object", "definitions": { diff --git a/workflows/merge_wf.nf b/workflows/merge_wf.nf index 39f4f40d..4d8f2c6b 100644 --- a/workflows/merge_wf.nf +++ b/workflows/merge_wf.nf @@ -31,14 +31,12 @@ workflow MERGE_WF { .map { row -> [ row.first() ] } .collect() .dump(tag:'MERGE_WF: approved_samples_minor_variants_ch', pretty: true) - /* .view {"\n\n XBS-NF-LOG approved_samples_minor_variants_ch : $it \n\n"} */ //NOTE: Reshape the flattened output of gvch_ch into the tuples of [sampleName, gvcf, gvcf.tbi] collated_gvcfs_ch = gvcf_ch .flatten() .collate(3) .dump(tag:'MERGE_WF: collated_gvcfs_ch', pretty: true) - /* .view {"\n\n XBS-NF-LOG collated_gvcfs_ch : $it \n\n"} */ //.collectFile(name: "$params.outdir/collated_gvcfs_ch.txt") @@ -59,14 +57,12 @@ workflow MERGE_WF { /* approved_call_wf_samples_ch */ /* .collect() */ /* .dump(tag:'approved_call_wf_samples_ch.collect()') */ - /* .view {"\n\n XBS-NF-LOG approved_call_wf_samples_ch.collect() : $it \n\n"} */ //NOTE: Join the approved samples from MINOR_VARIANT_ANALYSIS_WF and CALL_WF fully_approved_samples_ch = approved_samples_minor_variants_ch .join(approved_call_wf_samples_ch) .flatten() .dump(tag:'MERGE_WF: fully_approved_samples_ch', pretty: true) - /* .view {"\n\n XBS-NF-LOG fully_approved_samples_ch : $it \n\n"} */ //.collect() //.collectFile(name: "$params.outdir/approved_samples_ch.txt") @@ -124,7 +120,7 @@ workflow MERGE_WF { inccomplex_prefix_ch = Channel.of('ExDR.IncComplex') //NOTE: Both phylogenies should be excluding DR and excluding rRNA, then it is again filtered in two datasets one including complex regions and one excluding complex regions. - //Ergo PHYLOGENY_...__INCCOMPLEX should take snp_exc_vcf_ch. 
Refer https://github.com/TORCH-Consortium/xbs-nf/pull/114#discussion_r947732253 + //Ergo PHYLOGENY_...__INCCOMPLEX should take snp_exc_vcf_ch. Refer https://github.com/TORCH-Consortium/MAGMA/pull/114#discussion_r947732253 PHYLOGENY_ANALYSIS__INCCOMPLEX(inccomplex_prefix_ch, inccomplex_exclude_interval_ref_ch, SNP_ANALYSIS.out.snp_exc_vcf_ch) @@ -142,7 +138,7 @@ workflow MERGE_WF { .dump(tag:'MERGE_WF: excomplex_exclude_interval_ref_ch', pretty: true) - // excomplex_exclude_interval_ref_ch.view{ it -> "\n\n XBS-NF-LOG MERGE_WF excomplex_exclude_interval_ref_ch: $it \n\n"} + // excomplex_exclude_interval_ref_ch.view{ it -> "\n\n MAGMA-LOG MERGE_WF excomplex_exclude_interval_ref_ch: $it \n\n"} excomplex_prefix_ch = Channel.of('ExDR.ExComplex') diff --git a/workflows/minor_variant_analysis_wf.nf b/workflows/minor_variant_analysis_wf.nf index be65103f..a4ce0659 100644 --- a/workflows/minor_variant_analysis_wf.nf +++ b/workflows/minor_variant_analysis_wf.nf @@ -17,7 +17,7 @@ workflow MINOR_VARIANT_ANALYSIS_WF { .filter { it.extension == "gz" } .map { it -> it.name } .reduce { a, b -> "$a $b " } - /* .view {"\n\n XBS-NF-LOG vcfs_filenames_ch : $it \n\n"} */ + .dump(tag:'MINOR_VARIANT_WF: vcfs_string_ch', pretty: true) BCFTOOLS_MERGE(vcfs_string_ch, reformatted_lofreq_vcfs_tuple_ch)