diff --git a/assets/css/styles.scss b/assets/css/styles.scss index 183fa179..e36624ac 100644 --- a/assets/css/styles.scss +++ b/assets/css/styles.scss @@ -116,6 +116,10 @@ $footer-fg: DimGray; /* code */ +pre, code, .sourceCode { + font-size: 0.75rem; +} + pre { line-height: 1.4; background-color: $code-block-bg; diff --git a/home_precourse.qmd b/home_precourse.qmd index 9d08677e..5573f74a 100644 --- a/home_precourse.qmd +++ b/home_precourse.qmd @@ -284,18 +284,28 @@ into some problems with certain Conda packages that have not yet been built for the ARM64 architecture. The [Rosetta](https://support.apple.com/en-us/HT211861) software allows ARM64 Macs to use software built for the old AMD64 architecture, which means you can always fall back on creating AMD/Intel-based environments -and use them in conjunction with Rosetta. This is how you do it: +and use them in conjunction with Rosetta. This can be done by specifying +`--subdir osx-64` when creating the environment, _e.g._: + +```bash +conda env create -f --subdir osx-64 +``` + +or + +```bash +conda create -n myenv --subdir osx-64 +``` + +::: {.callout-important} +To make sure that subsequent installations into this environment also use the +`osx-64` architecture, activate the environment and then run: ```bash -CONDA_SUBDIR=osx-64 -conda activate conda config --env --set subdir osx-64 ``` +::: -The first command creates the Intel-based environment, while the last one -makes sure that subsequent commands are also using the Intel architecture. If -you don't want to remember and do this manually each time you want to use -AMD64/Rosetta you can check out [this bash script](https://github.com/fasterius/dotfiles/blob/main/scripts/intel-conda-env.sh). ## Installing Snakemake @@ -303,19 +313,19 @@ We will use Conda environments for the set up of this tutorial, but don't worry if you don't understand exactly what everything does - you'll learn all the details at the course. First make sure you're currently situated inside the tutorials directory (`workshop-reproducible-research/tutorials`) and then create -the Conda environment like so: +and activate the Conda environment with the commands below: + +::: {.callout-caution title="ARM64 users"} +Some of the packages in this environment are not available for the ARM64 +architecture, so you'll have to add `--subdir osx-64` to the `conda env create` +command. See the [instructions above](#conda-on-new-macs) for more details. +::: ```bash conda env create -f snakemake/environment.yml -n snakemake-env conda activate snakemake-env ``` -::: {.callout-caution title="ARM64 users"} -Some of the packages in this environment is not available for the ARM64 -architecture, so you'll have to follow the [instructions -above](#conda-on-new-macs). -::: - Check that Snakemake is installed correctly, for example by executing `snakemake --help`. This should output a list of available Snakemake settings. 
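A minimal check could look like this (the exact version printed will depend on what `snakemake/environment.yml` pins):

```bash
conda activate snakemake-env
snakemake --version   # prints the installed Snakemake version
snakemake --help      # lists all available command-line settings
```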
If you get `bash: snakemake: command not found` then you need to go back and ensure that diff --git a/lectures/introduction/images/john-sundh.png b/lectures/introduction/images/john-sundh.png index a9f2343b..5c86c6fb 100644 Binary files a/lectures/introduction/images/john-sundh.png and b/lectures/introduction/images/john-sundh.png differ diff --git a/lectures/snakemake/snakemake.qmd b/lectures/snakemake/snakemake.qmd index b7d217c5..ba2789f0 100644 --- a/lectures/snakemake/snakemake.qmd +++ b/lectures/snakemake/snakemake.qmd @@ -98,12 +98,10 @@ digraph snakemake_dag { 15[label = "fastqc", color = "0.00 0.6 0.85", style="rounded"]; 16[label = "fastqc", color = "0.00 0.6 0.85", style="rounded"]; 17[label = "fastqc", color = "0.00 0.6 0.85", style="rounded"]; - 18[label = "generate_rulegraph", color = "0.33 0.6 0.85", style="rounded"]; - 19[label = "make_supplementary", color = "0.39 0.6 0.85", style="rounded"]; + 18[label = "make_supplementary", color = "0.39 0.6 0.85", style="rounded"]; 1 -> 0 14 -> 0 18 -> 0 - 19 -> 0 2 -> 1 7 -> 1 10 -> 1 @@ -124,9 +122,8 @@ digraph snakemake_dag { 4 -> 15 9 -> 16 12 -> 17 - 1 -> 19 - 14 -> 19 - 18 -> 19 + 1 -> 18 + 14 -> 18 } ``` @@ -178,7 +175,7 @@ digraph snakemake_dag { ## Example: sequence trimming Using a bash-script: -```{.bash code-line-numbers="|1-2,10|3|4-5|6-7|8-9"} +```{.bash filename="trimfastq.sh" code-line-numbers="|1-2,10|3|4-5|6-7|8-9"} for input in *.fastq do sample=$(echo ${input} | sed 's/.fastq//') @@ -201,7 +198,7 @@ $ bash trimfastq.sh ## Example: sequence trimming {auto-animate=true} Using Snakemake rules: -```{.python code-line-numbers="|1,6|3|2,8|4,5,9,10|7"} +```{.python filename="Snakefile" code-line-numbers="|1,6|3|2,8|4,5,9,10|7"} rule trim_fastq: output: temp("{sample}.trimmed.fastq") input: "{sample}.fastq" @@ -235,38 +232,79 @@ $ snakemake -c 1 a.trimmed.fastq.gz b.trimmed.fastq.gz ```{.default code-line-numbers=false} $ snakemake -c 1 a.trimmed.fastq.gz b.trimmed.fastq.gz -Provided cores: 1 +Building DAG of jobs... +Using shell: /bin/bash +Provided cores: 1 (use --cores to define parallelism) Rules claiming more threads will be scaled down. -Job counts: -count jobs -2 gzip -2 trim_fastq -4 -rule trim_fastq: +Job stats: +job count +---------- ------- +gzip 2 +trim_fastq 2 +total 4 +Select jobs to execute... +Execute 1 jobs... + +[Tue Nov 19 23:09:00 2024] +localrule trim_fastq: + input: b.fastq + output: b.trimmed.fastq + jobid: 3 + reason: Missing output files: b.trimmed.fastq + wildcards: sample=b + resources: tmpdir=/var/folders/wb/jf9h8kw11b734gd98s6174rm0000gp/T + +[Tue Nov 19 23:09:01 2024] +Finished job 3. +1 of 4 steps (25%) done +Select jobs to execute... +Execute 1 jobs... + +[Tue Nov 19 23:09:01 2024] +localrule gzip: + input: b.trimmed.fastq + output: b.trimmed.fastq.gz + jobid: 2 + reason: Missing output files: b.trimmed.fastq.gz; Input files updated by another job: b.trimmed.fastq + wildcards: sample=b + resources: tmpdir=/var/folders/wb/jf9h8kw11b734gd98s6174rm0000gp/T + +[Tue Nov 19 23:09:02 2024] +Finished job 2. +2 of 4 steps (50%) done +Removing temporary output b.trimmed.fastq. +Select jobs to execute... +Execute 1 jobs... + +[Tue Nov 19 23:09:02 2024] +localrule trim_fastq: input: a.fastq output: a.trimmed.fastq + jobid: 1 + reason: Missing output files: a.trimmed.fastq wildcards: sample=a - 1 of 4 steps (25%) done + resources: tmpdir=/var/folders/wb/jf9h8kw11b734gd98s6174rm0000gp/T -rule gzip: +[Tue Nov 19 23:09:02 2024] +Finished job 1. +3 of 4 steps (75%) done +Select jobs to execute... 
+Execute 1 jobs... + +[Tue Nov 19 23:09:02 2024] +localrule gzip: input: a.trimmed.fastq output: a.trimmed.fastq.gz + jobid: 0 + reason: Missing output files: a.trimmed.fastq.gz; Input files updated by another job: a.trimmed.fastq wildcards: sample=a -Removing temporary output file a.trimmed.fastq. -2 of 4 steps (50%) done - -rule trim_fastq: - input: b.fastq - output: b.trimmed.fastq - wildcards: sample=b -3 of 4 steps (75%) done + resources: tmpdir=/var/folders/wb/jf9h8kw11b734gd98s6174rm0000gp/T -rule gzip: - input: b.trimmed.fastq - output: b.trimmed.fastq.gz - wildcards: sample=b -Removing temporary output file b.trimmed.fastq. +[Tue Nov 19 23:09:03 2024] +Finished job 0. 4 of 4 steps (100%) done +Removing temporary output a.trimmed.fastq. +Complete log: .snakemake/log/2024-11-19T230900.634412.snakemake.log ``` ## Getting into the Snakemake mindset @@ -531,11 +569,11 @@ rule trim_fastq: ```{dot} digraph snakemake_dag { graph[bgcolor=white, margin=1]; - node[shape=box, style=rounded, fontname=sans, fontsize=10, penwidth=2]; + node[shape=box, style=rounded, fontname=sans, fontsize=18, penwidth=2]; edge[penwidth=2, color=grey]; 0[label = "make_supplementary", color = "black", style="rounded"]; 1[label = "generate_count_table", color = "black", style="rounded"]; - 2[label = "sort_bam\nprefix: results/bam/SRR935090", color = "black", style="rounded"]; + 2[label = "sort_bam\nsample_id: SRR935090", color = "black", style="rounded"]; 3[label = "align_to_genome", color = "black", style="rounded"]; 4[label = "get_SRA_by_accession\nsample_id: SRR935090", color = "black", style="rounded"]; 5[label = "index_genome", color = "black", style="rounded"]; @@ -543,10 +581,8 @@ digraph snakemake_dag { 7[label = "get_genome_gff3\ngenome_id: NCTC8325", color = "black", style="rounded"]; 8[label = "multiqc", color = "black", style="rounded"]; 9[label = "fastqc", color = "black", style="rounded"]; - 10[label = "generate_rulegraph", color = "black", style="rounded"]; 1 -> 0 8 -> 0 - 10 -> 0 2 -> 1 7 -> 1 3 -> 2 @@ -561,9 +597,7 @@ digraph snakemake_dag { ## {auto-animate=true} -- The tutorial contains a workflow to download and map RNA-seq reads against a - reference genome. 
-- Here we ask for [results/supplementary.html]{.green}, which is an Quarto +- Here we ask for [results/supplementary.html]{.green}, which is a Quarto report generated by the rule `make_supplementary`: ::: {data-id="code1"} @@ -577,11 +611,11 @@ $ snakemake -c 1 results/supplementary.html ```{dot} digraph snakemake_dag { graph[bgcolor=white, margin=1]; - node[shape=box, style=rounded, fontname=sans, fontsize=10, penwidth=2]; + node[shape=box, style=rounded, fontname=sans, fontsize=18, penwidth=2]; edge[penwidth=2, color=grey]; 0[label = "make_supplementary", color = "red", style="rounded"]; 1[label = "generate_count_table", color = "black", style="rounded"]; - 2[label = "sort_bam\nprefix: results/bam/SRR935090", color = "black", style="rounded"]; + 2[label = "sort_bam\nsample_id: SRR935090", color = "black", style="rounded"]; 3[label = "align_to_genome", color = "black", style="rounded"]; 4[label = "get_SRA_by_accession\nsample_id: SRR935090", color = "black", style="rounded"]; 5[label = "index_genome", color = "black", style="rounded"]; @@ -589,10 +623,8 @@ digraph snakemake_dag { 7[label = "get_genome_gff3\ngenome_id: NCTC8325", color = "black", style="rounded"]; 8[label = "multiqc", color = "black", style="rounded"]; 9[label = "fastqc", color = "black", style="rounded"]; - 10[label = "generate_rulegraph", color = "black", style="rounded"]; 1 -> 0 8 -> 0 - 10 -> 0 2 -> 1 7 -> 1 3 -> 2 @@ -607,10 +639,6 @@ digraph snakemake_dag { ## {auto-animate=true} -- The tutorial contains a workflow to download and map RNA-seq reads against a - reference genome. -- Here we ask for [results/supplementary.html]{.green}, which is an Quarto - report generated by the rule `make_supplementary`: - If the timestamp of a file upstream in the workflow is updated... ::: {data-id="code1"} @@ -623,11 +651,11 @@ $ touch results/bowtie2/NCTC8325.1.bt2 ```{dot} digraph snakemake_dag { graph[bgcolor=white, margin=1]; - node[shape=box, style=rounded, fontname=sans, fontsize=10, penwidth=2]; + node[shape=box, style=rounded, fontname=sans, fontsize=18, penwidth=2]; edge[penwidth=2, color=grey]; 0[label = "make_supplementary", color = "black", style="rounded"]; 1[label = "generate_count_table", color = "black", style="rounded"]; - 2[label = "sort_bam\nprefix: results/bowtie2/SRR935090", color = "black", style="rounded"]; + 2[label = "sort_bam\nsample_id: SRR935090", color = "black", style="rounded"]; 3[label = "align_to_genome", color = "black", style="rounded"]; 4[label = "get_SRA_by_accession\nsample_id: SRR935090", color = "black", style="rounded"]; 5[label = "index_genome*", color = "cyan", style="rounded"]; @@ -635,10 +663,8 @@ digraph snakemake_dag { 7[label = "get_genome_gff3\ngenome_id: NCTC8325", color = "black", style="rounded"]; 8[label = "multiqc", color = "black", style="rounded"]; 9[label = "fastqc", color = "black", style="rounded"]; - 10[label = "generate_rulegraph", color = "black", style="rounded"]; 1 -> 0 8 -> 0 - 10 -> 0 2 -> 1 7 -> 1 3 -> 2 @@ -653,11 +679,6 @@ digraph snakemake_dag { ## {auto-animate=true} -- The tutorial contains a workflow to download and map RNA-seq reads against a - reference genome. -- Here we ask for [results/supplementary.html]{.green}, which is an Quarto - report generated by the rule `make_supplementary`: -- If the timestamp of a file upstream in the workflow is updated... - Snakemake detects a file change and only reruns the necessary rules. 
::: {data-id="code1"} @@ -670,11 +691,11 @@ $ snakemake -c 1 results/supplementary.html ```{dot} digraph snakemake_dag { graph[bgcolor=white, margin=1]; - node[shape=box, style=rounded, fontname=sans, fontsize=10, penwidth=2]; + node[shape=box, style=rounded, fontname=sans, fontsize=18, penwidth=2]; edge[penwidth=2, color=grey]; 0[label = "make_supplementary", color = "red", style="rounded"]; 1[label = "generate_count_table", color = "black", style="rounded"]; - 2[label = "sort_bam\nprefix: results/bowtie2/SRR935090", color = "black", style="rounded"]; + 2[label = "sort_bam\nsample_id: SRR935090", color = "black", style="rounded"]; 3[label = "align_to_genome", color = "black", style="rounded"]; 4[label = "get_SRA_by_accession\nsample_id: SRR935090", fontcolor="grey", fillcolor="lightgrey", color = "grey", style="rounded,dashed,filled"]; 5[label = "index_genome", color = "grey", fillcolor="lightgrey", fontcolor="grey", style="rounded,dashed,filled"]; @@ -682,10 +703,8 @@ digraph snakemake_dag { 7[label = "get_genome_gff3\ngenome_id: NCTC8325", color = "grey", fontcolor="grey", fillcolor="lightgrey", style="rounded,dashed,filled"]; 8[label = "multiqc", fontcolor="grey", fillcolor="lightgrey", color = "grey", style="rounded,dashed,filled"]; 9[label = "fastqc", color = "grey", fillcolor="lightgrey", fontcolor="grey", style="rounded,dashed,filled"]; - 10[label = "generate_rulegraph", background_color="black", fontcolor="grey", color = "grey", fillcolor="lightgrey", style="rounded,dashed,filled"]; 1 -> 0 8 -> 0 - 10 -> 0 2 -> 1 7 -> 1 3 -> 2 @@ -716,7 +735,6 @@ rule trim_fastq: ## {auto-animate=true} -- Rules are typically named and have input and/or output directives - Logfiles help with debugging and leave a "paper trail" ```{.python code-line-numbers="4,7"} @@ -732,8 +750,6 @@ rule trim_fastq: ## {auto-animate=true} -- Rules are typically named and have input and/or output directives -- Logfiles help with debugging and leave a "paper trail" - Params can be used to pass on settings ```{.python code-line-numbers="5-7,10"} @@ -752,9 +768,6 @@ rule trim_fastq: ## {auto-animate=true} -- Rules are typically named and have input and/or output directives -- Logfiles help with debugging and leave a "paper trail" -- Params can be used to pass on settings - The `threads` directive specify maximum number of threads for a rule - You can also define `resources` such as disk/memory requirements and runtime @@ -778,11 +791,6 @@ rule trim_fastq: ## {auto-animate=true} -- Rules are typically named and have input and/or output directives -- Logfiles help with debugging and leave a "paper trail" -- Params can be used to pass on settings -- The `threads` directive specify maximum number of threads for a rule -- You can also define `resources` such as disk/memory requirements and runtime - Rules can be executed in separate software environments using either the `conda` or `container` directive @@ -808,16 +816,10 @@ rule trim_fastq: ## {auto-animate=true} -- Rules are typically named and have input and/or output directives -- Logfiles help with debugging and leave a "paper trail" -- Params can be used to pass on settings -- The `threads` directive specify maximum number of threads for a rule -- You can also define `resources` such as disk/memory requirements and runtime - Rules can be executed in separate software environments using either the `conda` or `container` directive -`envs/seqtk.yaml` -```{.yaml} +```{.yaml filename="envs/seqtk.yaml"} name: seqtk channels: - bioconda @@ -829,47 +831,50 @@ dependencies: 
[https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html](https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html) -## Snakemake commandline +## Snakemake commandline {auto-animate=true} -::: {.fragment} - Generate the output of the first rule in Snakefile ```{.bash code-line-numbers=false} $ snakemake -s Snakefile ``` -::: -::: {.fragment} +## Snakemake commandline {auto-animate=true} + - Run the workflow in dry mode and print shell commands ```{.bash code-line-numbers=false} -$ snakemake -n -p +$ snakemake -s Snakefile -n -p ``` -::: -::: {.fragment} +## Snakemake commandline {auto-animate=true} + - Execute the workflow with 8 cores ```{.bash code-line-numbers=false} -$ snakemake --cores 8 +$ snakemake -s Snakefile -c 8 -p ``` -::: -::: {.fragment} +## Snakemake commandline {auto-animate=true} + - Specify a configuration file ```{.bash code-line-numbers=false} -$ snakemake --configfile config.yaml +$ snakemake --configfile config.yaml -s Snakefile -c 8 -p ``` -::: -::: {.fragment} +## Snakemake commandline {auto-animate=true} + - Run rules with specific conda environments or Docker/Apptainer containers ```{.bash code-line-numbers=false} -$ snakemake --software-deployment-method conda -$ snakemake --software-deployment-method apptainer +$ snakemake --configfile config.yaml -s Snakefile -c 8 -p \ + --software-deployment-method conda +``` + +```{.bash code-line-numbers=false} +$ snakemake --configfile config.yaml -s Snakefile -c 8 -p \ + --software-deployment-method apptainer ``` -::: # Questions? diff --git a/pages/conda.qmd b/pages/conda.qmd index 3d552e98..9218fadb 100644 --- a/pages/conda.qmd +++ b/pages/conda.qmd @@ -41,8 +41,7 @@ A Conda _environment_ is essentially a directory that is added to your PATH and that contains a specific collection of packages that you have installed. Packages are symlinked between environments to avoid unnecessary duplication. -::: {.callout-} -**Different Conda flavours** +::: {.callout-note title="Different Conda flavours"} You may come across several flavours of Conda. There's _Miniconda_, which is the installer for Conda. The second is _Anaconda_, which is a distribution of not only Conda, but also over 150 scientific Python packages curated by the @@ -52,14 +51,15 @@ not even use. Then, lastly, there's the _Miniforge_ flavour that we're using here, which is a community-driven version of Conda that's highly popular within the scientific community. -The difference between Miniconda and Miniforge is that the former points to -points to the `default` channel by default (which requires an Anaconda license -for commercial purposes), while the latter points to the community-maintained -`conda-forge` channel by default. While Conda is created and owned by Anaconda -the company, Conda itself is open source - it's the `default` channel that is -proprietary. The `conda-forge` and `bioconda` channels (two of the largest -channels outside of `default`) are community-driven. Confusing? Yes. If you -want this information more in-depth you can read this [blog post by Anaconda](https://www.anaconda.com/blog/is-conda-free). +The difference between Miniconda and Miniforge is that the former points to the +`default` channel by default (which requires an Anaconda license for commercial +purposes), while the latter points to the community-maintained `conda-forge` +channel by default. While Conda is created and owned by Anaconda the company, +Conda itself is open source - it's the `default` channel that is proprietary. 
+The `conda-forge` and `bioconda` channels (two of the largest channels outside +of `default`) are community-driven. Confusing? Yes. If you want this information +more in-depth you can read this [blog post by +Anaconda](https://www.anaconda.com/blog/is-conda-free). ::: ## The basics @@ -80,12 +80,43 @@ called _Project A_. - Let's make our first Conda environment: ```bash -conda create -n project_a -c bioconda fastqc +conda create -n project_a fastqc ``` -This will create an environment called `project_a`, containing FastQC from the -Bioconda channel. Conda will list the packages that will be installed and ask -for your confirmation. +This will create an environment called `project_a`, containing FastQC. + +You should see something like this printed to the terminal: + +``` +Channels: + - conda-forge + - bioconda + - defaults +Platform: osx-arm64 # <- Your platform may differ +``` + +This is Conda telling you which channels it is looking in for the package you +requested. If you followed the +[pre-course](../home_precourse.html#configuring-conda) instructions and added +the `conda-forge` and `bioconda` channels to your conda configuration, you +should see them listed here. If you had not configured Conda to use these +channels, you would have to specify them when installing packages, _e.g._ `conda +install -c bioconda fastqc`. + +Further down you will see something like this: + +``` +The following NEW packages will be INSTALLED: + + fastqc bioconda/noarch::fastqc-0.12.1-hdfd78af_0 + +``` + +This shows that the `fastqc` package will be installed from the `bioconda` +channel (the `noarch` part shows that the package is not specific to any +computer architecture). In the example above we see that version `0.12.1` will +be installed. The `hdfd78af_0` part is a unique build string that is used to +differentiate between different builds of the same package version. - Once it is done, you can activate the environment: @@ -142,7 +173,7 @@ activated. install`. Make sure that `project_a` is the active environment first. ```bash -conda install -c bioconda multiqc +conda install multiqc ``` - If we don't specify the package version, the latest available version will be @@ -150,13 +181,13 @@ conda install -c bioconda multiqc - Run the following to see what versions are available: ```bash -conda search -c bioconda multiqc +conda search multiqc ``` - Now try to install a different version of MultiQC, _e.g._: ```bash -conda install -c bioconda multiqc=1.13 +conda install multiqc=1.13 ``` Read the information that Conda displays in the terminal. It probably asks if @@ -196,7 +227,7 @@ conda create -n project_old -c bioconda bbmap=37.10 Now let's try to remove an installed package from the active environment: -``` +```bash conda remove multiqc ``` @@ -290,9 +321,9 @@ Conda installation path. - Activate the environment! -- Now we can run the code for the MRSA project found in `code/run_qc.sh`, - either by running `bash code/run_qc.sh` or by opening the `run_qc.sh` file - and executing each line in the terminal one by one. Do this! +- Now we can run the code for the MRSA project found in `code/run_qc.sh` + by running `bash code/run_qc.sh`. Do this! (You can also open the file + and run the commands manually if you prefer.) This should download the project FASTQ files and run FastQC on them (as mentioned above). @@ -469,7 +500,18 @@ directory of the currently active environment. When you create a new Conda environment you can choose to install a specific version of Python in that environment as well. 
As an example, create an -environment containing Python version `3.5` by running: +environment containing Python version `3.5`: + +::: {.callout-note} + +Python versions older than `v3.8.5` are not available for Macs with the M-series +chips, so if you are using one of those you will need to add `--subdir osx-64` +to the command, _e.g._: + +```bash +conda create --subdir osx-64 -n py35 python=3.5 +``` +::: ```bash conda create -n py35 python=3.5 diff --git a/pages/git.qmd b/pages/git.qmd index a81c2fe3..6829fc6f 100644 --- a/pages/git.qmd +++ b/pages/git.qmd @@ -35,7 +35,7 @@ There are many benefits of using Git in your research project: to collaborate by tracking all edits made by each person. It will also handle any potential conflicting edits. - Using a cloud-based repository hosting service (the one you push your commits - to), like _e.g._ [GitHub](https://github.com/) or + to), like [GitHub](https://github.com/) or [Bitbucket](https://bitbucket.org/), adds additional features, such as being able to discuss the project, comment on edits, or report issues. - If at some point your project will be published GitHub or Bitbucket (or @@ -515,10 +515,10 @@ displays the difference on a word-per-word basis rather than line-per-line. ::: {.callout-note} Git is constantly evolving, along with some of its commands. The `checkout` -command was previously used for switching between branches, but this -functionality now has the dedicated (and clearer) `switch` command for this. -If you've previously learned using `checkout` instead you can keep doing that -without any issues, as the `checkout` command itself hasn't changed. +command was previously used for switching between branches, but now there's the +dedicated (and clearer) `switch` command for this functionality. If you've +previously learned using `checkout` instead you can keep doing that without any +issues, as the `checkout` command itself hasn't changed. ::: Now, let's assume that we have tested our code and the alignment analysis is run @@ -539,7 +539,7 @@ git merge test_alignment ``` Run `git log --graph --all --oneline` again to see how the merge commit brings -back the changes made in `test_alignment` to `main`. +the changes made in `test_alignment` into `main`. ::: {.callout-tip} If working on different features or parts of an analysis on different @@ -718,9 +718,11 @@ git remote add origin git@github.com:user/git_tutorial.git your local Git clone. The short name of the default remote is usually "_origin_" by convention. -::: {.callout-note} +::: {.callout-note} Make sure you've used an SSH address (_i.e._ starting with `git@github.com` -rather than an HTTPS address (starting with `https://github.com`)! +rather than an HTTPS address (starting with `https://github.com`)! Also make +sure you've set up ssh-keys as described in the +[github-setup](../home_precourse.html#github-setup) in the pre-course material. ::: - We have not yet synced the local and remote repositories, though, we've simply @@ -896,8 +898,8 @@ a shorthand for `git fetch` followed by `git merge FETCH_HEAD` (where That's quite a few concepts and commands you've just learnt! It can be a bit hard to keep track of everything and the connections between local and remote -Git repositories and how you work with them, but hopefully the following figure -will give you a short visual summary: +Git repositories and how you work with them. 
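As a compact command-line recap of that round trip (the remote address and branch name follow the tutorial's earlier example and stand in for your own):

```bash
git remote add origin git@github.com:user/git_tutorial.git   # connect the local repo to a remote
git push -u origin main   # upload local commits and set the upstream branch
git fetch                 # download new commits from the remote without merging
git merge FETCH_HEAD      # merge what was fetched; `git pull` combines these two steps
```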
The following figure will give you +a short visual summary: ![](images/git_overview_remote.png){ width=600px } @@ -915,7 +917,7 @@ repositories and how to sync them: ### Remote branches -Remote branches work much in the same way a local branches, but you have to +Remote branches work much in the same way as local branches, but you have to push them separately; you might have noticed that GitHub only listed our repository as having one branch (you can see this by going to the _Code_ tab). This is because we only pushed our `main` branch to the remote. Let's create diff --git a/pages/snakemake.qmd b/pages/snakemake.qmd index bce62a9a..7348756b 100644 --- a/pages/snakemake.qmd +++ b/pages/snakemake.qmd @@ -101,88 +101,144 @@ will convert all characters in the set `[a-z]` to the corresponding character in the set `[A-Z]`, and then send the output to `a.upper.txt`". Now let's run our first Snakemake workflow. When a workflow is executed -Snakemake tries to generate a set of target files. Target files can be -specified via the command line (or, as you will see later, in several other -ways). Here we ask Snakemake to make the file `a.upper.txt`. We can specify -the file containing our rules with `-s` but since the default behaviour of -Snakemake is to look for a file called `Snakefile` in either the working -directory or in a subdirectory called `workflow/` we don't need to specify -that here. It's good practice to first run with the flag `-n` (or -`--dry-run`), which will show what Snakemake plans to do without actually -running anything, and you also need to specify how many cores to be used for -the workflow with `--cores` or `-c`. For now, you only need 1 so set `-c 1`. -You can also use the flag `-p`, for showing the shell commands that it will -execute, and the flag `-r` for showing the reason for running a specific -rule. `snakemake --help` will show you all available flags. +Snakemake tries to generate a set of **target files**. Target files can be +specified via the command line or, as you will see later, in several other ways. +But importantly you will always have to specify what you want Snakemake to +generate, in one way or another. The basic Snakemake command line looks like +this: -```no-highlight -$ snakemake -n -c 1 -r -p a.upper.txt +```bash +snakemake [target files ...] +``` + +where `` are flags that modify the behaviour of Snakemake and +`[target files ...]` are the files that you want Snakemake to generate. + +Let's ask Snakemake to generate the file `a.upper.txt`, this is our target file. +For the arguments part we'll use the `-s` flag to specify the file containing +our rules and the `-c` flag to set the number of cores to use. It's good +practice to first run with the flag `-n` (or `--dry-run`), which will show what +Snakemake plans to do without actually running anything. You can also use the +flag `-p`, for showing the shell commands that it will execute. `snakemake +--help` will show you all available flags. + +The command thus becomes: + +```bash +snakemake -s Snakefile -c 1 -n -p a.upper.txt +``` + +You should see something like this: +```no-highlight Building DAG of jobs... 
Job stats: -job count min threads max threads ---------------------- ------- ------------- ------------- -convert_to_upper_case 1 1 1 -total 1 1 1 +job count +--------------------- ------- +convert_to_upper_case 1 +total 1 -[Mon Oct 25 16:48:43 2021] +[Wed Nov 13 21:59:01 2024] rule convert_to_upper_case: input: a.txt output: a.upper.txt jobid: 0 reason: Missing output files: a.upper.txt - resources: tmpdir=/var/folders/p0/6z00kpv16qbf_bt52y4zz2kc0000gp/T + resources: tmpdir= tr [a-z] [A-Z] < a.txt > a.upper.txt - + Job stats: -job count min threads max threads ---------------------- ------- ------------- ------------- -convert_to_upper_case 1 1 1 -total 1 1 1 +job count +--------------------- ------- +convert_to_upper_case 1 +total 1 + +Reasons: + (check individual jobs above for details) + output files have to be generated: + convert_to_upper_case This was a dry-run (flag -n). The order of jobs does not reflect the order of execution. ``` -You can see that Snakemake plans to run one job: the rule `convert_to_upper_case` -with `a.txt` as input and `a.upper.txt` as output. The reason for doing this is -that it's missing the file `a.upper.txt`. Now execute the workflow without the -`-n` flag and check that the contents of `a.upper.txt` is as expected. Then try -running the same command again. What do you see? It turns out that Snakemake -only reruns jobs if there have been changes to either **the input files, or the -workflow itself**. This is how Snakemake ensures that everything in the -workflow is up to date. We will get back to this shortly. +You can see that Snakemake plans to run one job: the rule +`convert_to_upper_case` with `a.txt` as input and `a.upper.txt` as output. The +reason for doing this is that it's missing the file `a.upper.txt`. Now run the +command without the `-n` flag and check that the contents of `a.upper.txt` is as +expected. -What if we ask Snakemake to generate the file `b.upper.txt`? +We used the `-s` flag to point to the file containing the rules. In fact, +Snakemake will look for a file named `Snakefile` in the current directory (or in +a subdirectory called `workflow`) by default, which means we can omit this flag +for this example. If we also omit the `-c` flag Snakemake will use all available +cores, which won't make a difference for this small workflow. The command can +thus be simplified to: -```no-highlight -$ snakemake -n -c 1 -r -p b.upper.txt +```bash +snakemake -p a.upper.txt +``` -Building DAG of jobs... -MissingRuleException: -No rule to produce b.upper.txt (if you use input functions make sure that they don't raise unexpected exceptions). +Try running this simplified command. What do you see? It turns out that +Snakemake only reruns jobs if there have been changes to either **the input +files, or the workflow itself**. Here neither the `a.txt` input, nor the +`Snakefile` have been changed so Snakemake determines that everything in the +workflow is up to date and there is nothing to do. + +What if we ask Snakemake to generate the file `b.upper.txt`? Run the following: + +```bash +snakemake -n -p b.upper.txt +``` + +You will see that Snakemake complains that there is no rule to produce +`b.upper.txt`. This is because we have only defined a rule for `a.upper.txt`. We +could copy the rule to make a similar one for `b.txt`, but that would be a bit +cumbersome. Here is where named wildcards come in; one of the most powerful +features of Snakemake. 
Simply change the input and output directives from + +```python + output: + "a.upper.txt" + input: + "a.txt" +``` + +to + +```python + output: + "{some_name}.upper.txt" + input: + "{some_name}.txt" ``` -That didn't work well. We could copy the rule to make a similar one for -`b.txt`, but that would be a bit cumbersome. Here is where named wildcards come -in; one of the most powerful features of Snakemake. Simply change the input -from `input: "a.txt"` to `input: "{some_name}.txt"` and the output to `output: -"{some_name}.upper.txt"`. Now try asking for `b.upper.txt` again. +Now try asking for `b.upper.txt` again. Tada! What happens here is that Snakemake looks at all the rules it has available (actually only one in this case) and tries to assign values to all -wildcards so that the targeted files can be generated. In this case it was -quite simple, you can see that it says that `wildcards: some_name=b`, but for -large workflows and multiple wildcards it can get much more complex. Named -wildcards is what enables a workflow (or single rules) to be efficiently -generalized and reused between projects or shared between people. - -It seems we have the first part of our workflow working, now it's time to make -the second rule for concatenating the outputs from `convert_to_upper_case`. The -rule structure will be similar; the only difference is that here we have two -inputs instead of one. This can be expressed in two ways, either with named +wildcards so that the targeted files can be generated. In this case you can see +that it says `wildcards: some_name=b`, which means that the `some_name` wildcard +was assigned the value `b`. This was quite simple but for large workflows and +multiple wildcards it can get much more complex. Named wildcards is what enables +a workflow (or single rules) to be efficiently generalized and reused between +projects or shared between people. + +Remember that we're trying to make a workflow that will convert two files to +uppercase and concatenate them. It seems we have the first part of our workflow +working, now it's time to make the second rule for concatenating the outputs +from `convert_to_upper_case`. + +The syntax for concatenating two files in Bash is : + +```bash +cat first_file.txt second_file.txt > output_file.txt` +``` + +The structure of this second rule will be similar to the first but here we have +two inputs instead of one. This can be expressed in two ways, either with named inputs like this: ```python @@ -214,29 +270,19 @@ Snakemake workflows. The parser will complain, but sometimes the error message can be difficult to interpret. ::: -Now try to construct this rule yourself and name it `concatenate_a_and_b`. -The syntax for concatenating two files in Bash is -`cat first_file.txt second_file.txt > output_file.txt`. Call the output `c.txt`. -Run the workflow in Snakemake and validate that the output looks as expected. +Now try to construct this second rule yourself. Add it to the `Snakefile` and +name the rule `concatenate_a_and_b`. Use `a.upper.txt` and `b.upper.txt` as +input and call the output `c.txt`. Check the solution below if you get stuck. -Wouldn't it be nice if our workflow could be used for _any_ files, not just -`a.txt` and `b.txt`? We can achieve this by using named wildcards (or in other -ways as we will discuss later). As we've mentioned, Snakemake looks at all the -rules it has available and tries to assign values to all wildcards so that the -targeted files can be generated. 
We therefore have to name the output file in -a way so that it also contains information about which input files it should be -based on. Try to figure out how to do this yourself. If you're stuck you can -look at the spoiler below, but spend some time on it before you look. Also -rename the rule to `concatenate_files` to reflect its new more general use. ::: {.callout-tip collapse="true" title="Click to show"} ```python -rule concatenate_files: +rule concatenate_a_and_b: output: - "{first}_{second}.txt" + "c.txt" input: - "{first}.upper.txt", - "{second}.upper.txt" + "a.upper.txt", + "b.upper.txt" shell: """ cat {input[0]} {input[1]} > {output} @@ -244,66 +290,59 @@ rule concatenate_files: ``` ::: -We can now control which input files to use by the name of the file we ask -Snakemake to generate. Run the workflow without the flag `-n` (or `--dry-run`) -to execute both rules, providing one core with `-c 1` (or `--cores 1`): - -```no-highlight -$ snakemake a_b.txt -c 1 +Now try to generate the file `c.txt` with your workflow by running: -Building DAG of jobs... -Using shell: /bin/bash -Provided cores: 1 (use --cores to define parallelism) -Rules claiming more threads will be scaled down. -Job stats: -job count min threads max threads ---------------------- ------- ------------- ------------- -concatenate_files 1 1 1 -convert_to_upper_case 2 1 1 -total 3 1 1 +```bash +snakemake -p c.txt +``` -Select jobs to execute... +Validate that the output looks as expected. -[Mon Oct 25 16:51:52 2021] -rule convert_to_upper_case: - input: b.txt - output: b.upper.txt - jobid: 2 - wildcards: some_name=b - resources: tmpdir=/var/folders/p0/6z00kpv16qbf_bt52y4zz2kc0000gp/T - -[Mon Oct 25 16:51:53 2021] -Finished job 2. -1 of 3 steps (33%) done -Select jobs to execute... - -[Mon Oct 25 16:51:53 2021] -rule convert_to_upper_case: - input: a.txt - output: a.upper.txt - jobid: 1 - wildcards: some_name=a - resources: tmpdir=/var/folders/p0/6z00kpv16qbf_bt52y4zz2kc0000gp/T +Good job! You've now created a simple Snakemake workflow that converts two files +to uppercase and concatenates them. -[Mon Oct 25 16:51:53 2021] -Finished job 1. -2 of 3 steps (67%) done -Select jobs to execute... +Wouldn't it be nice if our workflow could be used for _any_ files, not just +`a.txt` and `b.txt`? We can achieve this by generalizing the +`concatenate_a_and_b` rule using named wildcards. + +As we've mentioned, Snakemake looks at all the rules it has available and tries +to assign values to all wildcards so that the targeted files can be generated. +We therefore have to name the output file in a way so that it contains +information about which input files it should be based on. Try to figure out how +to modify the `concatenate_a_and_b` rule yourself. If you're stuck you can look +at the solution below, but spend some time on it before you look. Also rename +the `concatenate_a_and_b` rule to `concatenate_files` to reflect its new more +general use. -[Mon Oct 25 16:51:53 2021] +::: {.callout-tip collapse="true" title="Click to show"} +```python rule concatenate_files: - input: a.upper.txt, b.upper.txt - output: a_b.txt - jobid: 0 - wildcards: first=a, second=b - resources: tmpdir=/var/folders/p0/6z00kpv16qbf_bt52y4zz2kc0000gp/T + output: + "{first}_{second}.txt" + input: + "{first}.upper.txt", + "{second}.upper.txt" + shell: + """ + cat {input[0]} {input[1]} > {output} + """ +``` +::: + +We can now control which input files to use by the name of the file we ask +Snakemake to generate. 
Run the workflow to create a file `a_b.txt`: -[Mon Oct 25 16:51:53 2021] -Finished job 0. -3 of 3 steps (100%) done +```bash +snakemake -p a_b.txt ``` -Neat! +You will see that Snakemake assigned the values `a` and `b` to the wildcards +`{first}` and `{second}`, respectively. This is because the output of the +`concatenate_files` rule is defined as `"{first}_{second}.txt"`, which means +that when we ask for `a_b.txt`, the `{first}` wildcard is assigned `a` and the +`{second}` wildcard is assigned `b`. Try to generate the file `b_a.txt` and see +what happens. + ::: {.callout-tip} You can name a file whatever you want in a Snakemake workflow, but you will @@ -367,7 +406,7 @@ Let's run the workflow with the updated rule. Remove the file `a_b.txt` or add `-f` to the Snakemake command to force a re-run: ```bash -snakemake a_b.txt -c 1 -f +snakemake -f a_b.txt ``` If you added the `print` statement to the function you should see the @@ -482,81 +521,84 @@ The main difference here is that now each node is a job instead of a rule. You can see that the wildcards used in each job are also displayed. Another difference is the dotted lines around the nodes. A dotted line is Snakemake's way of indicating that this rule doesn't need to be rerun in order to generate -`a_b.txt`. Validate this by running `snakemake -n -r a_b.txt` and it should say +`a_b.txt`. Validate this by running `snakemake -n a_b.txt` and it should say that there is nothing to be done. - We've discussed before that one of the main purposes of using a WfMS is that it automatically makes sure that everything is up to date. This is done by recursively checking that outputs are always newer than inputs for all the rules involved in the generation of your target files. Now try to change the contents of `a.txt` to some other text and save it. What do you think will -happen if you run `snakemake -n -r a_b.txt` again? +happen if you run `snakemake -n a_b.txt` again? ::: {.callout-tip collapse="true" title="Click to show"} ```no-highlight -$ snakemake -n -r a_b.txt - Building DAG of jobs... Job stats: -job count min threads max threads ---------------------- ------- ------------- ------------- -concatenate_files 1 1 1 -convert_to_upper_case 1 1 1 -total 2 1 1 +job count +--------------------- ------- +concatenate_files 1 +convert_to_upper_case 1 +total 2 -[Mon Oct 25 17:00:02 2021] +[Fri Nov 15 10:27:22 2024] rule convert_to_upper_case: input: a.txt output: a.upper.txt jobid: 1 reason: Updated input files: a.txt wildcards: some_name=a - resources: tmpdir=/var/folders/p0/6z00kpv16qbf_bt52y4zz2kc0000gp/T + resources: tmpdir= -[Mon Oct 25 17:00:02 2021] +[Fri Nov 15 10:27:22 2024] rule concatenate_files: input: a.upper.txt, b.upper.txt output: a_b.txt jobid: 0 reason: Input files updated by another job: a.upper.txt wildcards: first=a, second=b - resources: tmpdir=/var/folders/p0/6z00kpv16qbf_bt52y4zz2kc0000gp/T + resources: tmpdir= Job stats: -job count min threads max threads ---------------------- ------- ------------- ------------- -concatenate_files 1 1 1 -convert_to_upper_case 1 1 1 -total 2 1 1 +job count +--------------------- ------- +concatenate_files 1 +convert_to_upper_case 1 +total 2 + +Reasons: + (check individual jobs above for details) + input files updated by another job: + concatenate_files + updated input files: + convert_to_upper_case This was a dry-run (flag -n). The order of jobs does not reflect the order of execution. ``` ::: Were you correct? Also generate the job graph and compare to the one generated -above. 
What's the difference? Now rerun without `-n` and validate that -`a_b.txt` contains the new text (don't forget to specify `-c 1`). Note that -Snakemake doesn't look at the contents of files when trying to determine what has -changed, only at the timestamp for when they were last modified. +above. What's the difference? Now rerun without `-n` and validate that `a_b.txt` +contains the new text. Note that Snakemake doesn't look at the contents of files +when trying to determine what has changed, only at the timestamp for when they +were last modified. We've seen that Snakemake keeps track of if files in the workflow have changed, and automatically makes sure that any results depending on such files are -regenerated. What about if the rules themselves are changed? It turns out that -since version 7.8.0 Snakemake keeps track of this automatically. +regenerated. What about if the rules themselves are changed? Since version 7.8.0 +Snakemake keeps track of this automatically. -Let's say that we want to modify the rule `concatenate_files` to also include -which files were concatenated. +Modify the rule `concatenate_files` to also include which files were +concatenated. ```python rule concatenate_files: output: "{first}_{second}.txt" input: - "{first}.upper.txt", - "{second}.upper.txt" + concat_input shell: """ echo 'Concatenating {input}' | cat - {input[0]} {input[1]} > {output} @@ -572,7 +614,14 @@ it should read from standard input). Lastly, the output from `cat` is sent to `{output}`. ::: -If you now run the workflow as before you should see: +If you now run: + +```bash +snakemake -n a_b.txt +``` + + you should see: + ```bash rule concatenate_files: input: a.upper.txt, b.upper.txt @@ -582,21 +631,28 @@ rule concatenate_files: wildcards: first=a, second=b ``` -Because although no files involved in the workflow have been changed, Snakemake -recognizes that the workflow code itself has been modified and this triggers -a re-run. +Although no files involved in the workflow have been changed, Snakemake +recognizes that the workflow code itself has been modified and this triggers a +re-run. + +Snakemake is aware of changes to five categories of such "rerun-triggers": -Snakemake is aware of changes to four categories of such "rerun-triggers": -"input" (changes to rule input files), "params" (changes to the rule `params` section), -"software-env" (changes to Conda environment files specified by the `conda:` -directive) and "code" (changes to code in the `shell:`, `run:`, `script:` and -`notebook:` directives). +- "input" (changes to rule input files), +- "params" (changes to the rule `params` section) +- "software-env" (changes to Conda environment files specified by the `conda:` directive) +- "code" (changes to code in the `shell:`, `run:`, `script:` and `notebook:` directives) +- "mtime" (changes to the modification time of input files) + +Using the flag `--rerun-triggers` you can specify which of these categories +should trigger a re-run. For example, `--rerun-triggers input` will only +re-run the workflow if the specified input files have changed and `--rerun-triggers +input params` will re-run if either the input files or the `params` section of +the rule has changed. Prior to version 7.8.0, only changes to the modification time of input files would trigger automatic re-runs. To run Snakemake with this previous behaviour you can use the setting `--rerun-triggers mtime` at the command line. 
-Change the `shell:` section of the `concatenate_files` rule back to the previous -version, then try running: `snakemake -n -r a_b.txt --rerun-triggers mtime` and +Try running: `snakemake --rerun-triggers mtime -n a_b.txt` and you should again see `Nothing to be done (all requested files are present and up to date).` You can also export information on how all files were generated (when, by which @@ -604,18 +660,18 @@ rule, which version of the rule, and by which commands) to a tab-delimited file like this: ```bash -snakemake a_b.txt -c 1 -D > summary.tsv +snakemake a_b.txt -D > summary.tsv ``` The content of `summary.tsv` is shown in the table below: -
-| output_file | date | rule | version | log-file(s) | input-file(s) | shellcmd | status | plan | -|----------------|-----------------------------|--------------------------|-------------|-----------------|----------------------------|------------------------------------------|--------------------------------|-------------------| -| a_b.txt | Mon Oct 25 17:01:46 2021 | concatenate_files | - | | a.upper.txt,b.upper.txt | cat a.upper.txt b.upper.txt > a_b.txt | rule implementation changed | update pending | -| a.upper.txt | Mon Oct 25 17:01:46 2021 | convert_to_upper_case | - | | a.txt | tr [a-z] [A-Z] < a.txt > a.upper.txt | ok | no update | -| b.upper.txt | Mon Oct 25 17:01:46 2021 | convert_to_upper_case | - | | b.txt | tr [a-z] [A-Z] < b.txt > b.upper.txt | ok | no update | -
+| output_file | date | rule | log-file(s) | input-file(s) | shellcmd | status | plan | +|-------------|-----------------------|-------------------------|-------------|------------------------|-------------------------------------------|----------------------------|-----------------| +| a_b.txt | Fri Nov 15 10:32:04 2024 | concatenate_files | | a.upper.txt,b.upper.txt | cat a.upper.txt b.upper.txt > a_b.txt | rule implementation changed | update pending | +| a.upper.txt | Fri Nov 15 10:32:04 2024 | convert_to_upper_case | | a.txt | tr [a-z] [A-Z] < a.txt > a.upper.txt | ok | no update | +| b.upper.txt | Thu Nov 14 00:04:06 2024 | convert_to_upper_case | | b.txt | tr [a-z] [A-Z] < b.txt > b.upper.txt | ok | no update | + +: summary.tsv {.responsive .sm} You can see in the second last column that the rule implementation for `a_b.txt` has changed. The last column shows if Snakemake plans to regenerate the files @@ -629,6 +685,19 @@ since it's easy to delete if you don't need it anymore and everything is contained in the project directory. Just be sure to add it to `.gitignore` so that you don't end up tracking it with git. +Let's take a look at the final type of graph that Snakemake can generate, this +is done with the `--filegraph` flag and will show the rules and their input and +output files. + +```bash +snakemake --filegraph a_b.txt | dot -Tpng > filegraph.png +``` + +In terms of details/complexity the filegraph is an intermediate between the +rulegraph (`--rulegraph`) and the jobgraph (`--dag`). It shows the rules to run +with their respective input and output files (shown with wildcards), but doesn't +show the jobs that will be executed. + By now you should be familiar with the basic functionality of Snakemake, and you can build advanced workflows with only the features we have discussed here. There's a lot we haven't covered though, in particular when it comes to making @@ -642,8 +711,7 @@ the other tutorials instead. ::: {.callout-note title="Quick recap"} In this section we've learned: -- How to use `--dag` and `--rulegraph` for visualizing the job and rule - graphs, respectively. +- How to use `--rulegraph`, `--dag` and `--filegraph` for visualizing the workflow. - How Snakemake reruns relevant parts of the workflow after there have been changes. - How Snakemake tracks changes to files and code in a workflow @@ -662,25 +730,12 @@ will be spent improving and making this workflow more flexible! ::: {.callout-tip} This section will leave a little more up to you compared to the previous one. If you get stuck at some point the final workflow after all the -modifications is available in `tutorials/git/Snakefile`. +modifications is available in `tutorials/containers/Snakefile`. ::: You are probably already in your `snakemake-env` environment, otherwise activate it (use `conda info --envs` if you are unsure). -::: {.callout-tip} -Here we have one Conda environment for executing the whole Snakemake -workflow. Snakemake also supports using explicit Conda environments on -a per-rule basis, by specifying something like `conda: -rule-specific-env.yml` in the rule definition and running Snakemake with -the `--use-conda` flag. The given rule will then be run in the Conda -environment specified in `rule-specific-env.yml` that will be created and -activated on the fly by Snakemake. Note that by default Snakemake uses -`conda` to generate the rule-specific environments. This behaviour can be -changed by running with `--conda-frontend conda`, which will force -Snakemake to use `conda` instead. 
-::: - Let's start by generating the rule graph so that we get an overview of the workflow. Here we have to specify the file with the rules using the `-s` flag to Snakemake since the path to the file differs from the default. @@ -691,13 +746,47 @@ snakemake -s snakefile_mrsa.smk --rulegraph | dot -T png > rulegraph_mrsa.png There's another difference in this command compared to the one we've used before, namely that we don't define a target. In the toy example we used -`a_b.txt` as a target, and the wildcards were resolved based on that. -How come that we don't need to do that here? It turns out that by default -Snakemake targets the first rule in a workflow. By convention, we call this rule -`all` and let it serve as a rule for aggregating the main outputs of the -workflow. - -![](images/rulegraph_mrsa.svg) +`a_b.txt` as a target, and the wildcards were resolved based on that. How come +that we don't need to do that here? This is because by default Snakemake uses +the first rule in a workflow as its target, unless a target is specified on the +command line. By convention, this rule is called `all` and serves as a rule +for aggregating the main outputs of the workflow. + +```{dot} +//| echo: false +//| label: MRSA_rulegraph +digraph MRSA_rulegraph { + graph[bgcolor=white, margin=0]; + node[shape=box, style=rounded, fontname=sans, fontsize=14, penwidth=2]; + edge[penwidth=2, color=grey]; + 0[label = "all", color = "0.07 0.6 0.85", style="rounded"]; + 1[label = "generate_count_table", color = "0.20 0.6 0.85", style="rounded"]; + 2[label = "sort_bam", color = "0.60 0.6 0.85", style="rounded"]; + 3[label = "align_to_genome", color = "0.00 0.6 0.85", style="rounded"]; + 4[label = "get_SRA_by_accession", color = "0.27 0.6 0.85", style="rounded"]; + 5[label = "index_genome", color = "0.47 0.6 0.85", style="rounded"]; + 6[label = "get_genome_fasta", color = "0.33 0.6 0.85", style="rounded"]; + 7[label = "get_genome_gff3", color = "0.40 0.6 0.85", style="rounded"]; + 8[label = "multiqc", color = "0.53 0.6 0.85", style="rounded"]; + 9[label = "fastqc", color = "0.13 0.6 0.85", style="rounded"]; + 8 -> 0 + 1 -> 0 + 7 -> 1 + 2 -> 1 + 3 -> 2 + 5 -> 3 + 4 -> 3 + 6 -> 5 + 9 -> 8 + 4 -> 9 +} +``` + +As you can see from the `snakefile_mrsa.smk` and the rulegraph, the input to the +`all` rule is the output from the rules `multiqc` and `generate_count_table`. +These rules in turn have their own inputs from other rules and by targeting the +`all` rule Snakemake determines what needs to be run by resolving the graph from +bottom to top. Now take some time and look through the workflow file and try to understand how the rules fit together. Use the rule graph as aid. The rules represent a quite @@ -717,64 +806,69 @@ will be used to generate a count table, _i.e._ a table that shows how many reads map to each gene for each sample. This count table is then what the downstream analysis will be based on. -![](images/dag_mrsa.svg) - -Now try to run the whole workflow. Hopefully you see something like this. 
+```{dot} +//| echo: false +//| label: MRSA_jobgraph +digraph MRSA_jobgraph { + graph[bgcolor=white, margin=0]; + node[shape=box, style=rounded, fontname=sans, fontsize=10, penwidth=2]; + edge[penwidth=2, color=grey]; + 0[label = "all", color = "0.07 0.6 0.85", style="rounded"]; + 1[label = "generate_count_table", color = "0.20 0.6 0.85", style="rounded"]; + 2[label = "sort_bam", color = "0.60 0.6 0.85", style="rounded"]; + 3[label = "align_to_genome", color = "0.00 0.6 0.85", style="rounded"]; + 4[label = "get_SRA_by_accession\nsample_id: SRR935090", color = "0.27 0.6 0.85", style="rounded"]; + 5[label = "index_genome", color = "0.47 0.6 0.85", style="rounded"]; + 6[label = "get_genome_fasta", color = "0.33 0.6 0.85", style="rounded"]; + 7[label = "sort_bam", color = "0.60 0.6 0.85", style="rounded"]; + 8[label = "align_to_genome", color = "0.00 0.6 0.85", style="rounded"]; + 9[label = "get_SRA_by_accession\nsample_id: SRR935091", color = "0.27 0.6 0.85", style="rounded"]; + 10[label = "sort_bam", color = "0.60 0.6 0.85", style="rounded"]; + 11[label = "align_to_genome", color = "0.00 0.6 0.85", style="rounded"]; + 12[label = "get_SRA_by_accession\nsample_id: SRR935092", color = "0.27 0.6 0.85", style="rounded"]; + 13[label = "get_genome_gff3", color = "0.40 0.6 0.85", style="rounded"]; + 14[label = "multiqc", color = "0.53 0.6 0.85", style="rounded"]; + 15[label = "fastqc", color = "0.13 0.6 0.85", style="rounded"]; + 16[label = "fastqc", color = "0.13 0.6 0.85", style="rounded"]; + 17[label = "fastqc", color = "0.13 0.6 0.85", style="rounded"]; + 1 -> 0 + 14 -> 0 + 2 -> 1 + 7 -> 1 + 10 -> 1 + 13 -> 1 + 3 -> 2 + 4 -> 3 + 5 -> 3 + 6 -> 5 + 8 -> 7 + 9 -> 8 + 5 -> 8 + 11 -> 10 + 12 -> 11 + 5 -> 11 + 15 -> 14 + 16 -> 14 + 17 -> 14 + 4 -> 15 + 9 -> 16 + 12 -> 17 +} +``` + +Now let's run the whole workflow. As before, specify `-p` to have Snakemake +print the shell commands for rules and use `-c` to control how many cores to run +with. If you don't specify number of cores Snakemake will use all available +cores on your machine. Let's run the workflow with one core: -```no-highlight -Building DAG of jobs... -Using shell: /bin/bash -Provided cores: 1 (use --cores to define parallelism) -Rules claiming more threads will be scaled down. -Job stats: -job count min threads max threads --------------------- ------- ------------- ------------- -align_to_genome 3 1 1 -all 1 1 1 -fastqc 3 1 1 -generate_count_table 1 1 1 -generate_rulegraph 1 1 1 -get_SRA_by_accession 3 1 1 -get_genome_fasta 1 1 1 -get_genome_gff3 1 1 1 -index_genome 1 1 1 -multiqc 1 1 1 -sort_bam 3 1 1 -total 19 1 1 - -Select jobs to execute... - -[Mon Oct 25 17:13:47 2021] -rule get_genome_fasta: - output: data/ref/NCTC8325.fa.gz - jobid: 6 - resources: tmpdir=/var/folders/p0/6z00kpv16qbf_bt52y4zz2kc0000gp/T - ---2021-10-25 17:13:48-- ftp://ftp.ensemblgenomes.org/pub/bacteria/release-37/fasta/bacteria_18_collection/staphylococcus_aureus_subsp_aureus_nctc_8325/dna//Staphylococcus_aureus_subsp_aureus_nctc_8325.ASM1342v1.dna_rm.toplevel.fa.gz - => ‘data/ref/NCTC8325.fa.gz’ -Resolving ftp.ensemblgenomes.org (ftp.ensemblgenomes.org)... 193.62.197.75 -Connecting to ftp.ensemblgenomes.org (ftp.ensemblgenomes.org)|193.62.197.75|:21... connected. -Logging in as anonymous ... Logged in! -==> SYST ... done. ==> PWD ... done. -. -. -[lots of stuff] -. -. 
-localrule all: - input: results/tables/counts.tsv, results/multiqc/multiqc.html, results/rulegraph.png - jobid: 0 - resources: tmpdir=/var/folders/p0/6z00kpv16qbf_bt52y4zz2kc0000gp/T - -[Mon Oct 25 17:14:38 2021] -Finished job 0. -19 of 19 steps (100%) done +```bash +snakemake -s snakefile_mrsa.smk -c 1 -p ``` -After everything is done, the workflow will have resulted in a bunch of files in -the directories `data/` and `results/`. Take some time to look through the -structure, in particular the quality control reports in `results/multiqc/` and -the count table in `results/tables/`. +After everything is done, the workflow will have generated fastq files for each +sample under `data/` and reference genome files under `data/ref`. The `results/` +directory will contain the output from the different steps in the workflow, +divided into subdirectories. Take some time to look through the structure. ::: {.callout-note title="Quick recap"} In this section we've learned: @@ -784,7 +878,7 @@ In this section we've learned: - Which output files the MRSA workflow produces. ::: -## Parameters +## Parameters and configuration In a typical bioinformatics project, considerable efforts are spent on tweaking parameters for the various programs involved. It would be inconvenient if you @@ -795,9 +889,9 @@ keyword. ```python rule some_rule: output: - "..." + "results/some_output.txt" input: - "..." + "some_input.txt" params: cutoff=2.5 shell: @@ -806,12 +900,15 @@ rule some_rule: """ ``` -Most of the programs are run with default settings in the MRSA workflow and +In the toy example above, the params directive has one keyword called `cutoff` +which is set to `2.5` and this value can be accessed in the shell command with +`{params.cutoff}`. + +In the MRSA workflow most of the programs are run with default settings and don't use the `params:` directive. However, the `get_SRA_by_accession` rule -is an exception. Here the remote address for each of the files to download -is passed to the shell directive via: +is an exception. Let's take a look at this part of the workflow: -```python +```{.python filename="snakefile_mrsa.smk"} def get_sample_url(wildcards): samples = { "SRR935090": "https://figshare.scilifelab.se/ndownloader/files/39539767", @@ -830,12 +927,16 @@ rule get_SRA_by_accession: url = get_sample_url shell: """ - wget -O - {params.url} | seqtk sample - 25000 | gzip -c > {output[0]} + wget -q -O - {params.url} | seqtk sample - 25000 | gzip -c > {output[0]} """ - ``` -You may recognize this from [the basics](#the-basics) of this tutorial where we +Here `params` has a keyword called `url` which is set to a function called +`get_sample_url` defined just above the rule. The function returns a URL for the +sample that the rule is supposed to download and this URL is then accessed in +the `shell:` directive with `{params.url}`. + +You may recognize this from [The basics](#the-basics) of this tutorial where we used input functions to generate strings and lists of strings for the `input:` section of a rule. Using a function to return values based on the wildcards also works for `params:`. Here `sample_id` is a wildcard which in this specific @@ -860,7 +961,7 @@ need help, click to show the solution below. 
::: {.callout-tip collapse="true" title="Click to show"} -```python +```{.python filename="snakefile_mrsa.smk"} rule get_SRA_by_accession: """ Retrieve a single-read FASTQ file @@ -872,22 +973,27 @@ rule get_SRA_by_accession: max_reads = 20000 shell: """ - wget -O - {params.url} | seqtk sample - {params.max_reads} | gzip -c > {output[0]} + wget -q -O - {params.url} | seqtk sample - {params.max_reads} | gzip -c > {output[0]} """ ``` ::: -Now run through the workflow. Because there's been changes to the `get_SRA_by_accession` -rule this will trigger a re-run of the rule for all three accessions. In addition -all downstream rules that depend on output from `get_SRA_by_accession` are re-run. +Now run the workflow again with: -As you can see the parameter values we set in the `params` section don't have -to be static, they can be any Python expression. In particular, Snakemake -provides a global dictionary of configuration parameters called `config`. -Let's modify `get_SRA_by_accession` to look something like this in order to -make use of this dictionary: +```bash +snakemake -s snakefile_mrsa.smk -p -c 1 +``` -```python +Because there's been changes to the `get_SRA_by_accession` rule this will +trigger a re-run of the rule for all three accessions. In addition all +downstream rules that depend on output from `get_SRA_by_accession` are re-run. + +As you can see the parameter values we set in the `params` section don't have to +be static, they can be any Python expression. In particular, Snakemake provides +a global dictionary of configuration parameters called `config`. Let's modify +`get_SRA_by_accession` in order to make use of this dictionary: + +```{.python filename="snakefile_mrsa.smk"} rule get_SRA_by_accession: """ Retrieve a single-read FASTQ file @@ -899,7 +1005,7 @@ rule get_SRA_by_accession: max_reads = config["max_reads"] shell: """ - wget -L {params.url} | seqtk sample - {params.max_reads} | gzip -c > {output[0]} + wget -q -O - {params.url} | seqtk sample - {params.max_reads} | gzip -c > {output[0]} """ ``` @@ -920,16 +1026,23 @@ project-specific settings, sample ids and so on. This also enables us to write the Snakefile in a more general manner so that it can be better reused between projects. Like several other files used in these tutorials, this file should be in [YAML format](https://en.wikipedia.org/wiki/YAML). Create the file below and -save it as `config.yml`. +save it as `config.yml` (here we change back to using 25000 reads): -```yaml +```{.yaml filename="config.yml"} max_reads: 25000 ``` If we now run Snakemake with `--configfile config.yml`, it will parse this file -to form the `config` dictionary. If you want to overwrite a parameter value, -_e.g._ for testing, you can still use the `--config KEY=VALUE` flag, as in -`--config max_reads=1000`. +to form the `config` dictionary. Try it out by running: + +```bash +snakemake -s snakefile_mrsa.smk --configfile config.yml -p -c 1 +``` + +Configuration settings that you specify with `--config KEY=VALUE` on the command +line will overwrite settings specified in a config file. This can be useful if +you want to run the workflow with a different setting for a specific parameter +for testing purposes. ::: {.callout-tip} Rather than supplying the config file from the command line you could also @@ -960,14 +1073,14 @@ this. Just as we define `input` and `output` in a rule we can also define ```python rule some_rule: output: - "..." + "results/some_output.txt" input: - "..." + "some_input.txt" log: - "..." 
+ "logs/some_log.txt" shell: """ - echo 'Converting {input} to {output}' > {log} + some_program {input} {output} > {log} """ ``` @@ -976,18 +1089,19 @@ a little differently by Snakemake. For example, it's shown in the file summary when using `-D` and unlike other output files it's not deleted if jobs fail which of course is necessary for debugging purposes. It's also a good way to clarify the purpose of the file. We probably don't need to save logs for -all the rules, only the ones with interesting output. +all the rules, only the ones with interesting output: -- `get_genome_fasta` and `get_genome_gff3` would be good to log since they are - dependent on downloading files from an external server. - `multiqc` aggregates quality control data for all the samples into one html report, and the log contains information about which samples were aggregated. +- `get_genome_fasta` and `get_genome_gff3` would be good to log since they are + dependent on downloading files from an external server. - `index_genome` outputs some statistics about the genome indexing. +- `generate_count_table` outputs information about input files, settings and a summary of results. - `align_to_genome` outputs important statistics about the alignments. This is probably the most important log to save. Now add a log file to some or all of the rules above. A good place to save them -to would be `results/logs/rule_name/`. In order to avoid that multiple jobs +to would be `results/logs//`. In order to avoid that multiple jobs write to the same files Snakemake requires that all output and log files contain the same wildcards, so be sure to include any wildcards used in the rule in the log name as well, _e.g._ `{some_wildcard}.log`. @@ -997,16 +1111,15 @@ log to contain. Some of the programs we use send their log information to standard out, some to standard error and some let us specify a log file via a flag. -For example, in the `align_to_genome` rule, it could look like this (Bowtie2 -writes log info to standard error): +For example, `bowtie2` used in the `align_to_genome` rule writes log info to standard error so we use `2>{log}` to redirect this to the log file: -```python +```{.python filename="snakefile_mrsa.smk"} rule align_to_genome: """ Align a fastq file to a genome index using Bowtie 2. """ output: - "results/bam/{sample_id,\w+}.bam" + "results/bam/{sample_id,\\w+}.bam" input: "data/{sample_id}.fastq.gz", "results/bowtie2/NCTC8325.1.bt2", @@ -1023,23 +1136,126 @@ rule align_to_genome: """ ``` -To save some time you can use the info below. +For programs used in the rules `multiqc`, `get_genome_fasta`, `get_genome_gff3`, +`index_genome` and `generate_count_table` you can use the table below if you +need help with how to redirect output to the log file. -```bash -# wget has a -o flag for specifying the log file -wget remote_file -O output_file -o {log} +| Rule | Program | Redirect flag | +|-----------|----------------|----------------| +| `multiqc` | `multiqc` | `2>{log}` | +| `get_genome_fasta`/`get_genome_gff3` | `wget` | `-o {log}` | +| `index_genome` | `bowtie2-build` | `>{log}` | +| `generate_count_table` | `featureCounts` | `2>{log}` | + + +If you need help, click to show the solution below for the rules. + +::: {.callout-tip collapse="true" title="Click to show"} +```{.python filename="snakefile_mrsa.smk"} +rule multiqc: + """ + Aggregate all FastQC reports into a MultiQC report. 
+ """ + output: + html = "results/multiqc/multiqc.html", + stats = "results/multiqc/multiqc_general_stats.txt" + input: + "results/fastqc/SRR935090_fastqc.zip", + "results/fastqc/SRR935091_fastqc.zip", + "results/fastqc/SRR935092_fastqc.zip" + log: + "results/logs/multiqc.log" + shell: + """ + # Run multiQC and keep the html report + multiqc -n multiqc.html {input} 2>{log} + mv multiqc.html {output.html} + mv multiqc_data/multiqc_general_stats.txt {output.stats} + + # Remove the other directory that multiQC creates + rm -rf multiqc_data + """ + +rule get_genome_fasta: + """ + Retrieve the sequence in fasta format for a genome. + """ + output: + "data/ref/NCTC8325.fa.gz" + log: + "results/logs/get_genome_fasta.log" + shell: + """ + wget -o {log} ftp://ftp.ensemblgenomes.org/pub/bacteria/release-37/fasta/bacteria_18_collection/staphylococcus_aureus_subsp_aureus_nctc_8325/dna//Staphylococcus_aureus_subsp_aureus_nctc_8325.ASM1342v1.dna_rm.toplevel.fa.gz -O {output} + """ + +rule get_genome_gff3: + """ + Retrieve annotation in gff3 format for a genome. + """ + output: + "data/ref/NCTC8325.gff3.gz" + log: + "results/logs/get_genome_gff3.log" + shell: + """ + wget -o {log} ftp://ftp.ensemblgenomes.org/pub/bacteria/release-37/gff3/bacteria_18_collection/staphylococcus_aureus_subsp_aureus_nctc_8325//Staphylococcus_aureus_subsp_aureus_nctc_8325.ASM1342v1.37.gff3.gz -O {output} + """ + +rule index_genome: + """ + Index a genome using Bowtie 2. + """ + output: + "results/bowtie2/NCTC8325.1.bt2", + "results/bowtie2/NCTC8325.2.bt2", + "results/bowtie2/NCTC8325.3.bt2", + "results/bowtie2/NCTC8325.4.bt2", + "results/bowtie2/NCTC8325.rev.1.bt2", + "results/bowtie2/NCTC8325.rev.2.bt2" + input: + "data/ref/NCTC8325.fa.gz" + log: + "results/logs/index_genome.log" + shell: + """ + # Bowtie2 cannot use .gz, so unzip to a temporary file first + gunzip -c {input} > tempfile + bowtie2-build tempfile results/bowtie2/NCTC8325 >{log} -# MultiQC and featureCounts write to standard error so we redirect with "2>" -multiqc -n output_file input_files 2> {log} -featureCounts -t gene -g gene_id -a gff_file -o output_file input_files 2>{log} + # Remove the temporary file + rm tempfile + """ -# Bowtie2-build redirects to standard out so we use ">" -bowtie2-build input_file index_dir > {log} +rule generate_count_table: + """ + Generate a count table using featureCounts. + """ + output: + "results/tables/counts.tsv" + input: + bams = ["results/bam/SRR935090.sorted.bam", + "results/bam/SRR935091.sorted.bam", + "results/bam/SRR935092.sorted.bam"], + annotation = "data/ref/NCTC8325.gff3.gz" + log: + "results/logs/generate_count_table.log" + shell: + """ + featureCounts -t gene -g gene_id -a {input.annotation} -o {output} {input.bams} 2>{log} + """ ``` +::: + +After adding logfiles to the rules, rerun the whole workflow with: -Now rerun the whole workflow. Do the logs contain what they should? Note how -much easier it is to follow the progression of the workflow when the rules write -to logs instead of to the terminal. +```bash +snakemake -s snakefile_mrsa.smk --configfile config.yml -p -c 1 +``` + +Do the logs contain what they should? Note how much easier it is to follow the +progression of the workflow when the rules write to logs instead of to the +terminal. ::: {.callout-tip} If you have a rule with a shell directive in which several commands are run @@ -1068,47 +1284,50 @@ from `align_to_genome` is a BAM file, which contains information about all the reads for a sample and where they map in the genome. 
For downstream processing we need this file to be sorted by genome coordinates. This is
what the rule `sort_bam` is for. We therefore end up with both `results/bam/{sample_id}.bam`
-and `results/bam/{sample_id}.sorted.bam`.
+and `results/bam/{sample_id}.sorted.bam`, as you can see if you list the contents of `results/bam`.

In Snakemake we can mark an output file as temporary like this:

```python
-output: temp("...")
+rule some_rule:
+    output:
+        temp("results/some_output.txt")
```

The file will then be deleted as soon as all jobs where it's an input have
-finished. Now do this for the output of `align_to_genome`. We have to rerun the
-rule for it to trigger, so use `-R align_to_genome`. It should look something
-like this:
+finished. Let's use the `temp` notation for the output of `align_to_genome` by
+modifying the rule so that the `output:` directive looks like this:

-```no-highlight
-.
-.
-rule sort_bam:
-    input: results/bam/SRR935090.bam
-    output: results/bam/SRR935090.sorted.bam
-    jobid: 2
-    wildcards: sample_id=SRR935090
+```{.python filename="snakefile_mrsa.smk"}
+    output:
+        temp("results/bam/{sample_id,\\w+}.bam")
+```
+
+We have to force a rerun of the rule for the change to take effect, so we use `--forcerun align_to_genome`. Run the workflow with:
+
+```bash
+snakemake -s snakefile_mrsa.smk --configfile config.yml -p -c 1 --forcerun align_to_genome
+```

-Removing temporary output file results/bam/SRR935090.bam.
-Finished job 2.
-.
-.
+Take a look at the output printed to your terminal. After a `sort_bam` job has finished you should see something like this:
+
+```no-highlight
+Removing temporary output results/bam/SRR935092.bam.
 ```
 
+If you list the contents of `results/bam` you should see that it now only
+contains the sorted BAM files.
+
 ::: {.callout-tip}
-Sometimes you may want to trigger removal of temporary files without
-actually rerunning the jobs. You can then use the `--delete-temp-output`
-flag. In some cases you may instead want to run only parts of a workflow
-and therefore want to prevent files marked as temporary from being deleted
-(because the files are needed for other parts of the workflow). In such
-cases you can use the `--notemp` flag.
+If you want to prevent Snakemake from deleting files marked as temporary you can
+run with the `--notemp` command line flag, which will cause Snakemake to ignore
+any `temp()` declarations. If you at any point want to delete remaining files
+that are declared as temporary in your workflow you can instead run with the
+`--delete-temp-output` flag, which will delete all remaining `temp()` files.
 :::
 
-Snakemake has a number of options for marking files:
+Snakemake has a number of additional options for marking files:
 
-- `temp("...")`: The output file should be deleted once it's no longer needed
-  by any rules.
 - `protected("...")`: The output file should be write-protected. Typically used
   to protect files that require a huge amount of computational resources from
   being accidentally deleted.
@@ -1125,21 +1344,20 @@ In this section we've learned:
 - How to mark an output file as temporary for automatic removal.
 :::
 
-## Targets
+## The expand function
 
-We've mentioned that Snakemake rules take either strings or a list of strings as
-input, and that we can use any Python expression in Snakemake workflows.
-Here we'll show how these features help us condense the code of rules.
+Now that we have a working workflow with logs and temporary files, let's look at
+a way to make the code more readable. Consider the rule `align_to_genome` below.
-```python +```{.python filename="snakefile_mrsa.smk"} rule align_to_genome: """ Align a fastq file to a genome index using Bowtie 2. """ output: - "results/bam/{sample_id}.bam" + temp("results/bam/{sample_id,\\w+}.bam") input: "data/{sample_id}.fastq.gz", "results/bowtie2/NCTC8325.1.bt2", @@ -1148,113 +1366,167 @@ rule align_to_genome: "results/bowtie2/NCTC8325.4.bt2", "results/bowtie2/NCTC8325.rev.1.bt2", "results/bowtie2/NCTC8325.rev.2.bt2" + log: + "results/logs/align_to_genome/{sample_id}.log" shell: """ - bowtie2 -x results/bowtie2/NCTC8325 -U {input[0]} > {output} + bowtie2 -x results/bowtie2/NCTC8325 -U {input[0]} > {output} 2>{log} """ ``` Here we have seven inputs; the FASTQ file with the reads and six files with similar file names from the Bowtie2 genome indexing. Instead of writing all -the filenames we can tidy this up by using a Python expression to generate a -list of these files instead. If you're familiar with Python you could do -this with [list comprehensions](https://docs.python.org/3/tutorial/datastructures.html#list-comprehensions) -like this: +the filenames we can tidy this up by using the built-in function `expand()`. -```python -input: - "data/{sample_id}.fastq.gz", - [f"results/bowtie2/NCTC8325.{substr}.bt2" for - substr in ["1", "2", "3", "4", "rev.1", "rev.2"]] +```{.python} +expand("results/bowtie2/NCTC8325.{substr}.bt2", + substr = ["1", "2", "3", "4", "rev.1", "rev.2"]) ``` -This will take the elements of the list of substrings one by one, and insert -that element in the place of `{substr}`. Since this type of aggregating -rules are quite common, Snakemake also has a more compact way of achieving the -same thing. +Here `expand` will take the string `"results/bowtie2/NCTC8325.{substr}.bt2"` and +insert each of the values in the `substr` list: `["1", "2", "3", "4", "rev.1", +"rev.2"]` in place of `{substr}`. The result is a list of strings that is used +as input to the rule. This is a very powerful feature that can save you a lot of +typing and make your code more readable. -```python -input: - "data/{sample_id}.fastq.gz", - expand("results/bowtie2/NCTC8325.{substr}.bt2", - substr = ["1", "2", "3", "4", "rev.1", "rev.2"]) -``` +Now modify the rules `index_genome` and `align_to_genome` to use the `expand()` +expression in the output and input, respectively. Click the solution below if +you need help. -::: {.callout-caution} -When using expand() like this, `substr` is not a wildcard because it is -resolved to the values explicitly given inside the expand expression. +::: {.callout-tip collapse="true" title="Click to show"} +```{.python filename="snakefile_mrsa.smk"} +rule index_genome: + """ + Index a genome using Bowtie 2. + """ + output: + expand("results/bowtie2/NCTC8325.{substr}.bt2", + substr=["1", "2", "3", "4","rev.1", "rev.2"]) + input: + "data/ref/NCTC8325.fa.gz" + log: + "results/logs/index_genome/NCTC8325.log" + shell: + """ + # Bowtie2 cannot use .gz, so unzip to a temporary file first + gunzip -c {input} > tempfile + bowtie2-build tempfile results/bowtie2/NCTC8325 >{log} + + # Remove the temporary file + rm tempfile + """ + +rule align_to_genome: + """ + Align a fastq file to a genome index using Bowtie 2. 
+ """ + output: + temp("results/bam/{sample_id,\\w+}.bam") + input: + "data/{sample_id}.fastq.gz", + expand("results/bowtie2/NCTC8325.{substr}.bt2", + substr=["1", "2", "3", "4", "rev.1", "rev.2"]) + log: + "results/logs/align_to_genome/{sample_id}.log" + shell: + """ + bowtie2 -x results/bowtie2/NCTC8325 -U {input[0]} > {output} 2>{log} + """ +``` ::: -Now change in the rules `index_genome` and `align_to_genome` to use the -`expand()` expression. +Great! Let's put the `expand` function to more use! -In the workflow we decide which samples to run by including the SRR ids in the -names of the inputs to the rules `multiqc` and `generate_count_table`: +The `multiqc` and `generate_count_table` rules take as input the fastqc zip files, and the sorted bam files for all three samples, respectively. See the excerpt below: + +```{.python filename="snakefile_mrsa.smk"} +rule multiqc: + ... + input: + "results/fastqc/SRR935090_fastqc.zip", + "results/fastqc/SRR935091_fastqc.zip", + "results/fastqc/SRR935092_fastqc.zip" + +... -```python rule generate_count_table: - output: - "results/tables/counts.tsv" + ... input: bams = ["results/bam/SRR935090.sorted.bam", "results/bam/SRR935091.sorted.bam", "results/bam/SRR935092.sorted.bam"], ... -rule multiqc: - output: - html = "results/multiqc/multiqc.html", - stats = "results/multiqc/multiqc_general_stats.txt" - input: - "results/fastqc/SRR935090_fastqc.zip", - "results/fastqc/SRR935091_fastqc.zip", - "results/fastqc/SRR935092_fastqc.zip" +``` + +Having the files for each sample written explicitly in these rules is how +Snakemake knows to run the workflow for all three samples. However, this is also +a potential source of errors since it's easy to change in one place and forget +to change in the other. Because we can use Python code "everywhere" let's +instead define a list of sample ids and put at the top of the Snakefile, just +before the rule `all`: +```{.python filename="snakefile_mrsa.smk"} +sample_ids = ["SRR935090", "SRR935091", "SRR935092"] ``` -The output files from these two rules, `results/multiqc.html` and -`results/tables/counts.tsv`, are in turn specified as input to the `all` rule at -the top of the file. Because the first rule is targeted by default when we run -Snakemake on the command line (like we mentioned in the [MRSA workflow -section](#the-mrsa-workflow)) this is what triggers the rules to run on each of -the three samples. +Now use `expand()` in `multiqc` and `generate_count_table` to use `sample_ids` for +the sample ids. For the `multiqc` rule it could look like this: -However, this is a potential source of errors since it's easy to change in one -place and forget to change in the other. Because we can use Python code -"everywhere" let's instead define a list of sample ids and put at the very -top of the Snakefile, just before the rule `all`: +```{.python} + input: + expand("results/fastqc/{sample_id}_fastqc.zip", + sample_id = sample_ids) +``` -```python -SAMPLES = ["SRR935090", "SRR935091", "SRR935092"] +See if you can update the `generate_count_table` rule in the same manner. If you +need help, click the solution below. + +::: {.callout-tip collapse="true" title="Click to show"} +```{.python filename="snakefile_mrsa.smk"} +rule generate_count_table: + """ + Generate a count table using featureCounts. 
+ """ + output: + "results/tables/counts.tsv" + input: + bams = expand("results/bam/{sample_id}.sorted.bam", sample_id = sample_ids), + annotation = "data/ref/NCTC8325.gff3.gz" + log: + "results/logs/generate_count_table.log" + shell: + """ + featureCounts -t gene -g gene_id -a {input.annotation} -o {output} {input.bams} 2>{log} + """ ``` +::: -Now use `expand()` in `multiqc` and `generate_count_table` to use `SAMPLES` for -the sample ids. For the `multiqc` rule it could look like this: +Now run the workflow again with: -```python -input: - expand("results/fastqc/{sample_id}_fastqc.zip", sample_id = SAMPLES) +```bash +snakemake -s snakefile_mrsa.smk --configfile config.yml -p -c 1 --forcerun align_to_genome index_genome multiqc generate_count_table ``` -See if you can update the `generate_count_table` rule in the same manner! +to force a rerun of the modified rules and make sure that the workflow still +works as expected. ::: {.callout-note title="Quick recap"} In this section we've learned: -- How to use the `expand()` expression to create a list with file names, - inserting all provided wildcard values. +- How to use the `expand()` expression to make the code more readable. ::: ## Shadow rules Take a look at the `index_genome` rule below: -```python +```{.python filename="snakefile_mrsa.smk"} rule index_genome: """ Index a genome using Bowtie 2. """ output: - index = expand("results/bowtie2/NCTC8325.{substr}.bt2", + expand("results/bowtie2/NCTC8325.{substr}.bt2", substr = ["1", "2", "3", "4", "rev.1", "rev.2"]) input: "data/NCTC8325.fa.gz" @@ -1272,50 +1544,101 @@ rule index_genome: ``` There is a temporary file here called `tempfile` which is the uncompressed -version of the input, since Bowtie2 cannot use compressed files. There are -a number of drawbacks with having files that aren't explicitly part of the -workflow as input/output files to rules: +version of the input fasta file. This file is created when the rule starts ( +since Bowtie2 cannot use compressed files) but is not needed upon completion. +There are a number of drawbacks with having files that aren't explicitly part of +the workflow as input/output files to rules: - Snakemake cannot clean up these files if the job fails, as it would do for normal output files. - If several jobs are run in parallel there is a risk that they write to `tempfile` at the same time. This can lead to very scary results. - Sometimes we don't know the names of all the files that a program can - generate. It is, for example, not unusual that programs leave some kind of - error log behind if something goes wrong. + generate and we may only be interested in a few of them. All of these issues can be dealt with by using the `shadow` option for a rule. The shadow option results in that each execution of the rule is run in an isolated temporary directory (located in `.snakemake/shadow/` by default). There are a few options for `shadow` (for the full list of these options see the [Snakemake docs](https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#shadow-rules)). + The most simple is `shadow: "minimal"`, which means that the rule is executed in -an empty directory that the input files to the rule have been symlinked into. -For the rule below, that means that the only file available would be `input.txt`. -The shell commands would generate the files `some_other_junk_file` and -`output.txt`. Lastly, Snakemake will move the output file (`output.txt`) to its -"real" location and remove the whole shadow directory. 
We therefore never have -to think about manually removing `some_other_junk_file`. +an empty shadow directory that the input files to the rule have been symlinked into. +For the rule below, that means that `some_input.txt` would be symlinked into a shadow directory under `.snakemake/shadow/`. The shell commands would generate the file `some_other_junk_file` in this shadow directory, as well as the output file `results/some_output.txt`. When the rule finishes, the shadow directory is removed which means we wouldn't have to think about manually removing `some_other_junk_file`. ```python rule some_rule: output: - "output.txt" + "results/some_output.txt" input: - "input.txt" + "some_input.txt" shadow: "minimal" shell: """ touch some_other_junk_file - cp {input} {output} + some_program {input} {output} + """ +``` + +Try this out for the rules `multiqc` and `index_genome` where we have to +"manually" deal with files that aren't tracked by Snakemake. Add `shadow: +"minimal"` to these rules and also delete the shell commands that remove +temporary files from those rules (the last lines in the shell directives that +begin with `rm`), as they are no longer needed. Check the solution below if you +need help. + +::: {.callout-tip collapse="true" title="Click to show"} +```{.python filename="snakefile_mrsa.smk"} +rule multiqc: + """ + Aggregate all FastQC reports into a MultiQC report. + """ + output: + html = "results/multiqc/multiqc.html", + stats = "results/multiqc/multiqc_general_stats.txt" + input: + expand("results/fastqc/{sample_id}_fastqc.zip", + sample_id = sample_ids) + log: + "results/logs/multiqc.log" + shadow: "minimal" + shell: + """ + # Run multiQC and keep the html report + multiqc -n multiqc.html {input} 2>{log} + mv multiqc.html {output.html} + mv multiqc_data/multiqc_general_stats.txt {output.stats} + """ + +rule index_genome: + """ + Index a genome using Bowtie 2. + """ + output: + expand("results/bowtie2/NCTC8325.{substr}.bt2", + substr=["1", "2", "3", "4","rev.1", "rev.2"]) + input: + "data/ref/NCTC8325.fa.gz" + log: + "results/logs/index_genome/NCTC8325.log" + shadow: "minimal" + shell: + """ + # Bowtie2 cannot use .gz, so unzip to a temporary file first + gunzip -c {input} > tempfile + bowtie2-build tempfile results/bowtie2/NCTC8325 >{log} """ ``` +::: -Try this out for the rules where we have to "manually" deal with files that -aren't tracked by Snakemake (`multiqc`, `index_genome`). Also remove the shell -commands that remove temporary files from those rules, as they are no longer -needed. Now rerun the workflow and validate that the temporary files don't show -up in your working directory. +Now run the workflow again with: + +```bash +snakemake -s snakefile_mrsa.smk --configfile config.yml -p -c 1 +``` + +and make sure that you see neither `tempfile`, nor the `multiqc_data` directory +in your current working directory after the workflow has finished. ::: {.callout-tip} Some people use the shadow option for almost every rule and some never @@ -1333,123 +1656,104 @@ In this section we've learned: - How to use the shadow option to handle files that are not tracked by Snakemake. ::: -## Generalising workflows +## Generalizing the MRSA workflow -It's a good idea to separate project-specific parameters from the -actual implementation of the workflow. This allows anyone using the -workflow to modify its behaviour without changing the underlying code, -making the workflow more general. 
+You've now improved the MRSA workflow by making it more readable, adding +logfiles, a config file etc. However, the workflow is not very general since +it's hard-coded to work with the three samples `SRR935090`, `SRR935091` and +`SRR935092` and the genome `NCTC8325`. If someone would want to use the workflow +with other samples or genomes they would have to change the code in the +Snakefile, which is not optimal. Instead it's a good idea to separate +project-specific parameters from the actual implementation of the workflow. This +allows anyone using the workflow to modify its behaviour without changing the +underlying code, making the workflow more general. -In order to generalize our RNA-seq analysis workflow we should move all -project-specific information to `config.yml`. This means that we want the -config file to: +In this section we will go through how to make the workflow more general and +flexible by moving the sample ids and URLs to a configuration file and turning the reference genome into a wildcard. -- Specify which samples to run. -- Specify which genome to align to and where to download its sequence and - annotation files. -- (Contain any other parameters we might need to make it into a general - workflow, _e.g._ to support both paired-end and single-read sequencing) +### Specifying samples in a configuration file -::: {.callout-note} -Putting all configuration in `config.yml` will break the -`generate_rulegraph` rule. You can fix it either by replacing -`--config max_reads=0` with `--configfile=config.yml` in the shell -command of that rule in the Snakefile, or by adding -`configfile: "config.yml"` to the top of the Snakefile (as mentioned -in a previous tip). -::: +Remove the `sample_ids = ["SRR935090", "SRR935091", "SRR935092"]` line we added +to the top of `snakefile_mrsa.smk` and add the following to `config.yml`: -The first point is straightforward; rather than using `SAMPLES = ["..."]` in -the Snakefile we define it as a parameter in `config.yml`. You can either add -it as a list similar to the way it was expressed before by adding: - -```yaml -SAMPLES: ["SRR935090", "SRR935091", "SRR935092"] +```{.yaml filename="config.yml"} +samples: + SRR935090: "https://figshare.scilifelab.se/ndownloader/files/39539767" + SRR935091: "https://figshare.scilifelab.se/ndownloader/files/39539770" + SRR935092: "https://figshare.scilifelab.se/ndownloader/files/39539773" ``` -To `config.yml`, or you can use this YAML notation (whether you -choose `SAMPLES` or `sample_ids` as the name of the entry doesn't matter, -you will just have to reference the same name in the config dictionary -inside the workflow): +Now when passing `--configfile config.yml`on the command line +`config["samples"]` will be a dictionary where the keys are the sample ids and +the values are the URLs to the FASTQ files. -```yaml -sample_ids: - - SRR935090 - - SRR935091 - - SRR935092 -``` - -Change the workflow to reference `config["sample_ids"]` (if using the latter -example) instead of `SAMPLES`, as in: +Next, modify the `get_sample_url` function to use this dictionary instead of the +hard-coded values: -```bash -expand("results/fastqc/{sample_id}_fastqc.zip", - sample_id = config["sample_ids"]) +```{.python filename="snakefile_mrsa.smk"} +def get_sample_url(wildcards): + return config["samples"][wildcards.sample_id] ``` -Remove the line with `SAMPLES = ["SRR935090", "SRR935091", "SRR935092"]` that we -added to the top of `snakefile_mrsa.smk` in the [targets section](#targets). 
- -Do a dry-run afterwards to make sure that everything works as expected. - -You may remember from the [parameters section](#parameters) part of this -tutorial that we're using a function to return the URL of the FASTQ files to -download for each sample: +Now change the `multiqc` and `generate_count_table` rules that use the `expand` function in the input directive to use `config["samples"].keys()` to expand the `sample_id` wildcard. This is how it should look for the `multiqc` rule: ```python -def get_sample_url(wildcards): - samples = { - "SRR935090": "https://figshare.scilifelab.se/ndownloader/files/39539767", - "SRR935091": "https://figshare.scilifelab.se/ndownloader/files/39539770", - "SRR935092": "https://figshare.scilifelab.se/ndownloader/files/39539773" - } - return samples[wildcards.sample_id] + input: + expand("results/fastqc/{sample_id}_fastqc.zip", + sample_id = config["samples"].keys()) ``` -Here the URLs of each sample_id is hard-coded in the `samples` dictionary -inside the function. To generalize this function we can move the definition -to the config file, placing it for example under an entry that we call -`sample_urls` like this: +Try to change the `generate_count_table` rule in the same way. Check the solution below if you need help. -```yaml -sample_urls: - SRR935090: "https://figshare.scilifelab.se/ndownloader/files/39539767" - SRR935091: "https://figshare.scilifelab.se/ndownloader/files/39539770" - SRR935092: "https://figshare.scilifelab.se/ndownloader/files/39539773" +::: {.callout-tip collapse="true" title="Click to show"} +```{.python filename="snakefile_mrsa.smk"} +rule generate_count_table: + """ + Generate a count table using featureCounts. + """ + output: + "results/tables/counts.tsv" + input: + bams = expand("results/bam/{sample_id}.sorted.bam", + sample_id = config["samples"].keys()), + annotation = "data/ref/NCTC8325.gff3.gz" + log: + "results/logs/generate_count_table.log" + shell: + """ + featureCounts -t gene -g gene_id -a {input.annotation} -o {output} {input.bams} 2>{log} + """ ``` +::: -This is what's called 'nested' key/value pairs, meaning that each sample_id --> URL pair becomes nested under the config key `sample_urls`. So in order -to access the URL of _e.g._ `SRR935090` we would use -`config["sample_urls"]["SRR935090"]`. This means that you will have to -update the `get_sample_url` function to: +To see that your new implementation works you can do a forced rerun of the modified rules: -```python -def get_sample_url(wildcards): - return config["sample_urls"][wildcards.sample_id] +```bash +snakemake -s snakefile_mrsa.smk --configfile config.yml -p -c 1 --forcerun generate_count_table multiqc get_SRA_by_accession ``` -Now the function uses the global `config` dictionary to return URLs for each -sample_id. Again, do a dry-run to see that the new implementation works. - ::: {.callout-tip} + If you were to scale up this workflow with more samples it could become -impractical to have to define the URLs by hand in the config file. A -tip then is to have a separate file where samples are listed in one column -and the URLs (or file paths) in another column. With a few lines of python -code you could then read that list at the start of the workflow and add -each sample to the config dictionary. +impractical to have to define the sample ids and URLs by hand in the config +file. A tip then is to have a separate file where samples are listed in one +column and the URLs (or file paths) in another column. 
With a few lines of +python code you could then read that list at the start of the workflow and add +each sample to the config dictionary. See +[below](#reading-samples-from-a-file-instead-of-hard-coding-them) for an example +of how to do this. + ::: -Now let's take a look at the genome reference used in the workflow. In the -`get_genome_fasta` and `get_genome_gff3` rules we have hard-coded FTP paths -to the FASTA GFF annotation file for the genome `NCTC8325`. We can -generalize this in a similar fashion to what we did with the -`get_SRA_by_accession` rule. Let's add a nested entry called `genomes` to -the config file that will hold the genome id and FTP paths to the FASTA and -GFF file: +### Generalizing the genome reference -```yaml +Now we'll take a look at how to generalize the genome reference used in the +workflow. + +In the `get_genome_fasta` and `get_genome_gff3` rules we have hard-coded FTP +paths to the FASTA and GFF annotation file for the genome `NCTC8325`. Let's move this information to the configuration file, and also add information for another genome, `ST398`. Add the following to `config.yml`: + +```{.yaml filename="config.yml"} genomes: NCTC8325: fasta: ftp://ftp.ensemblgenomes.org/pub/bacteria/release-37/fasta/bacteria_18_collection/staphylococcus_aureus_subsp_aureus_nctc_8325/dna//Staphylococcus_aureus_subsp_aureus_nctc_8325.ASM1342v1.dna_rm.toplevel.fa.gz @@ -1459,16 +1763,15 @@ genomes: gff3: ftp://ftp.ensemblgenomes.org/pub/bacteria/release-37/gff3/bacteria_18_collection/staphylococcus_aureus_subsp_aureus_st398//Staphylococcus_aureus_subsp_aureus_st398.ASM958v1.37.gff3.gz ``` -As you can see this is very similar to what with did with `sample_urls`, -just that we have one more nested level. Now to access the FTP path to the -FASTA file for genome id `NCTC8325` we can use -`config["genomes"]["NCTC8325"]["fasta"]`. +As you can see this is very similar to what we did with `samples`, it's just +that we have one more nested level. Now to access the FTP path to the FASTA file +for genome id `NCTC8325` we can use `config["genomes"]["NCTC8325"]["fasta"]`. Let's now look at how to do the mapping from genome id to FASTA path in the rule `get_genome_fasta`. This is how the rule currently looks (if you have added the log section as previously described). -```python +```{.python filename="snakefile_mrsa.smk"} rule get_genome_fasta: """ Retrieve the sequence in fasta format for a genome. @@ -1483,15 +1786,14 @@ rule get_genome_fasta: """ ``` -We don't want the hard-coded genome id `NCTC8325`, so replace that with a -wildcard, say `{genome_id}` (remember to add the wildcard to the `log:` -directive as well). We now need to supply the remote paths to the FASTA file -for a given genome id. Because we've added this information to the -config file we just need to pass it to the rule in some way, and just like -in the `get_SRA_by_accession` rule we'll use a function to do the job: +Replace the hard-coded `NCTC8325` with a wildcard `{genome_id}` in both the +`input` and `log` directives. We now need to supply the remote paths to the +FASTA file for a given genome id. 
Because we've added this information to the +config file we just need to pass it to the rule in some way, and just like in +the `get_SRA_by_accession` rule we'll use a function to do the job: -```python +```{.python filename="snakemfile_mrsa.smk"} def get_fasta_path(wildcards): return config["genomes"][wildcards.genome_id]["fasta"] @@ -1536,31 +1838,27 @@ rule get_genome_gff3: ``` ::: -Also change in `index_genome` to use a wildcard rather than a hard-coded genome -id. Here you will run into a complication if you have followed the previous -instructions and use the `expand()` expression. We want the list to expand to -`["results/bowtie2/{genome_id}.1.bt2", "results/bowtie2/{genome_id}.2.bt2", ...]`, -_i.e._ only expanding the wildcard referring to the Bowtie2 index. To keep the -`genome_id` wildcard from being expanded we have to "mask" it with double curly -brackets: `{{genome_id}}`. In addition, we need to replace the hard-coded -`results/bowtie2/NCTC8325` in the shell directive of the rule with the genome id -wildcard. Inside the shell directive the wildcard object is accessed with this -syntax: `{wildcards.genome_id}`, so the Bowtie2-build command should be: +Also replace `NCTC8325` with `{genome_id}` in the `log`, `input` and `output` of +the `index_genome` rule. Here you will run into a complication if you have +followed the previous instructions and use the `expand()` expression. We want +the list of output files be `["results/bowtie2/{genome_id}.1.bt2", +"results/bowtie2/{genome_id}.2.bt2", ...]`, _i.e._ only expanding the `substr` +wildcard in the Bowtie2 index. To prevent Snakemake from trying to expand the +`genome_id` wildcard we have to "mask" it with double curly brackets: +`{{genome_id}}`. + +In addition, we need to replace the hard-coded `results/bowtie2/NCTC8325` in the +shell directive of the rule with the genome id wildcard. Inside the shell +directive the wildcard object is accessed with this syntax: +`{wildcards.genome_id}`, so the Bowtie2-build command should be: ```bash bowtie2-build tempfile results/bowtie2/{wildcards.genome_id} > {log} ``` -Note that this will only work if the `{genome_id}` wildcard can be resolved to -something defined in the config (currently `NCTC8325` or `ST398`). If you try to -generate a FASTA file for a genome id not defined in the config Snakemake will -complain, even at the dry-run stage. +The final rule should look like this: -Finally, remember that any wildcards need to be present both in the `output:` -and `log:` directives? This means we have to update the `log:` directive in -`index_genome` as well. The final rule should look like this: - -```python +```{.python filename="snakefile_mrsa.smk"} rule index_genome: """ Index a genome using Bowtie 2. @@ -1577,7 +1875,7 @@ rule index_genome: """ # Bowtie2 cannot use .gz, so unzip to a temporary file first gunzip -c {input} > tempfile - bowtie2-build tempfile results/bowtie2/{wildcards.genome_id} > {log} + bowtie2-build tempfile results/bowtie2/{wildcards.genome_id} >{log} """ ``` @@ -1585,31 +1883,62 @@ Good job! The rules `get_genome_fasta`, `get_genome_gff3` and `index_genome` can now download and index *any genome* as long as we provide valid links in the config file. -However, we need to define somewhere which genome id we actually want to use -when running the workflow. This needs to be done both in `align_to_genome` and -`generate_count_table`. 
Do this by introducing a parameter in `config.yml` -called `"genome_id"` (you can set it to either `NCTC8325` or `ST398`), _e.g._: +Only two rules remain where the `NCTC8325` genome id is hard-coded: `align_to_genome` and `generate_count_table`. We will use these rules to define which genome id we actually want to use when running the workflow. First add a parameter to `config.yml` +called `"genome_id"` and set it to `NCTC8325`: -```yaml +```{.yaml filename="config.yml"} genome_id: "NCTC8325" ``` -Now we can resolve the `genome_id` wildcard from the config. See below for an -example for `align_to_genome`. Here the `substr` wildcard gets expanded from a -list while `genome_id` gets expanded from the config file. +Now we can resolve the `genome_id` wildcard from the config. For +`align_to_genome` we change the input directive to: ```python input: "data/{sample_id}.fastq.gz", - index = expand("results/bowtie2/{genome_id}.{substr}.bt2", + expand("results/bowtie2/{genome_id}.{substr}.bt2", genome_id = config["genome_id"], substr = ["1", "2", "3", "4", "rev.1", "rev.2"]) ``` -Also change the hard-coded genome id in the `generate_count_table` input in a -similar manner: +Here the `substr` wildcard gets expanded from a list while `genome_id` gets +expanded from the config dictionary. Also change the hard-coded `NCTC8325` in +the `shell:` directive of `align_to_genome` so that the genome_id is inserted +directly from the config, like this: ```python +shell: + """ + bowtie2 -x results/bowtie2/{config[genome_id]} -U {input[0]} > {output} 2>{log} + """ +``` + +The final rule should look like this: + +```{.python filename="snakefile_mrsa.smk"} +rule align_to_genome: + """ + Align a fastq file to a genome index using Bowtie 2. + """ + output: + temp("results/bam/{sample_id,\\w+}.bam") + input: + "data/{sample_id}.fastq.gz", + expand("results/bowtie2/{genome_id}.{substr}.bt2", + genome_id = config["genome_id"], + substr = ["1", "2", "3", "4", "rev.1", "rev.2"]) + log: + "results/logs/align_to_genome/{sample_id}.log" + shell: + """ + bowtie2 -x results/bowtie2/{config[genome_id]} -U {input[0]} > {output} 2>{log} + """ +``` + +Now let's change the hard-coded genome id in the `generate_count_table` input in +a similar manner. The final rule should look like this: + +```{.python filename="snakefile_mrsa.smk"} rule generate_count_table: """ Generate a count table using featureCounts. @@ -1619,9 +1948,9 @@ rule generate_count_table: "results/tables/counts.tsv.summary" input: bams=expand("results/bam/{sample_id}.sorted.bam", - sample_id = config["sample_ids"]), + sample_id = config["samples"].keys()), annotation=expand("data/ref/{genome_id}.gff3.gz", - genome_id = config["genome_id"]) + genome_id = config["genome_id"]) log: "results/logs/generate_count_table.log" shell: @@ -1630,26 +1959,22 @@ rule generate_count_table: """ ``` +To make sure that the workflow works as expected, run it with: + +```bash +snakemake -s snakefile_mrsa.smk --configfile config.yml -p -c 1 +``` + +Try changing the `genome_id` in the config file to `ST398` and rerun the workflow. You should now see that the workflow downloads the genome files for `ST398` and aligns the samples to that genome instead. + In general, we want the rules as far downstream as possible in the workflow to be the ones that determine what the wildcards should resolve to. In our case -this is `align_to_genome` and `generate_count_table`. You can think of it like +`align_to_genome` and `generate_count_table`. 
You can think of it like the rule that really "needs" the file asks for it, and then it's up to Snakemake to determine how it can use all the available rules to generate it. Here the `align_to_genome` rule says "I need this genome index to align my sample to" and then it's up to Snakemake to determine how to download and build the index. -One last thing is to change the hard-coded `NCTC8325` in the `shell:` directive -of `align_to_genome`. Bowtie2 expects the index name supplied with the `-x` flag -to be without the ".*.bt2" suffix so we can't use `-x {input.index}`. Instead -we'll insert the genome_id directly from the config like this: - -```bash -shell: - """ - bowtie2 -x results/bowtie2/{config[genome_id]} -U {input[0]} > {output} 2>{log} - """ -``` - ::: {.callout-note title="Summary"} Well done! You now have a complete Snakemake workflow with a number of excellent features: @@ -1666,15 +1991,24 @@ excellent features: reproduce either via Git, Snakemake's `--archive` option or a Docker image. ::: -## Reading samples from a file instead of hard-coding them +## Extra material + +If you want to read more about Snakemake in general you can find several +resources here: + +- The Snakemake documentation is available on [ReadTheDocs](https://snakemake.readthedocs.io/en/stable/#). +- Here is another (quite in-depth) [tutorial](https://snakemake.readthedocs.io/en/stable/tutorial/tutorial.html#tutorial). +- If you have questions, check out [stack overflow](https://stackoverflow.com/questions/tagged/snakemake). + +### Reading samples from a file instead of hard-coding them So far we've specified the samples to use in the workflow either as a hard-coded list in the Snakefile, or as a list in the configuration file. This is of course impractical for large real-world examples. Here we'll just quickly show how you -could supply the samples instead via a tab-separated file. For example you could -create a file called `samples.tsv` with the following content: +could supply the samples to the MRSA workflow via a tab-separated file. For +example you could create a file called `samples.tsv` with the following content: -``` +```{.txt filename="samples.tsv"} SRR935090 https://figshare.scilifelab.se/ndownloader/files/39539767 SRR935091 https://figshare.scilifelab.se/ndownloader/files/39539770 SRR935092 https://figshare.scilifelab.se/ndownloader/files/39539773 @@ -1685,7 +2019,7 @@ fastq file. Now in order to read this into the workflow we need to use a few lines of python code. Since you can mix python code with rule definitions in Snakemake we'll just add the following lines to the top of the Snakefile: -```python +```{.python filename="snakefile_mrsa.smk"} # define an empty 'samples' dictionary samples = {} # read the sample list file and populate the dictionary @@ -1703,7 +2037,7 @@ url for `SRR935090` we can use `samples["SRR935090"]`. For example, the `get_sample_url` function can now be written as: -```python +```{.python filename="snakefile_mrsa.smk"} def get_sample_url(wildcards): return samples[wildcards.sample_id] ``` @@ -1711,25 +2045,9 @@ def get_sample_url(wildcards): We can also use the `samples` dictionary in `expand()`, for example in the `multiqc` rule: ```python -rule multiqc: - """ - Aggregate all FastQC reports into a MultiQC report. 
- """ - output: - html="results/multiqc/multiqc.html", - stats="results/multiqc/multiqc_general_stats.txt" input: - expand("results/fastqc/{sample_id}_fastqc.zip", sample_id = samples.keys()) - log: - "results/logs/multiqc/multiqc.log" - shadow: "minimal" - shell: - """ - # Run multiQC and keep the html report - multiqc -n multiqc.html {input} 2> {log} - mv multiqc.html {output.html} - mv multiqc_data/multiqc_general_stats.txt {output.stats} - """ + expand("results/fastqc/{sample_id}_fastqc.zip", + sample_id = samples.keys()) ``` Now this depends on there being a `samples.tsv` file in the working directory. @@ -1757,21 +2075,6 @@ with open(config["sample_list"], "r") as fhin: This way, anyone can take our Snakefile and just update the path to their own `sample_list` using the config file. -::: {.callout-note title="Quick recap"} -In this section we've learned: - -- How to generalize a Snakemake workflow. -::: - -## Extra material - -If you want to read more about Snakemake in general you can find several -resources here: - -- The Snakemake documentation is available on [ReadTheDocs](https://snakemake.readthedocs.io/en/stable/#). -- Here is another (quite in-depth) [tutorial](https://snakemake.readthedocs.io/en/stable/tutorial/tutorial.html#tutorial). -- If you have questions, check out [stack overflow](https://stackoverflow.com/questions/tagged/snakemake). - ### Using containers in Snakemake Snakemake also supports defining an Apptainer or Docker container for each rule @@ -1779,7 +2082,7 @@ Snakemake also supports defining an Apptainer or Docker container for each rule during the course). Analogous to using a rule-specific Conda environment, specify `container: "docker://some-account/rule-specific-image"` in the rule definition. Instead of a link to a container image, it is also possible to -provide the path to a `*.sif` file (= a _Singularity image file_). When +provide the path to an image file somwhere on your filesystem. When executing Snakemake, add the `--software-deployment-method apptainer` (or the shorthand `--sdm apptainer`) flag to the command line. For the given rule, an Apptainer container will then be created from the image or file that is provided @@ -1817,12 +2120,11 @@ Start your Snakemake workflow with the following command: snakemake --software-deployment-method apptainer ``` -Feel free to modify the MRSA workflow according to this example. As Apptainer -is a container software that was developed for HPC clusters, and for example the -Mac version is still a beta version, it might not work to run your updated -Snakemake workflow with Apptainer locally on your computer. -In the next section we explain how you can run Snakemake workflows on UPPMAX -where Apptainer is pre-installed. +Feel free to modify the MRSA workflow according to this example. As Apptainer is +a container software that was developed for HPC clusters with only limited +support on *e.g.* Macs, it might not work to run your updated Snakemake workflow +with Apptainer locally on your computer. In the next section we explain how you +can run Snakemake workflows on UPPMAX where Apptainer is pre-installed. ### Running Snakemake workflows on HPC clusters @@ -1850,9 +2152,9 @@ things on the cluster or even logging out of the cluster. 
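A terminal multiplexer like `tmux` is handy here since it lets a long-running Snakemake process survive you logging out of the cluster. A minimal round trip might look like the sketch below (assuming `tmux` is available on the login node; `screen` has equivalent commands):

```bash
# start a named session on the login node
tmux new -s snakemake

# ...launch your Snakemake run inside the session...

# detach with Ctrl-b followed by d, optionally log out, and re-attach later with:
tmux attach -t snakemake
```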
For short workflows with only a few rules that need the same compute resources in terms of CPU (cores) and memory, you can submit the entire workflow as a job directly to the SLURM scheduler, or start an interactive job (in your `tmux` or -`screen` session) and run your Snakemake workflow as you would do that on your -local machine. Make sure to give your job enough time to finish running all -rules of your Snakemake workflow. +`screen` session), and run your Snakemake workflow as you would on your local +machine. Make sure to give your job enough time to finish running all rules of +your Snakemake workflow. If you choose this option, you don't need to install anything from the plugin catalogue. However, your workflow may not run as efficiently as it could if you @@ -1918,7 +2220,7 @@ rule testrule: resources: runtime = 60, mem_mb = 16000, - cpus_per_task = 4 + tasks = 4 shell: """ uname -a > {output} @@ -1927,7 +2229,7 @@ rule testrule: This rule uses the standard resource `runtime` to set the maximum allowed time (in minutes) for the rule, sets the memory requirement with `mem_mb` and the -number of requested CPUs with `cpus_per_task`. In this example the rule will have a +number of requested CPUs with `tasks`. In this example the rule will have a time limit of 60 minutes, will require 16G of RAM and 4 CPUs. Some clusters also require you to specify the **partition** you want to run your @@ -1945,7 +2247,7 @@ rule testrule: resources: runtime = 60, mem_mb = 16000, - cpus_per_task = 4, + tasks = 4, slurm_partition: "shared" shell: """ @@ -1961,14 +2263,14 @@ Snakemake without changing the workflow code. For example, you could create a `dardel` folder (_e.g._ in the root of your workflow) with a `config.yaml` file that contains the following: -```yaml +```{.yaml filename="dardel/config.yaml"} executor: "slurm" jobs: 100 default-resources: slurm_account: "naiss-2023-01-001" slurm_partition: "shared" mem_mb: 16000 - cpus_per_task: 4 + tasks: 1 runtime: 60 ``` @@ -1986,24 +2288,24 @@ the command line call much more succinct. 
To set rule-specific resources in the configuration profile, you can add a `set_resources:` section to the `config.yaml` file: -```yaml +```{.yaml filename="dardel/config.yaml"} executor: "slurm" jobs: 100 default-resources: slurm_account: "naiss-2023-01-001" slurm_partition: "shared" mem_mb: 16000 - cpus_per_task: 4 + tasks: 4 runtime: 60 set_resources: index_genome: runtime: 240 mem_mb: 32000 - cpus_per_task: 8 + tasks: 8 align_to_genome: runtime: 120 mem_mb: 24000 - cpus_per_task: 6 + tasks: 6 ``` In this example, the `index_genome` rule will have a runtime of 240 minutes, diff --git a/tutorials/containers/Snakefile b/tutorials/containers/Snakefile index de562cc2..5af76853 100644 --- a/tutorials/containers/Snakefile +++ b/tutorials/containers/Snakefile @@ -1,5 +1,5 @@ from snakemake.utils import min_version -min_version("5.3.0") +min_version("8.0.0") configfile: "config.yml" @@ -10,11 +10,10 @@ rule all: input: "results/tables/counts.tsv", "results/multiqc.html", - "results/rulegraph.png", "results/supplementary.html" def get_sample_url(wildcards): - return config["sample_urls"][wildcards.sample_id] + return config["samples"][wildcards.sample_id] rule get_SRA_by_accession: """ @@ -40,7 +39,7 @@ rule get_SRA_by_accession: url = get_sample_url shell: """ - wget -o {log} -O - {params.url} | seqtk sample - {params.max_reads} | gzip -c > {output[0]} + wget -q -o {log} -O - {params.url} | seqtk sample - {params.max_reads} | gzip -c > {output[0]} """ rule fastqc: @@ -71,14 +70,14 @@ rule multiqc: html="results/multiqc.html", stats="results/multiqc/multiqc_general_stats.txt" input: - expand("results/fastqc/{sample_id}_fastqc.zip", sample_id = config["sample_ids"]) + expand("results/fastqc/{sample_id}_fastqc.zip", sample_id = config["samples"].keys()) log: "results/logs/multiqc/multiqc.log" shadow: "minimal" shell: """ # Run multiQC and keep the html report - multiqc -n multiqc.html {input} 2> {log} + multiqc -n multiqc.html {input} 2>{log} mv multiqc.html {output.html} mv multiqc_data/multiqc_general_stats.txt {output.stats} """ @@ -134,7 +133,7 @@ rule index_genome: """ # Bowtie2 cannot use .gz, so unzip to a temporary file first gunzip -c {input} > tempfile - bowtie2-build tempfile results/bowtie2/{wildcards.genome_id} > {log} + bowtie2-build tempfile results/bowtie2/{wildcards.genome_id} >{log} """ rule align_to_genome: @@ -144,10 +143,10 @@ rule align_to_genome: output: # Here the sample_id wildcard is constrained with \w+ to match only # 'word characters', i.e. letters, numbers and underscore - temp("results/bam/{sample_id,\w+}.bam") + temp("results/bam/{sample_id,\\w+}.bam") input: - fastq = "data/{sample_id}.fastq.gz", - index = expand("results/bowtie2/{genome_id}.{substr}.bt2", + "data/{sample_id}.fastq.gz", + expand("results/bowtie2/{genome_id}.{substr}.bt2", genome_id=config["genome_id"], substr=["1", "2", "3", "4", "rev.1", "rev.2"]) log: @@ -155,7 +154,7 @@ rule align_to_genome: genome_id = config["genome_id"]) shell: """ - bowtie2 -x results/bowtie2/{config[genome_id]} -U {input.fastq} > {output} 2>{log} + bowtie2 -x results/bowtie2/{config[genome_id]} -U {input[0]} > {output} 2>{log} """ rule sort_bam: @@ -163,9 +162,9 @@ rule sort_bam: Sort a bam file. 
""" output: - "{prefix}.sorted.bam" + "results/bam/{sample_id}.sorted.bam" input: - "{prefix}.bam" + "results/bam/{sample_id}.bam" shell: """ samtools sort {input} > {output} @@ -179,7 +178,7 @@ rule generate_count_table: "results/tables/counts.tsv", "results/tables/counts.tsv.summary" input: - bams=expand("results/bam/{sample_id}.sorted.bam", sample_id = config["sample_ids"]), + bams=expand("results/bam/{sample_id}.sorted.bam", sample_id = config["samples"].keys()), annotation=expand("data/ref/{genome_id}.gff3.gz", genome_id = config["genome_id"]) log: "results/logs/generate_count_table.log" @@ -198,7 +197,6 @@ rule make_supplementary: counts = "results/tables/counts.tsv", summary_file = "results/tables/counts.tsv.summary", multiqc_file = "results/multiqc/multiqc_general_stats.txt", - rulegraph = "results/rulegraph.png" log: "results/logs/make_supplementary.log" params: @@ -210,19 +208,7 @@ rule make_supplementary: -P counts_file:../{input.counts} \ -P multiqc_file:../{input.multiqc_file} \ -P summary_file:../{input.summary_file} \ - -P rulegraph_file:../{input.rulegraph} \ -P srr_ids:"{params.SRR_IDs}" \ -P gsm_ids:"{params.GSM_IDs}" mv code/supplementary_material.html {output} - """ - -rule generate_rulegraph: - """ - Generate a rulegraph for the workflow. - """ - output: - "results/rulegraph.png" - shell: - """ - snakemake --rulegraph --configfile config.yml | dot -Tpng > {output} - """ + """ \ No newline at end of file diff --git a/tutorials/containers/code/supplementary_material.qmd b/tutorials/containers/code/supplementary_material.qmd index fc8d6c8c..989e6b35 100644 --- a/tutorials/containers/code/supplementary_material.qmd +++ b/tutorials/containers/code/supplementary_material.qmd @@ -9,7 +9,6 @@ params: counts_file: "results/tables/counts.tsv" multiqc_file: "results/multiqc/multiqc_general_stats.txt" summary_file: "results/tables/counts.tsv.summary" - rulegraph_file: "results/rulegraph.png" srr_ids: "SRR935090 SRR935091 SRR935092" gsm_ids: "GSM1186459 GSM1186460 GSM1186461" --- @@ -27,7 +26,6 @@ library("GEOquery") counts_file <- params$counts_file multiqc_file <- params$multiqc_file summary_file <- params$summary_file -rulegraph_file <- params$rulegraph_file srr_ids <- unlist(strsplit(params$srr_ids, " ")) gsm_ids <- unlist(strsplit(params$gsm_ids, " ")) ``` diff --git a/tutorials/containers/config.yml b/tutorials/containers/config.yml index 73e41615..3fd9764b 100644 --- a/tutorials/containers/config.yml +++ b/tutorials/containers/config.yml @@ -4,7 +4,7 @@ sample_ids_geo: ["GSM1186459", "GSM1186460", "GSM1186461"] series_id_geo: "GSE48896" # URLs to gzipped fastq files for each sample in remote repository -sample_urls: +samples: SRR935090: "https://figshare.scilifelab.se/ndownloader/files/39539767" SRR935091: "https://figshare.scilifelab.se/ndownloader/files/39539770" SRR935092: "https://figshare.scilifelab.se/ndownloader/files/39539773" diff --git a/tutorials/containers/environment.yml b/tutorials/containers/environment.yml index ee440128..95e54aa1 100644 --- a/tutorials/containers/environment.yml +++ b/tutorials/containers/environment.yml @@ -5,7 +5,7 @@ dependencies: - python - fastqc - seqtk - - snakemake + - snakemake>=8 - multiqc - jupyter - bowtie2 diff --git a/tutorials/snakemake/environment.yml b/tutorials/snakemake/environment.yml index 6de7ac7e..b3a19c8b 100644 --- a/tutorials/snakemake/environment.yml +++ b/tutorials/snakemake/environment.yml @@ -6,7 +6,7 @@ dependencies: - python - fastqc - seqtk - - snakemake + - snakemake>=8 - multiqc - bowtie2 - tbb diff --git 
diff --git a/tutorials/snakemake/snakefile_mrsa.smk b/tutorials/snakemake/snakefile_mrsa.smk
index 00db0b8b..f008f702 100644
--- a/tutorials/snakemake/snakefile_mrsa.smk
+++ b/tutorials/snakemake/snakefile_mrsa.smk
@@ -1,5 +1,5 @@
 from snakemake.utils import min_version
-min_version("5.3.0")
+min_version("8.0.0")

 rule all:
     """
@@ -7,8 +7,7 @@
     """
     input:
         "results/tables/counts.tsv",
-        "results/multiqc/multiqc.html",
-        "results/rulegraph.png"
+        "results/multiqc/multiqc.html"

 def get_sample_url(wildcards):
     samples = {
@@ -28,7 +27,7 @@ rule get_SRA_by_accession:
         url = get_sample_url
     shell:
         """
-        wget -O - {params.url} | seqtk sample - 25000 | gzip -c > {output[0]}
+        wget -q -O - {params.url} | seqtk sample - 25000 | gzip -c > {output[0]}
         """

 rule fastqc:
@@ -122,7 +121,7 @@ rule align_to_genome:
     Align a fastq file to a genome index using Bowtie 2.
     """
     output:
-        "results/bam/{sample_id,\w+}.bam"
+        "results/bam/{sample_id,\\w+}.bam"
     input:
         "data/{sample_id}.fastq.gz",
         "results/bowtie2/NCTC8325.1.bt2",
@@ -163,15 +162,4 @@ rule generate_count_table:
     shell:
         """
         featureCounts -t gene -g gene_id -a {input.annotation} -o {output} {input.bams}
-        """
-
-rule generate_rulegraph:
-    """
-    Generate a rulegraph for the workflow.
-    """
-    output:
-        "results/rulegraph.png"
-    shell:
-        """
-        snakemake --snakefile snakefile_mrsa.smk --config max_reads=0 --rulegraph | dot -Tpng > {output}
-        """
+        """
\ No newline at end of file
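With the `generate_rulegraph` rules and the `results/rulegraph.png` targets removed from both Snakefiles, the workflows no longer produce a rule graph on their own. If one is still wanted, the shell command of the removed rule can simply be run by hand; a minimal sketch based on that command (assuming Graphviz, which provides `dot`, is available in the active environment and that you run it from `tutorials/snakemake`):

```bash
# Manually generate the graph that the removed generate_rulegraph rule produced
mkdir -p results
snakemake --snakefile snakefile_mrsa.smk --config max_reads=0 --rulegraph | dot -Tpng > results/rulegraph.png
```

The same approach works for the containers tutorial, mirroring the rule removed there: `snakemake --rulegraph --configfile config.yml | dot -Tpng > results/rulegraph.png`.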