Skip to content

Commit

Permalink
feat: pass Snakemake profile to ZARP workflow (#77)
Browse files Browse the repository at this point in the history
  • Loading branch information
uniqueg authored Sep 14, 2023
1 parent 102cdd4 commit d9503aa
Show file tree
Hide file tree
Showing 7 changed files with 34 additions and 1 deletion.
1 change: 1 addition & 0 deletions docs/guides/initialization.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ The following configuration options are available.
| `genome_assemblies_map` | A headerless 3-column semicolon-separated mapping table of organism/source trivial names (e.g., `homo_sapiens`), optional comma-separated aliases such as NCBI taxon IDs and/or organism/source short names (e.g., `7227,dmelanogaster`) and a corresponding genome assembly name (e.g., `GRCm39`); a table in the required format is shipped with _ZARP-cli_ at the default location and can be amended with additional aliases; note that for [`genomepy`][genomepy] to be able to pull genome annotations for organisms/sources that [HTSinfer][htsinfer] inferred, NCBI taxon ID aliases are _required_ | `./data/genome_assemblies.map` relative to the location of the ZARP-cli repository |
| `resources_version` | Whether to always download the latest available version of genome annotations for a given organism/source from Ensembl (enter `None`; default) or whether to use a specific version of the corresponding Ensembl database (e.g., `100`); note that the different Ensembl databases (e.g., for fungi, plants) use a different versioning scheme, so pinning a particular database version may lead to unexpected outcomes | `None` |
| `rule_config` | A configuration file for the _ZARP_ workflow that sets specific parameters for each workflow step ("rule"); see [ZARP][zarp] documentation for details | `None` |
| `profile` | Path to the [Snakemake profile][snakemake-profiles] to be used for the _ZARP_ workflow; use this to optimize _ZARP_ for your specific compute environment | `None` |
| `fragment_length_distribution_mean` | HTSinfer currently is unable to infer the mean of the fragment length distribution of RNA-seq libraries; however, this value is required for tools [`kallisto`][kallisto] and [`salmon`][salmon] -which are executed as part of _ZARP_- when run on single-ended libraries only (for paired-ended libraries, the tools are able to infer this parameter from the data); the value provided here is used as a fallback if the value was not determined experimentally (e.g., with [Bioanalyzer][bioanalyzer] instruments) and provided via a sample table | `300` |
| `fragment_length_distribution_sd` | Analogous to `fragment_length_distribution_mean` above, but this parameter is for the _standard deviation_ of the fragment length distribution | `100` |
| `author` | Name of the person or organization executing the _ZARP-cli_ runs; will be added to the _ZARP_ report | `None` |
Expand Down
1 change: 1 addition & 0 deletions docs/includes/references.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
[salmon]: <https://github.com/COMBINE-lab/salmon>
[singularity]: <https://sylabs.io/singularity/>
[snakemake]: <https://github.com/snakemake/snakemake>
[snakemake-profiles]: <https://snakemake.readthedocs.io/en/stable/executing/cli.html#profiles>
[sra]: <https://www.ncbi.nlm.nih.gov/sra>
[sra-toolkit]: <https://github.com/ncbi/sra-tools>
[zarp]: <https://github.com/zavolanlab/zarp>
Expand Down
14 changes: 14 additions & 0 deletions tests/snakemake/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,20 @@ def test_compile_command_config_file(self, tmpdir):
cmd = my_run.compile_command(snakefile=snakefile)
assert "--configfile" in cmd

def test_compile_command_profile(self, tmpdir):
    """Compile a Snakemake command when a profile is configured.

    Sets ``profile`` on a deep copy of the default run configuration and
    checks that ``--profile`` appears in the command returned by
    ``SnakemakeExecutor.compile_command``. The command is only compiled,
    not executed.

    Args:
        tmpdir: Temporary directory fixture; used as the working directory,
            as the location for the generated Snakefile and as the
            executor's ``exec_dir``.
    """
    os.chdir(tmpdir)
    try:
        # NOTE(review): presumably `create_snakefile` writes a file named
        # "Snakefile"; the profile flag is only added for that name --
        # confirm against the helper.
        snakefile = create_snakefile(dir=Path(tmpdir))
        # Deep copy so that the shared default config is not mutated.
        run_config = default_run_config.copy(deep=True)
        run_config.profile = Path("/path/to/profile")
        my_run = SnakemakeExecutor(
            run_config=run_config,
            exec_dir=tmpdir,
        )
        cmd = my_run.compile_command(snakefile=snakefile)
        assert "--profile" in cmd
    finally:
        # Restore the working directory even if the test fails, so that
        # subsequent tests do not run inside the temporary directory.
        os.chdir(default_cwd)

@pytest.mark.parametrize(
"dependency_embedding",
[
Expand Down
11 changes: 11 additions & 0 deletions zarp/config/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class ArgParser:
"execution_mode",
"genome_assemblies_map",
"identifier",
"profile",
"resources_version",
"rule_config",
"working_directory",
Expand Down Expand Up @@ -425,6 +426,16 @@ def _set_run_arguments(
" generated"
),
)
argument_group.add_argument(
"--profile",
default=None,
type=lambda p: Path(p).absolute(),
metavar="PATH",
help=(
"Snakemake profile for ZARP workflow; refer to ZARP"
" documentation for details"
),
)
argument_group.add_argument(
"--resources-version",
default=None,
Expand Down
3 changes: 3 additions & 0 deletions zarp/config/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ class InitRun(CustomBaseModel):
dependency_embedding: Dependency embedding strategy to use.
execution_mode: Execution mode to use.
genome_assemblies_map: Genome assemblies mapping file.
profile: Snakemake profile for ZARP workflow.
resources_version: Version of Ensembl genome resources to use when
resources are not provided.
rule_config: ZARP rule configuration.
Expand All @@ -84,6 +85,7 @@ class InitRun(CustomBaseModel):
dependency_embedding: Dependency embedding strategy to use.
execution_mode: Execution mode to use.
genome_assemblies_map: Genome assemblies mapping file.
profile: Snakemake profile for ZARP workflow.
resources_version: Version of Ensembl genome resources to use when
resources are not provided.
rule_config: ZARP rule configuration.
Expand All @@ -103,6 +105,7 @@ class InitRun(CustomBaseModel):
)
resources_version: Optional[int] = None
rule_config: Optional[Path] = None
profile: Optional[Path] = None


class InitSample(CustomBaseModel):
Expand Down
3 changes: 3 additions & 0 deletions zarp/snakemake/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ def compile_command(self, snakefile: Path) -> List[str]:
cmd_ls.extend(["--directory", str(self.exec_dir)])
if self.config_file is not None:
cmd_ls.extend(["--configfile", str(self.config_file)])
if self.run_config.profile is not None:
if snakefile.name == "Snakefile":
cmd_ls.extend(["--profile", str(self.run_config.profile)])
if self.run_config.execution_mode == "DRY_RUN":
cmd_ls.append("--dry-run")
if self.run_config.dependency_embedding == "CONDA":
Expand Down
2 changes: 1 addition & 1 deletion zarp/version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""Single source of truth for package version."""

__version__ = "0.3.0"
__version__ = "0.4.0"

0 comments on commit d9503aa

Please sign in to comment.