From c964c801368794d2091290ee33b249701dafc729 Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Wed, 6 Nov 2024 16:28:07 -0500 Subject: [PATCH 01/28] Add sample_name test is still not working locally --- assets/schema_input.json | 9 ++++++-- conf/iridanext.config | 2 +- conf/modules.config | 7 +++++++ modules/local/prefetchchecker/main.nf | 14 ++++++++++--- nextflow.config | 4 ++-- tests/data/add-samplesheet.csv | 5 +++++ tests/data/samplesheet-addsamplename.csv | 3 +++ tests/modules/fasterqdump/main.nf.test | 2 +- tests/modules/fasterqdump/main.nf.test.snap | 10 ++++++--- tests/pipelines/fetchdatairidanext.nf.test | 21 +++++++++++++++++++ .../main.nf.test | 12 +++++------ .../main.nf.test.snap | 14 ++++++++----- workflows/fetchdatairidanext.nf | 2 +- 13 files changed, 81 insertions(+), 24 deletions(-) create mode 100644 tests/data/add-samplesheet.csv create mode 100644 tests/data/samplesheet-addsamplename.csv diff --git a/assets/schema_input.json b/assets/schema_input.json index edcf572..dc3b468 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -9,11 +9,16 @@ "properties": { "sample": { "type": "string", - "pattern": "^\\S+$", - "meta": ["id"], + "pattern": "^[A-Za-z0-9_.-]+$", + "meta": ["irida_id"], "unique": true, "errorMessage": "Sample name must be provided and cannot contain spaces" }, + "sample_name": { + "type": "string", + "meta": ["id"], + "errorMessage": "Optional. 
Used to override sample when used in tools like IRIDA-Next" + }, "insdc_accession": { "type": "string", "pattern": "^(SRR|ERR|DRR)\\S+$", diff --git a/conf/iridanext.config b/conf/iridanext.config index b336dfb..739532c 100644 --- a/conf/iridanext.config +++ b/conf/iridanext.config @@ -5,7 +5,7 @@ iridanext { overwrite = true validate = true files { - idkey = "id" + idkey = "irida_id" global = ["**/prefetch/failures_report.csv"] samples = ["**/reads/*.fastq.gz"] } diff --git a/conf/modules.config b/conf/modules.config index 7573f95..f68b79d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -37,5 +37,12 @@ process { mode: params.publish_dir_mode, pattern: 'reads/*.fastq.gz' ] + def rename = {String sample_name, String accession -> "--outfile ${sample_name}_${accession}"} + + ext.args = { + [ + meta.id ? rename(meta.id, meta.insdc_accession) : "" + ].join(" ") + } } } diff --git a/modules/local/prefetchchecker/main.nf b/modules/local/prefetchchecker/main.nf index 4fe0a8d..852061a 100644 --- a/modules/local/prefetchchecker/main.nf +++ b/modules/local/prefetchchecker/main.nf @@ -11,11 +11,19 @@ process PREFETCH_CHECKER { exec: task.workDir.resolve("failures_report.csv").withWriter { writer -> - writer.writeLine("sample,error_accession") // header + sample_name = false + failures.each { if ( it[0].id != null) { + sample_name = true + } + } // Failures - if (failures.size() > 0) { - failures.each { writer.writeLine "${it[0].id},${it[1]}" } + if (failures.size() > 0 && sample_name) { + writer.writeLine("sample,sample_name,error_accession") // header + failures.each { writer.writeLine "${it[0].irida_id},${it[0].id},${it[1]}" } + } else { + writer.writeLine("sample,error_accession") // header + failures.each { writer.writeLine "${it[0].irida_id},${it[1]}" } } } } diff --git a/nextflow.config b/nextflow.config index de1725a..85f9022 100644 --- a/nextflow.config +++ b/nextflow.config @@ -28,7 +28,7 @@ params { config_profile_description = null 
config_profile_contact = null config_profile_url = null - + // Max resource options // Defaults only, expecting to be overwritten @@ -78,7 +78,7 @@ profiles { } docker { docker.enabled = true - docker.userEmulation = true + docker.runOptions = '-u $(id -u):$(id -g)' conda.enabled = false singularity.enabled = false podman.enabled = false diff --git a/tests/data/add-samplesheet.csv b/tests/data/add-samplesheet.csv new file mode 100644 index 0000000..4512f0a --- /dev/null +++ b/tests/data/add-samplesheet.csv @@ -0,0 +1,5 @@ +sample,sample_name,insdc_accession +SAMPLE1,S1,ERR1109373 +ERROR1,S2,SRR999908 +ERROR2,S3,SRR999934 +SAMPLE2,S4,SRR13191702 diff --git a/tests/data/samplesheet-addsamplename.csv b/tests/data/samplesheet-addsamplename.csv new file mode 100644 index 0000000..21ced63 --- /dev/null +++ b/tests/data/samplesheet-addsamplename.csv @@ -0,0 +1,3 @@ +sample,sample_name,insdc_accession +SAMPLE1,S1,ERR1109373 +SAMPLE2,,SRR13191702 diff --git a/tests/modules/fasterqdump/main.nf.test b/tests/modules/fasterqdump/main.nf.test index 104c31f..2ae17c3 100644 --- a/tests/modules/fasterqdump/main.nf.test +++ b/tests/modules/fasterqdump/main.nf.test @@ -11,7 +11,7 @@ nextflow_process { when { process { """ - input[0] = Channel.of([ [id: "SAMPLE1"], file("$baseDir/tests/data/ERR1109373.sra", checkIfExists: true) ]) + input[0] = Channel.of([ [irida_id: "SAMPLE1"], file("$baseDir/tests/data/ERR1109373.sra", checkIfExists: true) ]) input[1] = [] input[2] = [] """ diff --git a/tests/modules/fasterqdump/main.nf.test.snap b/tests/modules/fasterqdump/main.nf.test.snap index 6c6e406..0bea139 100644 --- a/tests/modules/fasterqdump/main.nf.test.snap +++ b/tests/modules/fasterqdump/main.nf.test.snap @@ -5,7 +5,7 @@ "0": [ [ { - "id": "SAMPLE1" + "irida_id": "SAMPLE1" }, [ "ERR1109373.fastq.gz:md5,b9acccb3c5d317a99f604375a09991aa", @@ -20,7 +20,7 @@ "reads": [ [ { - "id": "SAMPLE1" + "irida_id": "SAMPLE1" }, [ "ERR1109373.fastq.gz:md5,b9acccb3c5d317a99f604375a09991aa", @@ -34,6 
+34,10 @@ ] } ], - "timestamp": "2024-01-25T23:20:38.2550152" + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-31T11:23:17.630455595" } } \ No newline at end of file diff --git a/tests/pipelines/fetchdatairidanext.nf.test b/tests/pipelines/fetchdatairidanext.nf.test index 4f60f64..a1392ab 100644 --- a/tests/pipelines/fetchdatairidanext.nf.test +++ b/tests/pipelines/fetchdatairidanext.nf.test @@ -54,4 +54,25 @@ nextflow_pipeline { assert path("$launchDir/results/reads/SRR999934_2.fastq.gz").exists() == false } } + + test("Include sample_name in samplesheet") { + tag "sample_name" + when { + params { + input = "$baseDir/tests/data/add-samplesheet.csv" + outdir = "output" + } + } + + then { + assert workflow.success + assert path("$launchDir/output").exists() + + def lines = path("$launchDir/output/prefetch/failures_report.csv").readLines() + assert lines.size() == 3 + assert lines.contains("sample,sample_name,error_accession") + assert lines.contains("ERROR1,S2,SRR999908") + assert lines.contains("ERROR2,S3,INVALID!!") + } + } } diff --git a/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test b/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test index e4ee30f..1050501 100644 --- a/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test +++ b/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test @@ -16,8 +16,8 @@ nextflow_workflow { workflow { """ input[0] = Channel.of( - [[ id:'test_single_end', single_end:true ], 'DRR000774'], - [[ id:'test_paired_end', single_end:false ], 'SRR11140744'] + [[ irida_id:'test_single_end', single_end:true ], 'DRR000774'], + [[ irida_id:'test_paired_end', single_end:false ], 'SRR11140744'] ) input[1] = [] """ @@ -49,10 +49,10 @@ nextflow_workflow { workflow { """ input[0] = Channel.of( - [[ id:'SAMPLE1', single_end:false ], 'ERR1109373'], - [[ id:'ERROR1', single_end:false ], 'SRR999908'], - [[ id:'ERROR2', 
single_end:false ], 'INVALID!!'], - [[ id:'SAMPLE2', single_end:false ], 'SRR13191702'] + [[ irida_id:'SAMPLE1', single_end:false ], 'ERR1109373'], + [[ irida_id:'ERROR1', single_end:false ], 'SRR999908'], + [[ irida_id:'ERROR2', single_end:false ], 'INVALID!!'], + [[ irida_id:'SAMPLE2', single_end:false ], 'SRR13191702'] ) input[1] = [] """ diff --git a/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test.snap b/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test.snap index 8c168d8..566f55d 100644 --- a/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test.snap +++ b/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test.snap @@ -5,7 +5,7 @@ "0": [ [ { - "id": "test_paired_end", + "irida_id": "test_paired_end", "single_end": false }, [ @@ -16,7 +16,7 @@ ], [ { - "id": "test_single_end", + "irida_id": "test_single_end", "single_end": true }, "DRR000774.fastq.gz:md5,a110f93f7a9b0271455f5a435bce73c7" @@ -30,7 +30,7 @@ "reads": [ [ { - "id": "test_paired_end", + "irida_id": "test_paired_end", "single_end": false }, [ @@ -41,7 +41,7 @@ ], [ { - "id": "test_single_end", + "irida_id": "test_single_end", "single_end": true }, "DRR000774.fastq.gz:md5,a110f93f7a9b0271455f5a435bce73c7" @@ -54,6 +54,10 @@ ] } ], - "timestamp": "2024-01-25T22:06:20.7303705" + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-11-05T15:53:20.740235848" } } \ No newline at end of file diff --git a/workflows/fetchdatairidanext.nf b/workflows/fetchdatairidanext.nf index 74b5fcc..bb8a5f2 100644 --- a/workflows/fetchdatairidanext.nf +++ b/workflows/fetchdatairidanext.nf @@ -56,7 +56,7 @@ workflow FETCHDATAIRIDANEXT { // Create a new channel of metadata from a sample sheet // NB: `input` corresponds to `params.input` and associated sample sheet schema input = Channel.fromSamplesheet("input") - meta_accessions = input.map {meta -> tuple(["id": meta.id.first()], 
meta.insdc_accession.first())} + meta_accessions = input.map {meta -> tuple(["id": meta.id.first(), "irida_id": meta.irida_id.first(), "insdc_accession": meta.insdc_accession.first()], meta.insdc_accession.first())} FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS ( ch_sra_ids = meta_accessions, From 101bbe52bf31159dbee5a835e05ce5fc11f9eb51 Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Thu, 7 Nov 2024 14:37:07 -0500 Subject: [PATCH 02/28] Fixes issues with fasterq-dump --- modules/local/sratools/fasterqdump/main.nf | 4 ++-- tests/pipelines/fetchdatairidanext.nf.test | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/local/sratools/fasterqdump/main.nf b/modules/local/sratools/fasterqdump/main.nf index 66ba956..b7b9bfe 100644 --- a/modules/local/sratools/fasterqdump/main.nf +++ b/modules/local/sratools/fasterqdump/main.nf @@ -4,8 +4,8 @@ process SRATOOLS_FASTERQDUMP { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:2f4a4c900edd6801ff0068c2b3048b4459d119eb-0' : - 'biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:2f4a4c900edd6801ff0068c2b3048b4459d119eb-0' }" + 'https://depot.galaxyproject.org/singularity/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' : + 'quay.io/biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' }" input: tuple val(meta), path(sra) diff --git a/tests/pipelines/fetchdatairidanext.nf.test b/tests/pipelines/fetchdatairidanext.nf.test index a1392ab..0acd0f3 100644 --- a/tests/pipelines/fetchdatairidanext.nf.test +++ b/tests/pipelines/fetchdatairidanext.nf.test @@ -72,7 +72,7 @@ nextflow_pipeline { assert lines.size() == 3 assert lines.contains("sample,sample_name,error_accession") assert lines.contains("ERROR1,S2,SRR999908") - assert lines.contains("ERROR2,S3,INVALID!!") + assert lines.contains("ERROR2,S3,SRR999934") } } } From 5185a751c80d5e8eeb0148b06062e3e805b2745e Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Thu, 7 Nov 2024 14:42:56 -0500 Subject: [PATCH 03/28] Added another test, test read names with sample_names --- tests/pipelines/fetchdatairidanext.nf.test | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/pipelines/fetchdatairidanext.nf.test b/tests/pipelines/fetchdatairidanext.nf.test index 0acd0f3..b6779c1 100644 --- a/tests/pipelines/fetchdatairidanext.nf.test +++ b/tests/pipelines/fetchdatairidanext.nf.test @@ -67,12 +67,18 @@ nextflow_pipeline { then { assert workflow.success assert path("$launchDir/output").exists() - + // Check that the failures report has a sample_name column when included in the samplesheet def lines = path("$launchDir/output/prefetch/failures_report.csv").readLines() assert lines.size() == 3 assert lines.contains("sample,sample_name,error_accession") assert 
lines.contains("ERROR1,S2,SRR999908") assert lines.contains("ERROR2,S3,SRR999934") + + // Check reads have been produced with modified names (i.e. suffixed with sample_name): + assert path("$launchDir/output/reads/S1_ERR1109373_1.fastq.gz").exists() + assert path("$launchDir/output/reads/S1_ERR1109373_2.fastq.gz").exists() + assert path("$launchDir/output/reads/S4_SRR13191702_1.fastq.gz").exists() + assert path("$launchDir/output/reads/S4_SRR13191702_2.fastq.gz").exists() } } } From 46bb5fa170d1414c881e831730a868b5bf85b228 Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Thu, 7 Nov 2024 14:50:31 -0500 Subject: [PATCH 04/28] Fix nfcore lint issues --- .nf-core.yml | 3 + assets/schema_input.json | 2 +- nextflow_schema.json | 2 +- tests/pipelines/fetchdatairidanext.nf.test | 84 ---------------------- 4 files changed, 5 insertions(+), 86 deletions(-) delete mode 100644 tests/pipelines/fetchdatairidanext.nf.test diff --git a/.nf-core.yml b/.nf-core.yml index 1f79c1a..3d97354 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -34,5 +34,8 @@ lint: - custom_config - manifest.name - manifest.homePage + - params.max_cpus + - params.max_memory + - params.max_time readme: - nextflow_badge diff --git a/assets/schema_input.json b/assets/schema_input.json index dc3b468..f1060cf 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -1,5 +1,5 @@ { - "$schema": "http://json-schema.org/draft-07/schema", + "$schema": "https://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/phac-nml/fetchdatairidanext/main/assets/schema_input.json", "title": "phac-nml/fetchdatairidanext pipeline - params.input schema", "description": "Schema for the file provided with params.input", diff --git a/nextflow_schema.json b/nextflow_schema.json index e7709da..a5f1c26 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,5 +1,5 @@ { - "$schema": "http://json-schema.org/draft-07/schema", + "$schema": "https://json-schema.org/draft-07/schema", "$id": 
"https://raw.githubusercontent.com/phac-nml/fetchdatairidanext/main/nextflow_schema.json", "title": "phac-nml/fetchdatairidanext pipeline parameters", "description": "IRIDA Next NCBI Download pipeline", diff --git a/tests/pipelines/fetchdatairidanext.nf.test b/tests/pipelines/fetchdatairidanext.nf.test deleted file mode 100644 index b6779c1..0000000 --- a/tests/pipelines/fetchdatairidanext.nf.test +++ /dev/null @@ -1,84 +0,0 @@ -nextflow_pipeline { - - name "Test fetching small datasets from NCBI" - script "main.nf" - - test("basic integration test") { - - when { - params { - input = "$baseDir/tests/data/samplesheet.csv" - outdir = "test1_out" - } - } - - then { - assert workflow.success - - // IRIDA Next output file - assert path("$launchDir/test1_out/iridanext.output.json").json == path("$baseDir/tests/data/test1_iridanext.output.json").json - - // Output data - assert path("$launchDir/test1_out/reads/ERR1109373_1.fastq.gz").linesGzip.size() == 512 - assert path("$launchDir/test1_out/reads/ERR1109373_2.fastq.gz").linesGzip.size() == 512 - assert path("$launchDir/test1_out/reads/SRR13191702_1.fastq.gz").linesGzip.size() == 364 - assert path("$launchDir/test1_out/reads/SRR13191702_2.fastq.gz").linesGzip.size() == 364 - } - } - - test("integration test with prefetch failures") { - - when { - params { - input = "$baseDir/tests/data/errorsheet.csv" - outdir = "results" - } - } - - then { - assert workflow.success - - // IRIDA Next output file - assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/prefetch_errors_iridanext.output.json").json - - // Output data: - assert path("$launchDir/results/reads/ERR1109373_1.fastq.gz").linesGzip.size() == 512 - assert path("$launchDir/results/reads/ERR1109373_2.fastq.gz").linesGzip.size() == 512 - assert path("$launchDir/results/reads/SRR13191702_1.fastq.gz").linesGzip.size() == 364 - assert path("$launchDir/results/reads/SRR13191702_2.fastq.gz").linesGzip.size() == 364 - - // These files 
should have failed, and have no output reads: - assert path("$launchDir/results/reads/SRR999908_1.fastq.gz").exists() == false - assert path("$launchDir/results/reads/SRR999908_2.fastq.gz").exists() == false - assert path("$launchDir/results/reads/SRR999934_1.fastq.gz").exists() == false - assert path("$launchDir/results/reads/SRR999934_2.fastq.gz").exists() == false - } - } - - test("Include sample_name in samplesheet") { - tag "sample_name" - when { - params { - input = "$baseDir/tests/data/add-samplesheet.csv" - outdir = "output" - } - } - - then { - assert workflow.success - assert path("$launchDir/output").exists() - // Check that the failures report has a sample_name column when included in the samplesheet - def lines = path("$launchDir/output/prefetch/failures_report.csv").readLines() - assert lines.size() == 3 - assert lines.contains("sample,sample_name,error_accession") - assert lines.contains("ERROR1,S2,SRR999908") - assert lines.contains("ERROR2,S3,SRR999934") - - // Check reads have been produced with modified names (i.e. 
suffixed with sample_name): - assert path("$launchDir/output/reads/S1_ERR1109373_1.fastq.gz").exists() - assert path("$launchDir/output/reads/S1_ERR1109373_2.fastq.gz").exists() - assert path("$launchDir/output/reads/S4_SRR13191702_1.fastq.gz").exists() - assert path("$launchDir/output/reads/S4_SRR13191702_2.fastq.gz").exists() - } - } -} From 0849cc7d47576940daa8aa7e85231cbeb5c1e4ce Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Thu, 7 Nov 2024 15:10:13 -0500 Subject: [PATCH 05/28] Modified linting with editorconfig-checker --- .github/workflows/linting.yml | 13 ------------- .pre-commit-config.yaml | 5 +++++ 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 544a5a8..6d0938e 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -11,19 +11,6 @@ on: types: [published] jobs: - EditorConfig: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - - uses: actions/setup-node@v3 - - - name: Install editorconfig-checker - run: npm install -g editorconfig-checker - - - name: Run ECLint check - run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile\|.sra') - Prettier: runs-on: ubuntu-latest steps: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0c31cdb..98a90d1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,3 +3,8 @@ repos: rev: "v2.7.1" hooks: - id: prettier + - repo: https://github.com/editorconfig-checker/editorconfig-checker.python + rev: "2.7.3" + hooks: + - id: editorconfig-checker + alias: ec From 28fa88f8043dd1776a04d87cc011d9bb2a99fe19 Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Thu, 7 Nov 2024 15:12:51 -0500 Subject: [PATCH 06/28] Fix lint comment yml --- .github/workflows/linting_comment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index b706875..42e519b 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@f6b0bace624032e30a85a8fd9c1a7f8f611f5737 # v3 + uses: dawidd6/action-download-artifact@bf251b5aa9c2f7eeb574a96ee720e24f801b7c11 # v6 with: workflow: linting.yml workflow_conclusion: completed From f025b1b4065d9b860ff4dcb37993ce7129fb3bbe Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Mon, 11 Nov 2024 21:01:16 -0500 Subject: [PATCH 07/28] Providing a NCBI user setting file for tests --- modules.json | 7 ++++++- modules/local/sratools/fasterqdump/environment.yml | 2 +- tests/data/ncbi_user_settings.mkfg | 2 ++ tests/modules/fasterqdump/main.nf.test | 2 +- tests/modules/fasterqdump/main.nf.test.snap | 6 +++--- .../main.nf.test.snap | 10 +++++----- 6 files changed, 18 insertions(+), 11 deletions(-) create mode 100644 tests/data/ncbi_user_settings.mkfg diff --git a/modules.json b/modules.json index 6250d22..cb385c1 100644 --- a/modules.json +++ b/modules.json @@ -15,9 +15,14 @@ "git_sha": "e719354ba77df0a1bd310836aa2039b45c29d620", "installed_by": ["modules"] }, + "sratools/fasterqdump": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, "sratools/prefetch": { "branch": "master", - "git_sha": "e719354ba77df0a1bd310836aa2039b45c29d620", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["modules"] } } diff --git a/modules/local/sratools/fasterqdump/environment.yml b/modules/local/sratools/fasterqdump/environment.yml index 4011b69..dd0faa5 100644 --- a/modules/local/sratools/fasterqdump/environment.yml +++ b/modules/local/sratools/fasterqdump/environment.yml @@ -4,5 +4,5 @@ channels: - bioconda - defaults dependencies: - - bioconda::sra-tools=3.0.8 + - 
bioconda::sra-tools=2.11.0 - conda-forge::pigz=2.6 diff --git a/tests/data/ncbi_user_settings.mkfg b/tests/data/ncbi_user_settings.mkfg new file mode 100644 index 0000000..3c8df9f --- /dev/null +++ b/tests/data/ncbi_user_settings.mkfg @@ -0,0 +1,2 @@ +/LIBS/GUID = "5b0d4b7d-88c7-4802-98fd-e3afd06feb32" +/libs/cloud/report_instance_identity = "true" diff --git a/tests/modules/fasterqdump/main.nf.test b/tests/modules/fasterqdump/main.nf.test index 2ae17c3..52f3d11 100644 --- a/tests/modules/fasterqdump/main.nf.test +++ b/tests/modules/fasterqdump/main.nf.test @@ -12,7 +12,7 @@ nextflow_process { process { """ input[0] = Channel.of([ [irida_id: "SAMPLE1"], file("$baseDir/tests/data/ERR1109373.sra", checkIfExists: true) ]) - input[1] = [] + input[1] = file('$baseDir/tests/data/ncbi_user_settings.mkfg', checkIfExists: true) input[2] = [] """ } diff --git a/tests/modules/fasterqdump/main.nf.test.snap b/tests/modules/fasterqdump/main.nf.test.snap index 0bea139..8399ab9 100644 --- a/tests/modules/fasterqdump/main.nf.test.snap +++ b/tests/modules/fasterqdump/main.nf.test.snap @@ -15,7 +15,7 @@ ] ], "1": [ - "versions.yml:md5,a3d61a9761e1606ef8459f0b68821d7a" + "versions.yml:md5,6ff2d50b15c3f0eb9c72cd13a4a20295" ], "reads": [ [ @@ -30,7 +30,7 @@ ] ], "versions": [ - "versions.yml:md5,a3d61a9761e1606ef8459f0b68821d7a" + "versions.yml:md5,6ff2d50b15c3f0eb9c72cd13a4a20295" ] } ], @@ -38,6 +38,6 @@ "nf-test": "0.9.0", "nextflow": "24.04.4" }, - "timestamp": "2024-10-31T11:23:17.630455595" + "timestamp": "2024-11-11T20:36:55.563890217" } } \ No newline at end of file diff --git a/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test.snap b/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test.snap index 566f55d..d51d6dd 100644 --- a/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test.snap +++ b/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test.snap @@ -24,8 +24,8 @@ ], "1": [ 
"versions.yml:md5,1a2218ff913fc33408bffccb081b5048", - "versions.yml:md5,98d78bba9f3da39a0b7db6e9c7dcc224", - "versions.yml:md5,9c558ff624585a6eee82a19c8c0136db" + "versions.yml:md5,2f3b3a13b36dabf13f09327613d5558d", + "versions.yml:md5,53d6e983afde3a28add2ffc6b7eba4f3" ], "reads": [ [ @@ -49,8 +49,8 @@ ], "versions": [ "versions.yml:md5,1a2218ff913fc33408bffccb081b5048", - "versions.yml:md5,98d78bba9f3da39a0b7db6e9c7dcc224", - "versions.yml:md5,9c558ff624585a6eee82a19c8c0136db" + "versions.yml:md5,2f3b3a13b36dabf13f09327613d5558d", + "versions.yml:md5,53d6e983afde3a28add2ffc6b7eba4f3" ] } ], @@ -58,6 +58,6 @@ "nf-test": "0.9.0", "nextflow": "24.04.4" }, - "timestamp": "2024-11-05T15:53:20.740235848" + "timestamp": "2024-11-11T20:52:02.578591475" } } \ No newline at end of file From 201d5e4705de0a2fca19a87a925aac80ad0e99ab Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Mon, 11 Nov 2024 21:01:27 -0500 Subject: [PATCH 08/28] Providing a NCBI user setting file for tests --- .../nf-core/sratools/prefetch/environment.yml | 7 -- modules/nf-core/sratools/prefetch/main.nf | 35 --------- modules/nf-core/sratools/prefetch/meta.yml | 56 --------------- .../prefetch/templates/retry_with_backoff.sh | 55 -------------- .../sratools/prefetch/tests/main.nf.test | 55 -------------- .../sratools/prefetch/tests/main.nf.test.snap | 72 ------------------- .../nf-core/sratools/prefetch/tests/tags.yml | 2 - 7 files changed, 282 deletions(-) delete mode 100644 modules/nf-core/sratools/prefetch/environment.yml delete mode 100644 modules/nf-core/sratools/prefetch/main.nf delete mode 100644 modules/nf-core/sratools/prefetch/meta.yml delete mode 100755 modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh delete mode 100644 modules/nf-core/sratools/prefetch/tests/main.nf.test delete mode 100644 modules/nf-core/sratools/prefetch/tests/main.nf.test.snap delete mode 100644 modules/nf-core/sratools/prefetch/tests/tags.yml diff --git 
a/modules/nf-core/sratools/prefetch/environment.yml b/modules/nf-core/sratools/prefetch/environment.yml deleted file mode 100644 index cfc7d9a..0000000 --- a/modules/nf-core/sratools/prefetch/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: sratools_prefetch -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bioconda::sra-tools=3.0.8 diff --git a/modules/nf-core/sratools/prefetch/main.nf b/modules/nf-core/sratools/prefetch/main.nf deleted file mode 100644 index 3c30739..0000000 --- a/modules/nf-core/sratools/prefetch/main.nf +++ /dev/null @@ -1,35 +0,0 @@ -process SRATOOLS_PREFETCH { - tag "$id" - label 'process_low' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/sra-tools:3.0.8--h9f5acd7_0' : - 'biocontainers/sra-tools:3.0.8--h9f5acd7_0' }" - - input: - tuple val(meta), val(id) - path ncbi_settings - path certificate - - output: - tuple val(meta), path(id), emit: sra - path 'versions.yml' , emit: versions - - when: - task.ext.when == null || task.ext.when - - shell: - args = task.ext.args ?: '' - args2 = task.ext.args2 ?: '5 1 100' // - if (certificate) { - if (certificate.toString().endsWith('.jwt')) { - args += " --perm ${certificate}" - } - else if (certificate.toString().endsWith('.ngc')) { - args += " --ngc ${certificate}" - } - } - - template 'retry_with_backoff.sh' -} diff --git a/modules/nf-core/sratools/prefetch/meta.yml b/modules/nf-core/sratools/prefetch/meta.yml deleted file mode 100644 index ff54229..0000000 --- a/modules/nf-core/sratools/prefetch/meta.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: sratools_prefetch -description: Download sequencing data from the NCBI Sequence Read Archive (SRA). 
-keywords: - - sequencing - - fastq - - prefetch -tools: - - sratools: - description: SRA Toolkit and SDK from NCBI - homepage: https://github.com/ncbi/sra-tools - documentation: https://github.com/ncbi/sra-tools/wiki - tool_dev_url: https://github.com/ncbi/sra-tools - licence: ["Public Domain"] -input: - - meta: - type: map - description: > - Groovy Map containing sample information e.g. [ id:'test', single_end:false ] - - - id: - type: string - description: > - A string denoting an SRA id. - - - ncbi_settings: - type: file - description: > - An NCBI user settings file. - - pattern: "*.mkfg" - - certificate: - type: file - description: > - Path to a JWT cart file used to access protected dbGAP data on SRA using the sra-toolkit - - pattern: "*.cart" -output: - - meta: - type: map - description: > - Groovy Map containing sample information e.g. [ id:'test', single_end:false ] - - - sra: - type: directory - description: > - Directory containing the ETL data for the given SRA id. - - pattern: "*/*.sra" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@Midnighter" -maintainers: - - "@Midnighter" diff --git a/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh b/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh deleted file mode 100755 index a72a4bf..0000000 --- a/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env bash - -set -u - -retry_with_backoff() { - local max_attempts=${1} - local delay=${2} - local max_time=${3} - local attempt=1 - local output= - local status= - - # Remove the first three arguments to this function in order to access - # the 'real' command with `${@}`. - shift 3 - - while [ ${attempt} -le ${max_attempts} ]; do - output=$("${@}") - status=${?} - - if [ ${status} -eq 0 ]; then - break - fi - - if [ ${attempt} -lt ${max_attempts} ]; then - echo "Failed attempt ${attempt} of ${max_attempts}. 
Retrying in ${delay} s." >&2 - sleep ${delay} - elif [ ${attempt} -eq ${max_attempts} ]; then - echo "Failed after ${attempt} attempts." >&2 - return ${status} - fi - - attempt=$(( ${attempt} + 1 )) - delay=$(( ${delay} * 2 )) - if [ ${delay} -ge ${max_time} ]; then - delay=${max_time} - fi - done - - echo "${output}" -} - -export NCBI_SETTINGS="$PWD/!{ncbi_settings}" - -retry_with_backoff !{args2} \ - prefetch \ - !{args} \ - !{id} - -[ -f !{id}.sralite ] && vdb-validate !{id}.sralite || vdb-validate !{id} - -cat <<-END_VERSIONS > versions.yml -"!{task.process}": - sratools: $(prefetch --version 2>&1 | grep -Eo '[0-9.]+') -END_VERSIONS diff --git a/modules/nf-core/sratools/prefetch/tests/main.nf.test b/modules/nf-core/sratools/prefetch/tests/main.nf.test deleted file mode 100644 index ed710ba..0000000 --- a/modules/nf-core/sratools/prefetch/tests/main.nf.test +++ /dev/null @@ -1,55 +0,0 @@ -nextflow_process { - name "Test Process SRATOOLS_PREFETCH" - script "../main.nf" - process "SRATOOLS_PREFETCH" - tag "modules" - tag "modules_nfcore" - tag "sratools" - tag "sratools/prefetch" - - test("sratools/prefetch") { - - when { - params { - outdir = "output" - } - process { - """ - input[0] = Channel.of([ [ id:'test', single_end:false ], 'DRR000774' ]) - input[1] = file(params.modules_testdata_base_path + 'generic/config/ncbi_user_settings.mkfg', checkIfExists: true) - input[2] = [] - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - } - - test("sratools/prefetch with sralite") { - - when { - params { - outdir = "output" - } - process { - """ - input[0] = Channel.of([ [ id:'test', single_end:false ], 'SRR1170046' ]) - input[1] = file(params.modules_testdata_base_path + 'generic/config/ncbi_user_settings.mkfg', checkIfExists: true) - input[2] = [] - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - } -} diff --git 
a/modules/nf-core/sratools/prefetch/tests/main.nf.test.snap b/modules/nf-core/sratools/prefetch/tests/main.nf.test.snap deleted file mode 100644 index ab1d208..0000000 --- a/modules/nf-core/sratools/prefetch/tests/main.nf.test.snap +++ /dev/null @@ -1,72 +0,0 @@ -{ - "sratools/prefetch with sralite": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - [ - "SRR1170046.sralite:md5,7acfce556ca0951aff49d780899c105b" - ] - ] - ], - "1": [ - "versions.yml:md5,c967dea4135cb75490e1e801c4639efc" - ], - "sra": [ - [ - { - "id": "test", - "single_end": false - }, - [ - "SRR1170046.sralite:md5,7acfce556ca0951aff49d780899c105b" - ] - ] - ], - "versions": [ - "versions.yml:md5,c967dea4135cb75490e1e801c4639efc" - ] - } - ], - "timestamp": "2023-10-13T12:11:24.563510389" - }, - "sratools/prefetch": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - [ - "DRR000774.sra:md5,7647dba20c89c0e3d7ad13842f060eb0" - ] - ] - ], - "1": [ - "versions.yml:md5,c967dea4135cb75490e1e801c4639efc" - ], - "sra": [ - [ - { - "id": "test", - "single_end": false - }, - [ - "DRR000774.sra:md5,7647dba20c89c0e3d7ad13842f060eb0" - ] - ] - ], - "versions": [ - "versions.yml:md5,c967dea4135cb75490e1e801c4639efc" - ] - } - ], - "timestamp": "2023-10-13T12:11:02.75256571" - } -} \ No newline at end of file diff --git a/modules/nf-core/sratools/prefetch/tests/tags.yml b/modules/nf-core/sratools/prefetch/tests/tags.yml deleted file mode 100644 index 52110bf..0000000 --- a/modules/nf-core/sratools/prefetch/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -sratools/prefetch: - - modules/nf-core/sratools/prefetch/** From b55683d2cff2f5785368965b8c8d9d1ebcf0b8a2 Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Mon, 11 Nov 2024 21:06:17 -0500 Subject: [PATCH 09/28] Somewhere along the way prefetch got deleted --- .../nf-core/sratools/prefetch/environment.yml | 6 + modules/nf-core/sratools/prefetch/main.nf | 47 +++++++ 
modules/nf-core/sratools/prefetch/meta.yml | 59 +++++++++ .../prefetch/templates/retry_with_backoff.sh | 65 ++++++++++ .../sratools/prefetch/tests/main.nf.test | 71 +++++++++++ .../sratools/prefetch/tests/main.nf.test.snap | 119 ++++++++++++++++++ .../nf-core/sratools/prefetch/tests/tags.yml | 2 + 7 files changed, 369 insertions(+) create mode 100644 modules/nf-core/sratools/prefetch/environment.yml create mode 100644 modules/nf-core/sratools/prefetch/main.nf create mode 100644 modules/nf-core/sratools/prefetch/meta.yml create mode 100755 modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh create mode 100644 modules/nf-core/sratools/prefetch/tests/main.nf.test create mode 100644 modules/nf-core/sratools/prefetch/tests/main.nf.test.snap create mode 100644 modules/nf-core/sratools/prefetch/tests/tags.yml diff --git a/modules/nf-core/sratools/prefetch/environment.yml b/modules/nf-core/sratools/prefetch/environment.yml new file mode 100644 index 0000000..6596bc7 --- /dev/null +++ b/modules/nf-core/sratools/prefetch/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::sra-tools=3.1.0 + - conda-forge::curl=8.5.0 diff --git a/modules/nf-core/sratools/prefetch/main.nf b/modules/nf-core/sratools/prefetch/main.nf new file mode 100644 index 0000000..74838d5 --- /dev/null +++ b/modules/nf-core/sratools/prefetch/main.nf @@ -0,0 +1,47 @@ +process SRATOOLS_PREFETCH { + tag "$id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/sra-tools:3.1.0--h9f5acd7_0' : + 'biocontainers/sra-tools:3.1.0--h9f5acd7_0' }" + + input: + tuple val(meta), val(id) + path ncbi_settings + path certificate + + output: + tuple val(meta), path(id, type: 'dir'), emit: sra + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + args = task.ext.args ?: '' + args2 = task.ext.args2 ?: '5 1 100' // + if (certificate) { + if (certificate.toString().endsWith('.jwt')) { + args += " --perm ${certificate}" + } + else if (certificate.toString().endsWith('.ngc')) { + args += " --ngc ${certificate}" + } + } + + template 'retry_with_backoff.sh' + + stub: + """ + mkdir $id + touch $id/${id}.sra + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sratools: \$(prefetch --version 2>&1 | grep -Eo '[0-9.]+') + curl: \$(curl --version | head -n 1 | sed 's/^curl //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/sratools/prefetch/meta.yml b/modules/nf-core/sratools/prefetch/meta.yml new file mode 100644 index 0000000..3a537bf --- /dev/null +++ b/modules/nf-core/sratools/prefetch/meta.yml @@ -0,0 +1,59 @@ +name: sratools_prefetch +description: Download sequencing data from the NCBI Sequence Read Archive (SRA). +keywords: + - sequencing + - fastq + - prefetch +tools: + - sratools: + description: SRA Toolkit and SDK from NCBI + homepage: https://github.com/ncbi/sra-tools + documentation: https://github.com/ncbi/sra-tools/wiki + tool_dev_url: https://github.com/ncbi/sra-tools + licence: ["Public Domain"] + identifier: "" +input: + - - meta: + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false + ] + - id: + type: string + description: > + A string denoting an SRA id. + - - ncbi_settings: + type: file + description: > + An NCBI user settings file. 
+ pattern: "*.mkfg" + - - certificate: + type: file + description: > + Path to a JWT cart file used to access protected dbGAP data on SRA using the + sra-toolkit + pattern: "*.cart" +output: + - sra: + - meta: + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false + ] + pattern: "*/*.sra" + - "id, type: 'dir": + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false + ] + pattern: "*/*.sra" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Midnighter" +maintainers: + - "@Midnighter" + - "@gallvp" diff --git a/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh b/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh new file mode 100755 index 0000000..bfee607 --- /dev/null +++ b/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash + +set -u + +retry_with_backoff() { + local max_attempts=${1} + local delay=${2} + local max_time=${3} + local attempt=1 + local output= + local status= + + # Remove the first three arguments to this function in order to access + # the 'real' command with `${@}`. + shift 3 + + while [ ${attempt} -le ${max_attempts} ]; do + output=$("${@}") + status=${?} + + if [ ${status} -eq 0 ]; then + break + fi + + if [ ${attempt} -lt ${max_attempts} ]; then + echo "Failed attempt ${attempt} of ${max_attempts}. Retrying in ${delay} s." >&2 + sleep ${delay} + elif [ ${attempt} -eq ${max_attempts} ]; then + echo "Failed after ${attempt} attempts." 
>&2 + return ${status} + fi + + attempt=$(( ${attempt} + 1 )) + delay=$(( ${delay} * 2 )) + if [ ${delay} -ge ${max_time} ]; then + delay=${max_time} + fi + done + + echo "${output}" +} + +export NCBI_SETTINGS="$PWD/!{ncbi_settings}" + +retry_with_backoff !{args2} \ + prefetch \ + !{args} \ + !{id} + +# check file integrity using vdb-validate or (when archive contains no checksums) md5sum +vdb-validate !{id} > vdb-validate_result.txt 2>&1 || exit 1 +if grep -q "checksums missing" vdb-validate_result.txt; then + VALID_MD5SUMS=$(curl --silent --fail --location --retry 3 --retry-delay 60 'https://locate.ncbi.nlm.nih.gov/sdl/2/retrieve?filetype=run&acc=!{id}') + LOCAL_MD5SUMS=$(md5sum !{id}/* | cut -f1 -d' ') + if ! grep -q -F -f <(echo "$LOCAL_MD5SUMS") <(echo "$VALID_MD5SUMS"); then + echo "MD5 sum check failed" 1>&2 + exit 1 + fi +fi + +cat <<-END_VERSIONS > versions.yml +"!{task.process}": + sratools: $(prefetch --version 2>&1 | grep -Eo '[0-9.]+') + curl: $(curl --version | head -n 1 | sed 's/^curl //; s/ .*$//') +END_VERSIONS diff --git a/modules/nf-core/sratools/prefetch/tests/main.nf.test b/modules/nf-core/sratools/prefetch/tests/main.nf.test new file mode 100644 index 0000000..92034d4 --- /dev/null +++ b/modules/nf-core/sratools/prefetch/tests/main.nf.test @@ -0,0 +1,71 @@ +nextflow_process { + name "Test Process SRATOOLS_PREFETCH" + script "../main.nf" + process "SRATOOLS_PREFETCH" + tag "modules" + tag "modules_nfcore" + tag "sratools" + tag "sratools/prefetch" + + test("sratools/prefetch") { + + when { + process { + """ + input[0] = Channel.of([ [ id:'test', single_end:false ], 'DRR000774' ]) + input[1] = file(params.modules_testdata_base_path + 'generic/config/ncbi_user_settings.mkfg', checkIfExists: true) + input[2] = [] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sratools/prefetch with sralite") { + + when { + process { + """ + input[0] = Channel.of([ [ id:'test', 
single_end:false ], 'SRR1170046' ]) + input[1] = file(params.modules_testdata_base_path + 'generic/config/ncbi_user_settings.mkfg', checkIfExists: true) + input[2] = [] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sratools/prefetch/stub") { + + options '-stub' + + when { + process { + """ + input[0] = Channel.of([ [ id:'test', single_end:false ], 'DRR000774' ]) + input[1] = file(params.modules_testdata_base_path + 'generic/config/ncbi_user_settings.mkfg', checkIfExists: true) + input[2] = [] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/sratools/prefetch/tests/main.nf.test.snap b/modules/nf-core/sratools/prefetch/tests/main.nf.test.snap new file mode 100644 index 0000000..82a1969 --- /dev/null +++ b/modules/nf-core/sratools/prefetch/tests/main.nf.test.snap @@ -0,0 +1,119 @@ +{ + "sratools/prefetch/stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "DRR000774.sra:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + "versions.yml:md5,83d1b23f5ff5b2ad1b96d17d7d7594ee" + ], + "sra": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "DRR000774.sra:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions": [ + "versions.yml:md5,83d1b23f5ff5b2ad1b96d17d7d7594ee" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-17T20:07:31.627115" + }, + "sratools/prefetch with sralite": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "SRR1170046.sralite:md5,7acfce556ca0951aff49d780899c105b" + ] + ] + ], + "1": [ + "versions.yml:md5,83d1b23f5ff5b2ad1b96d17d7d7594ee" + ], + "sra": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "SRR1170046.sralite:md5,7acfce556ca0951aff49d780899c105b" + ] + ] + ], + "versions": [ + 
"versions.yml:md5,83d1b23f5ff5b2ad1b96d17d7d7594ee" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T11:49:02.309737" + }, + "sratools/prefetch": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "DRR000774.sra:md5,7647dba20c89c0e3d7ad13842f060eb0" + ] + ] + ], + "1": [ + "versions.yml:md5,83d1b23f5ff5b2ad1b96d17d7d7594ee" + ], + "sra": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "DRR000774.sra:md5,7647dba20c89c0e3d7ad13842f060eb0" + ] + ] + ], + "versions": [ + "versions.yml:md5,83d1b23f5ff5b2ad1b96d17d7d7594ee" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T11:48:37.428307" + } +} \ No newline at end of file diff --git a/modules/nf-core/sratools/prefetch/tests/tags.yml b/modules/nf-core/sratools/prefetch/tests/tags.yml new file mode 100644 index 0000000..52110bf --- /dev/null +++ b/modules/nf-core/sratools/prefetch/tests/tags.yml @@ -0,0 +1,2 @@ +sratools/prefetch: + - modules/nf-core/sratools/prefetch/** From 9e1e92a357c00ac867b0608dbd79a7839dc806d6 Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Thu, 14 Nov 2024 13:51:10 -0500 Subject: [PATCH 10/28] Add sample_name removal of non-alphanumeric characters and nf-test for sample_name --- CHANGELOG.md | 12 +++++++ tests/data/add-samplesheet.csv | 8 ++--- tests/data/samplesheet-addsamplename.csv | 3 -- .../workflows/fetchdatairidanext/main.nf.test | 36 +++++++++++++++++++ workflows/fetchdatairidanext.nf | 13 +++++-- 5 files changed, 63 insertions(+), 9 deletions(-) delete mode 100644 tests/data/samplesheet-addsamplename.csv create mode 100644 tests/workflows/fetchdatairidanext/main.nf.test diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f2017f..482746b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic 
Versioning](https://semver.org/spec/v2.0.0.html). +## [1.2.0] + +### `Changed` + +- Modified the template for input csv file to include a `sample_name` column in addition to `sample` in-line with changes to [IRIDA-Next update] as seen with the [speciesabundance pipeline] + - `sample_name` special characters will be replaced with `"_"` + - If no `sample_name` is supplied in the column `sample` will be used + - To avoid repeat values for `sample_name` all `sample_name` values will be suffixed with the unique `sample` value from the input file + +[IRIDA-Next update]: https://github.com/phac-nml/irida-next/pull/678 +[speciesabundance pipeline]: https://github.com/phac-nml/speciesabundance/pull/24 + ## [1.1.1] - 2024-04-19 ### Added diff --git a/tests/data/add-samplesheet.csv b/tests/data/add-samplesheet.csv index 4512f0a..f4e437d 100644 --- a/tests/data/add-samplesheet.csv +++ b/tests/data/add-samplesheet.csv @@ -1,5 +1,5 @@ sample,sample_name,insdc_accession -SAMPLE1,S1,ERR1109373 -ERROR1,S2,SRR999908 -ERROR2,S3,SRR999934 -SAMPLE2,S4,SRR13191702 +SAMPLE1,S 1,ERR1109373 +SAMPLE2,S2,ERR1109373 +SAMPLE3,S2,SRR13191702 +SAMPLE4,,SRR13191702 diff --git a/tests/data/samplesheet-addsamplename.csv b/tests/data/samplesheet-addsamplename.csv deleted file mode 100644 index 21ced63..0000000 --- a/tests/data/samplesheet-addsamplename.csv +++ /dev/null @@ -1,3 +0,0 @@ -sample,sample_name,insdc_accession -SAMPLE1,S1,ERR1109373 -SAMPLE2,,SRR13191702 diff --git a/tests/workflows/fetchdatairidanext/main.nf.test b/tests/workflows/fetchdatairidanext/main.nf.test new file mode 100644 index 0000000..f97c1d0 --- /dev/null +++ b/tests/workflows/fetchdatairidanext/main.nf.test @@ -0,0 +1,36 @@ +nextflow_workflow { + + name "Test workflow: workflows/fetchdatairidanext.nf" + script "workflows/fetchdatairidanext.nf" + workflow "FETCHDATAIRIDANEXT" + tag "full workflow" + tag "fetchdatairdanext" + + test("Samplesheets with sample_name") { + tag "sample_name" + when { + params { + input = 
"$baseDir/tests/data/add-samplesheet.csv" + outdir = "output" + } + } + + then { + assert workflow.success + assert path("$launchDir/output").exists() + + // Check that reads have the correct filename when supplying a sample_name + assert path("$launchDir/output/iridanext.output.json").exists() + def iridanext_json = path("$launchDir/output/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + + assert iridanext_samples.SAMPLE1 == [['path':'reads/S_1_ERR1109373_2.fastq.gz'], ['path':'reads/S_1_ERR1109373_1.fastq.gz']] + assert iridanext_samples.SAMPLE2 == [['path':'reads/S2_ERR1109373_2.fastq.gz'], ['path':'reads/S2_ERR1109373_1.fastq.gz']] + assert iridanext_samples.SAMPLE3 == [['path':'reads/S2_SRR13191702_2.fastq.gz'], ['path':'reads/S2_SRR13191702_1.fastq.gz']] + assert iridanext_samples.SAMPLE4 == [['path':'reads/SRR13191702_2.fastq.gz'], ['path':'reads/SRR13191702_1.fastq.gz']] + + + } + } + +} diff --git a/workflows/fetchdatairidanext.nf b/workflows/fetchdatairidanext.nf index bb8a5f2..b55d968 100644 --- a/workflows/fetchdatairidanext.nf +++ b/workflows/fetchdatairidanext.nf @@ -56,10 +56,19 @@ workflow FETCHDATAIRIDANEXT { // Create a new channel of metadata from a sample sheet // NB: `input` corresponds to `params.input` and associated sample sheet schema input = Channel.fromSamplesheet("input") - meta_accessions = input.map {meta -> tuple(["id": meta.id.first(), "irida_id": meta.irida_id.first(), "insdc_accession": meta.insdc_accession.first()], meta.insdc_accession.first())} + // and remove non-alphanumeric characters in sample_names (meta.id) + .map { meta -> + if (meta.id[0]) { + // Non-alphanumeric characters (excluding _,-,.) 
will be replaced with "_" + new_id = meta.id[0].replaceAll(/[^A-Za-z0-9_\.\-]/, '_') // meta.id appears to be an immutable list, the workaround is to create a new variable + } else { + new_id = meta.id[0] + } + return [["id": new_id, "irida_id": meta.irida_id[0], "insdc_accession": meta.insdc_accession[0]], meta.insdc_accession[0]] + } FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS ( - ch_sra_ids = meta_accessions, + ch_sra_ids = input, ch_dbgap_key = [] ) ch_versions = ch_versions.mix(FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS.out.versions) From d6093db02e066b5e337dc89716ef970d6e7cee1b Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Thu, 14 Nov 2024 15:01:41 -0500 Subject: [PATCH 11/28] update documenantation, and param to override sample_name rename --- CHANGELOG.md | 14 ++++++++++++-- README.md | 12 ++++++++++++ conf/modules.config | 2 +- docs/output.md | 2 +- docs/usage.md | 24 ++++++++++++++++++++++++ nextflow.config | 4 ++++ 6 files changed, 54 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 482746b..49dd608 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,12 +8,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Changed` - Modified the template for input csv file to include a `sample_name` column in addition to `sample` in-line with changes to [IRIDA-Next update] as seen with the [speciesabundance pipeline] + - If `sample_name` is supplied, then the reads will have `sample_name` prefixed before the accession code - `sample_name` special characters will be replaced with `"_"` - - If no `sample_name` is supplied in the column `sample` will be used - - To avoid repeat values for `sample_name` all `sample_name` values will be suffixed with the unique `sample` value from the input file +- Reverted `fasterq-dump` version to 2.11.0 from 3.0.8 due to [issue #865]. 
Solution proposed by `fetchngs` in [PR #261] +- Fixed linting issues in CI caused by `nf-core` 3.0.1 +- Updated `nf-test` snapshots and added new tests for `sample_name` feature [IRIDA-Next update]: https://github.com/phac-nml/irida-next/pull/678 [speciesabundance pipeline]: https://github.com/phac-nml/speciesabundance/pull/24 +[issue #865]: https://github.com/ncbi/sra-tools/issues/865 +[PR #261]: https://github.com/nf-core/fetchngs/pull/261 ## [1.1.1] - 2024-04-19 @@ -41,3 +45,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Initial release of fetchdatairidanext pipeline which will download reads from NCBI/INSDC archives. + +[1.2.0]: https://github.com/phac-nml/fetchdatairidanext/releases/tag/1.2.0 +[1.1.1]: https://github.com/phac-nml/fetchdatairidanext/releases/tag/1.1.1 +[1.1.0]: https://github.com/phac-nml/fetchdatairidanext/releases/tag/1.1.0 +[1.0.1]: https://github.com/phac-nml/fetchdatairidanext/releases/tag/1.0.1 +[1.0.0]: https://github.com/phac-nml/fetchdatairidanext/releases/tag/1.0.0 diff --git a/README.md b/README.md index 2390e32..93a6949 100644 --- a/README.md +++ b/README.md @@ -20,10 +20,22 @@ That is, there are two columns: The structure of this file is defined in [assets/schema_input.json](assets/schema_input.json). An example of this file is provided at [assets/samplesheet.csv](assets/samplesheet.csv). +## IRIDA-Next Optional Input Configuration + +`fetchdatairidanext` accepts the [IRIDA-Next](https://github.com/phac-nml/irida-next) format for samplesheets which can contain an additional column: `sample_name` + +`sample_name`: An **optional** column, to allow for an override the default reads name with a `sample_name` prefix before the accession code. + +`sample_name`, allows more flexibility in naming reads. Unlike `sample`, `sample_name` is not required to contain unique values. Non-alphanumeric characters (excluding `_`,`-`,`.`) will be replaced with `"_"`. 
`sample_name` can be provided without renaming by changing parameters. + +An [example samplesheet](tests/data/samplesheets/samplesheet-sample_name.csv) has been provided with the pipeline. + # Parameters The main parameters are `--input` as defined above and `--output` for specifying the output results directory. You may wish to provide `-profile singularity` to specify the use of singularity containers (or `-profile docker` for docker) and `-r [branch]` to specify which GitHub branch you would like to run. +`--rename_with_samplename` (Default: `true`) can be changed to `false` to disable renaming of reads when `sample_name` is provided in the samplesheet input + Other parameters (defaults from nf-core) are defined in [nextflow_schema.json](nextflow_schema.json). # Running diff --git a/conf/modules.config b/conf/modules.config index f68b79d..1399215 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -41,7 +41,7 @@ process { ext.args = { [ - meta.id ? rename(meta.id, meta.insdc_accession) : "" + (meta.id && params.rename_with_samplename) ? rename(meta.id, meta.insdc_accession) : "" ].join(" ") } } diff --git a/docs/output.md b/docs/output.md index 7e95254..8247aae 100644 --- a/docs/output.md +++ b/docs/output.md @@ -29,7 +29,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - `sratools/` - Sequence data in SRA format: `INSDC_ACCESSION/INSDC_ACCESSION.sra` - `reads/` - - Reads in fastq format: `INSDC_ACCESSION.fastq.gz` + - Reads in fastq format: `INSDC_ACCESSION.fastq.gz` (or alternatively `SAMPLE_NAME_INSDC_ACCESSION.fastq.gz` if `sample_name` provided)  diff --git a/docs/usage.md b/docs/usage.md index a261f28..8621d40 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -31,6 +31,26 @@ SAMPLE2,SRR13191702 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. 
+### IRIDA-Next Optional Samplesheet Configuration + +`fetchdatairidanext` also accepts the [IRIDA-Next](https://github.com/phac-nml/irida-next) format for samplesheets which contain the following columns: `sample`, `sample_name`, `insdc_accession`. The `sample` column values within a samplesheet should be unique. + +A final samplesheet file with these columns may look something like the one below: + +```console +sample,sample_name,insdc_accession +SAMPLE1,S1,ERR1109373 +SAMPLE2,,SRR13191702 +``` + +| Column            | Description                                                                                                 | +| ----------------- | ----------------------------------------------------------------------------------------------------------- | +| `sample`          | Custom sample name. Samples should be unique within a samplesheet.                                          | +| `sample_name`     | Provides custom prefix to read filenames                                                                    | +| `insdc_accession` | The accession (run accession) from one of the INSDC databases (NCBI, ENA, or DDBJ).                         | + +An [example samplesheet](tests/data/samplesheets/samplesheet-sample_name.csv) has been provided with the pipeline. + ## Running the pipeline The typical command for running the pipeline is as follows: @@ -132,6 +152,10 @@ You can also supply a run name to resume a specific run: `-resume [run-name]`. U Specify the path to a specific config file (this is a core Nextflow command). See the [nf-core website documentation](https://nf-co.re/usage/configuration) for more information. 
+### `--rename_with_samplename` + +When `true` and `sample_name` is included in the samplesheet, `sample_name` is added as a prefix to read filenames (Default: true) + ## Custom configuration ### Resource requests diff --git a/nextflow.config b/nextflow.config index 85f9022..2994933 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,6 +12,10 @@ params { // Input options input = null + // Fetchdatairidanext options + + rename_with_samplename = true + // Boilerplate options outdir = null publish_dir_mode = 'copy' From f1a50f3159e5c899b1b6546ccc6e6bbdd5898ee2 Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Thu, 14 Nov 2024 15:02:56 -0500 Subject: [PATCH 12/28] Update usage.md --- docs/usage.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 8621d40..4c63b38 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -43,11 +43,11 @@ SAMPLE1,S1,ERR1109373 SAMPLE2,,SRR13191702 ``` -| Column            | Description                                                                                                 | -| ----------------- | ----------------------------------------------------------------------------------------------------------- | -| `sample`          | Custom sample name. Samples should be unique within a samplesheet.                                          | -| `sample_name`     | Provides custom prefix to read filenames                                                                    | -| `insdc_accession` | The accession (run accession) from one of the INSDC databases (NCBI, ENA, or DDBJ).                         | +| Column            | Description                                                                         | +| ----------------- | ----------------------------------------------------------------------------------- | +| `sample`          | Custom sample name. Samples should be unique within a samplesheet.                  
| +| `sample_name`     | Provides custom prefix to read filenames                                            | +| `insdc_accession` | The accession (run accession) from one of the INSDC databases (NCBI, ENA, or DDBJ). | An [example samplesheet](tests/data/samplesheets/samplesheet-sample_name.csv) has been provided with the pipeline. 
From 0dd868a54ac3d3945505a9d78c42c17c0899ee08 Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Thu, 14 Nov 2024 15:13:31 -0500 Subject: [PATCH 13/28] Add new param to schema --- nextflow_schema.json | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index a5f1c26..394b2a1 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -214,5 +214,11 @@ { "$ref": "#/definitions/generic_options" } - ] + ], + "properties": { + "rename_with_samplename": { + "type": "boolean", + "default": true + } + } } From 82064589fc2f9af471790bfe461e518b3cef6838 Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Thu, 14 Nov 2024 15:48:35 -0500 Subject: [PATCH 14/28] Updating UI for IRIDA-Next --- nextflow_schema.json | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 394b2a1..6852b8c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -40,6 +43,18 @@ } } }, + "additional_parameters": { + "title": "Additional Parameters", + "type": "object", + "description": "IRIDA-Next Parameters", + "default": "", + "properties": { + "rename_with_samplename": { + "type": "boolean", + "default": true + } + } + }, "institutional_config_options": { "title": "Institutional config options", "type": "object", @@ -133,7 +148,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -205,6 +227,9 @@ { "$ref": "#/definitions/input_output_options" }, + { + "$ref": "#/definitions/additional_parameters" + }, { "$ref": "#/definitions/institutional_config_options" }, @@ -214,11 +239,5 @@ { "$ref": "#/definitions/generic_options" } - ], - "properties": { - "rename_with_samplename": { - "type": "boolean", - "default": true - } - } + ] } From c877cd443ccaef616ccf7d3b27116db2aaec126c Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Thu, 14 Nov 2024 15:57:19 -0500 Subject: [PATCH 15/28] Made it pretty --- nextflow_schema.json | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 6852b8c..6257e59 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -148,14 +145,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { From 86bccf9653ba78dd17b6ac6a34168a59f2f341cf Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Thu, 14 Nov 2024 16:06:24 -0500 Subject: [PATCH 16/28] Improved description for IRIDA-Next parameter --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 6257e59..508fa6b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -43,11 +43,11 @@ "additional_parameters": { "title": "Additional Parameters", "type": "object", - "description": "IRIDA-Next Parameters", "default": "", "properties": { "rename_with_samplename": { "type": "boolean", + "description": "Filenames for reads will have 'SAMPLE_NAME' prefixed to 'INSDC_ACCESSION' (True)", "default": true } } From 1238b144196fcc94ad466ebdd2fb03b685ee5956 Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Fri, 15 Nov 2024 10:05:50 -0500 Subject: [PATCH 17/28] Update version for minor release --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 2994933..6f29485 100644 --- a/nextflow.config +++ b/nextflow.config @@ -202,7 +202,7 @@ manifest { description = """IRIDA Next pipeline for fetching data from NCBI""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '1.1.1' + version = '1.2.0' doi = '' defaultBranch = 'main' } From dc8596830cc0f8bf59caa5996102d7d07945e152 Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Wed, 20 Nov 2024 09:55:50 -0500 Subject: [PATCH 18/28] Addressing accessions with both pair and single end reads --- conf/modules.config | 12 ++++++++++-- modules/local/sratools/fasterqdump/main.nf | 5 ++++- 
2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 1399215..54b5ea8 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -37,11 +37,19 @@ process { mode: params.publish_dir_mode, pattern: 'reads/*.fastq.gz' ] - def rename = {String sample_name, String accession -> "--outfile ${sample_name}_${accession}"} + def fasterq_rename = {String sample_name, String accession -> "--outfile ${sample_name}_${accession}"} + def add_extension = {String sample_name, String accession -> "${sample_name}_${accession}"} + + ext.args = { [ - (meta.id && params.rename_with_samplename) ? rename(meta.id, meta.insdc_accession) : "" + (meta.id && params.rename_with_samplename) ? fasterq_rename(meta.id, meta.insdc_accession) : "" + ].join(" ") + } + ext.args2 = { + [ + (meta.id && params.rename_with_samplename) ? add_extension(meta.id, meta.insdc_accession) : meta.insdc_accession ].join(" ") } } diff --git a/modules/local/sratools/fasterqdump/main.nf b/modules/local/sratools/fasterqdump/main.nf index b7b9bfe..8195085 100644 --- a/modules/local/sratools/fasterqdump/main.nf +++ b/modules/local/sratools/fasterqdump/main.nf @@ -22,6 +22,7 @@ process SRATOOLS_FASTERQDUMP { script: def args = task.ext.args ?: '' def args2 = task.ext.args2 ?: '' + def args3 = task.ext.args3 ?: '' def prefix = task.ext.prefix ?: "${meta.id}" def key_file = '' @@ -46,8 +47,10 @@ process SRATOOLS_FASTERQDUMP { ${key_file} \\ ${sra} + find reads/ -type f -name "$args2" -exec mv {} {}.fastq \\; + pigz \\ - $args2 \\ + $args3 \\ --no-name \\ --processes $task.cpus \\ reads/*.fastq From 3f5404059254e437b6f8308d820a6bfa06d64e1a Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Wed, 20 Nov 2024 10:12:20 -0500 Subject: [PATCH 19/28] Updated tests for previous commit --- tests/workflows/fetchdatairidanext/main.nf.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/workflows/fetchdatairidanext/main.nf.test 
b/tests/workflows/fetchdatairidanext/main.nf.test index f97c1d0..447f5d1 100644 --- a/tests/workflows/fetchdatairidanext/main.nf.test +++ b/tests/workflows/fetchdatairidanext/main.nf.test @@ -24,8 +24,8 @@ nextflow_workflow { def iridanext_json = path("$launchDir/output/iridanext.output.json").json def iridanext_samples = iridanext_json.files.samples - assert iridanext_samples.SAMPLE1 == [['path':'reads/S_1_ERR1109373_2.fastq.gz'], ['path':'reads/S_1_ERR1109373_1.fastq.gz']] - assert iridanext_samples.SAMPLE2 == [['path':'reads/S2_ERR1109373_2.fastq.gz'], ['path':'reads/S2_ERR1109373_1.fastq.gz']] + assert iridanext_samples.SAMPLE1 == [['path':'reads/S_1_ERR1109373_2.fastq.gz'], ['path':'reads/S_1_ERR1109373_1.fastq.gz'], ['path':'reads/S_1_ERR1109373.fastq.gz']] + assert iridanext_samples.SAMPLE2 == [['path':'reads/S2_ERR1109373_2.fastq.gz'], ['path':'reads/S2_ERR1109373_1.fastq.gz'], ['path':'reads/S2_ERR1109373.fastq.gz']] assert iridanext_samples.SAMPLE3 == [['path':'reads/S2_SRR13191702_2.fastq.gz'], ['path':'reads/S2_SRR13191702_1.fastq.gz']] assert iridanext_samples.SAMPLE4 == [['path':'reads/SRR13191702_2.fastq.gz'], ['path':'reads/SRR13191702_1.fastq.gz']] From 5ad42b5061026b61a04ca6525fa36e487dabad53 Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Wed, 20 Nov 2024 10:16:51 -0500 Subject: [PATCH 20/28] Clarify text --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 93a6949..9c06d86 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ The structure of this file is defined in [assets/schema_input.json](assets/schem `fetchdatairidanext` accepts the [IRIDA-Next](https://github.com/phac-nml/irida-next) format for samplesheets which can contain an additional column: `sample_name` -`sample_name`: An **optional** column, to allow for an override the default reads name with a `sample_name` prefix before the accession code. 
+`sample_name`: An **optional** column that adds a `sample_name` prefix before the accession code in read filenames. `sample_name` allows more flexibility in naming reads. Unlike `sample`, `sample_name` is not required to contain unique values. Non-alphanumeric characters (excluding `_`,`-`,`.`) will be replaced with `"_"`. To provide `sample_name` without renaming the reads, set `--rename_with_samplename false`. From 02ecdb5280bf8cda13a19aa5738c82e0f0464c65 Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Wed, 20 Nov 2024 12:28:54 -0500 Subject: [PATCH 21/28] Clean up conditional operator --- workflows/fetchdatairidanext.nf | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/workflows/fetchdatairidanext.nf b/workflows/fetchdatairidanext.nf index b55d968..9a70a39 100644 --- a/workflows/fetchdatairidanext.nf +++ b/workflows/fetchdatairidanext.nf @@ -58,12 +58,7 @@ workflow FETCHDATAIRIDANEXT { input = Channel.fromSamplesheet("input") // and remove non-alphanumeric characters in sample_names (meta.id) .map { meta -> - if (meta.id[0]) { - // Non-alphanumeric characters (excluding _,-,.) 
will be replaced with "_" - new_id = meta.id[0].replaceAll(/[^A-Za-z0-9_\.\-]/, '_') // meta.id appears to be an immutable list, the workaround is to create a new variable - } else { - new_id = meta.id[0] - } + def new_id = meta.id[0]?.replaceAll(/[^A-Za-z0-9_\.\-]/, '_') ?: meta.id[0] return [["id": new_id, "irida_id": meta.irida_id[0], "insdc_accession": meta.insdc_accession[0]], meta.insdc_accession[0]] } From 18f3eb35defdc48ddedc18b3fe77c1d6af1e37d3 Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Wed, 20 Nov 2024 12:30:57 -0500 Subject: [PATCH 22/28] Fixed container URL --- modules/local/sratools/fasterqdump/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/sratools/fasterqdump/main.nf b/modules/local/sratools/fasterqdump/main.nf index 8195085..28ebe07 100644 --- a/modules/local/sratools/fasterqdump/main.nf +++ b/modules/local/sratools/fasterqdump/main.nf @@ -5,7 +5,7 @@ process SRATOOLS_FASTERQDUMP { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' : - 'quay.io/biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' }" + 'biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' }" input: tuple val(meta), path(sra) From 261f455a0b8803b1caed393461cf9cbd1db28e13 Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Wed, 20 Nov 2024 12:39:56 -0500 Subject: [PATCH 23/28] Improved readibility --- modules/local/prefetchchecker/main.nf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/local/prefetchchecker/main.nf b/modules/local/prefetchchecker/main.nf index 852061a..4917d1d 100644 --- a/modules/local/prefetchchecker/main.nf +++ b/modules/local/prefetchchecker/main.nf @@ -12,8 +12,9 @@ process PREFETCH_CHECKER { task.workDir.resolve("failures_report.csv").withWriter { writer -> sample_name = false - failures.each { if ( it[0].id != null) { - sample_name = true + failures.each { + if ( it[0].id != null) { + sample_name = true } } From 09ea6f03a15ea7bc89fbb71019947aeeb28c7315 Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Wed, 20 Nov 2024 12:53:07 -0500 Subject: [PATCH 24/28] return the test, but remove section not used --- tests/pipelines/fetchdatairidanext.nf.test | 57 ++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 tests/pipelines/fetchdatairidanext.nf.test diff --git a/tests/pipelines/fetchdatairidanext.nf.test b/tests/pipelines/fetchdatairidanext.nf.test new file mode 100644 index 0000000..4f60f64 --- /dev/null +++ b/tests/pipelines/fetchdatairidanext.nf.test @@ -0,0 +1,57 @@ +nextflow_pipeline { + + name "Test fetching small datasets from NCBI" + script "main.nf" + + test("basic integration test") { + + when { + params { + input = "$baseDir/tests/data/samplesheet.csv" + outdir = "test1_out" + } + } + + then { 
+ assert workflow.success + + // IRIDA Next output file + assert path("$launchDir/test1_out/iridanext.output.json").json == path("$baseDir/tests/data/test1_iridanext.output.json").json + + // Output data + assert path("$launchDir/test1_out/reads/ERR1109373_1.fastq.gz").linesGzip.size() == 512 + assert path("$launchDir/test1_out/reads/ERR1109373_2.fastq.gz").linesGzip.size() == 512 + assert path("$launchDir/test1_out/reads/SRR13191702_1.fastq.gz").linesGzip.size() == 364 + assert path("$launchDir/test1_out/reads/SRR13191702_2.fastq.gz").linesGzip.size() == 364 + } + } + + test("integration test with prefetch failures") { + + when { + params { + input = "$baseDir/tests/data/errorsheet.csv" + outdir = "results" + } + } + + then { + assert workflow.success + + // IRIDA Next output file + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/prefetch_errors_iridanext.output.json").json + + // Output data: + assert path("$launchDir/results/reads/ERR1109373_1.fastq.gz").linesGzip.size() == 512 + assert path("$launchDir/results/reads/ERR1109373_2.fastq.gz").linesGzip.size() == 512 + assert path("$launchDir/results/reads/SRR13191702_1.fastq.gz").linesGzip.size() == 364 + assert path("$launchDir/results/reads/SRR13191702_2.fastq.gz").linesGzip.size() == 364 + + // These files should have failed, and have no output reads: + assert path("$launchDir/results/reads/SRR999908_1.fastq.gz").exists() == false + assert path("$launchDir/results/reads/SRR999908_2.fastq.gz").exists() == false + assert path("$launchDir/results/reads/SRR999934_1.fastq.gz").exists() == false + assert path("$launchDir/results/reads/SRR999934_2.fastq.gz").exists() == false + } + } +} From 3ba740ee985b64594a7609020d7adcf03950d5e2 Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Thu, 21 Nov 2024 20:32:15 -0500 Subject: [PATCH 25/28] Claify parameter description --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 
9c06d86..26f4b59 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ An [example samplesheet](tests/data/samplesheets/samplesheet-sample_name.csv) ha The main parameters are `--input` as defined above and `--output` for specifying the output results directory. You may wish to provide `-profile singularity` to specify the use of singularity containers (or `-profile docker` for docker) and `-r [branch]` to specify which GitHub branch you would like to run. -`--rename_with_samplename` (Default: `true`) can changed to `false` to override reads renaming when `sample_name` provided in samplesheet input +`--rename_with_samplename` (Default: `true`): when set to `false`, the samplesheet column `sample_name` is not used to rename reads. Other parameters (defaults from nf-core) are defined in [nextflow_schema.json](nextflow_schema.json). From dbe8bf68c821cd62ac3b171ab45221774134da26 Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Thu, 21 Nov 2024 20:34:50 -0500 Subject: [PATCH 26/28] Sample_name description --- assets/schema_input.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index f1060cf..077a2f5 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -17,7 +17,7 @@ "sample_name": { "type": "string", "meta": ["id"], - "errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next" + "errorMessage": "Optional. 
Used to override reads filename when used in tools like IRIDA-Next" }, "insdc_accession": { "type": "string", From 36f4bbb4318bbf04dd3ff580e1860a87c2a73b66 Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Thu, 21 Nov 2024 20:40:29 -0500 Subject: [PATCH 27/28] Missing filepath --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 4c63b38..3a578f0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -49,7 +49,7 @@ SAMPLE2,,SRR13191702 | `sample_name` | Provides custom prefix to read filenames | | `insdc_accession` | The accession (run accession) from one of the INSDC databases (NCBI, ENA, or DDBJ). | -An [example samplesheet](tests/data/samplesheets/samplesheet-sample_name.csv) has been provided with the pipeline. +An [example samplesheet](tests/data/add-samplesheet.csv) has been provided with the pipeline. ## Running the pipeline From da65a5a7c923b150ea2bfa81930a4c6578824157 Mon Sep 17 00:00:00 2001 From: Steven Sutcliffe Date: Thu, 21 Nov 2024 20:43:25 -0500 Subject: [PATCH 28/28] Fixed wording --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 3a578f0..060b3cf 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -154,7 +154,7 @@ Specify the path to a specific config file (this is a core Nextflow command). Se ### `--rename_with_samplename` -When `sample_name` is included in samplesheet will add prefix to read filenames (Default: true) +When `sample_name` is included in the sample sheet, it will be prefixed to read filenames (Default: true) ## Custom configuration