Merge pull request #143 from phac-nml/dev
Dev
mattheww95 authored Nov 27, 2024
2 parents 1560548 + 9170fbc commit ddf5f2b
Showing 32 changed files with 25,633 additions and 72 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/linting.yml
@@ -53,7 +53,7 @@ jobs:
GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }}
run: nf-core -l lint_log.txt lint --dir ${GITHUB_WORKSPACE} --markdown lint_results.md
run: nf-core -l lint_log.txt pipelines lint --release --dir ${GITHUB_WORKSPACE} --markdown lint_results.md

- name: Save PR number
if: ${{ always() }}
2 changes: 1 addition & 1 deletion .github/workflows/linting_comment.yml
@@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Download lint results
uses: dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe # v3
uses: dawidd6/action-download-artifact@bf251b5aa9c2f7eeb574a96ee720e24f801b7c11 # v6
with:
workflow: linting.yml
workflow_conclusion: completed
1 change: 1 addition & 0 deletions .gitignore
@@ -23,3 +23,4 @@ docs/TODO.md
assets/schema_input_nfv2.0.0.json
nextflow_schema_nfv2.json
.vscode
.nf-test.log
5 changes: 4 additions & 1 deletion .nf-core.yml
@@ -1,5 +1,5 @@
repository_type: pipeline
nf_core_version: "2.14.1"
nf_core_version: "3.0.2"
lint:
files_exist:
- CODE_OF_CONDUCT.md
@@ -27,6 +27,9 @@ lint:
nextflow_config:
- manifest.name
- manifest.homePage
- params.max_cpus
- params.max_memory
- params.max_time
multiqc_config: False
template:
prefix: phac-nml
3 changes: 2 additions & 1 deletion .wordlist.txt
@@ -174,4 +174,5 @@ downsampling
Christy
Marinier
Petkau

gzipped
monocytogenes
21 changes: 19 additions & 2 deletions CHANGELOG.md
@@ -3,18 +3,34 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased
## [0.5.0] - 2024-11-27

### `Changed`
### `Added`

- Added RASUSA for downsampling of Nanopore or PacBio data. [PR 125](https://github.com/phac-nml/mikrokondo/pull/125)

- Added a new `sample_name` field to the `schema_input.json` file: [PR 140](https://github.com/phac-nml/mikrokondo/pull/140)

- Incorporated a `--skip_read_merging` parameter to prevent read merging [PR 140](https://github.com/phac-nml/mikrokondo/pull/140)

### `Changed`

- Added a `sample_name` field; `sample` still exists but is now used to incorporate additional names/identifiers in IRIDANext. [PR 140](https://github.com/phac-nml/mikrokondo/pull/140)

- RASUSA is now used for downsampling of Nanopore or PacBio data. [PR 125](https://github.com/phac-nml/mikrokondo/pull/125)

- Default *Listeria* quality control parameters now apply only to *Listeria monocytogenes*. [PR 142](https://github.com/phac-nml/mikrokondo/pull/142)

### `Updated`

- Documentation and the workflow diagram have been updated. [PR 123](https://github.com/phac-nml/mikrokondo/pull/123)

- Documentation and the README have been updated. [PR 126](https://github.com/phac-nml/mikrokondo/pull/126)

- Adjusted `schema_input.json` to allow for non-gzipped inputs. [PR 137](https://github.com/phac-nml/mikrokondo/pull/137)

- Updated GitHub Actions workflows for nf-core version 3.0.1. [PR 137](https://github.com/phac-nml/mikrokondo/pull/137)

## [0.4.2] - 2024-09-25

### `Fixed`
@@ -176,6 +192,7 @@ Initial release of phac-nml/mikrokondo. Mikrokondo currently supports: read trim

- Added integration testing using [nf-test](https://www.nf-test.com/).

[0.5.0]: https://github.com/phac-nml/mikrokondo/releases/tag/0.5.0
[0.4.2]: https://github.com/phac-nml/mikrokondo/releases/tag/0.4.2
[0.4.1]: https://github.com/phac-nml/mikrokondo/releases/tag/0.4.1
[0.4.0]: https://github.com/phac-nml/mikrokondo/releases/tag/0.4.0
17 changes: 11 additions & 6 deletions assets/schema_input.json
@@ -1,5 +1,5 @@
{
"$schema": "http://json-schema.org/draft-07/schema",
"$schema": "https://json-schema.org/draft-07/schema",
"$id": "https://raw.githubusercontent.com/mk-kondo/mikrokondo/master/assets/schema_input.json",
"title": "Samplesheet schema validation",
"description": "Schema for the file provided with params.input",
@@ -10,12 +10,17 @@
"sample": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "Sample name must be provided and cannot contain spaces",
"meta": ["external_id"],
"errorMessage": "Sample name to be used in report generation. Valid characters include alphanumeric and -. All other characters will be replaced by underscores."
},
"sample_name": {
"type": "string",
"errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next. Valid characters include alphanumeric and -. All other characters will be replaced by underscores.",
"meta": ["id"]
},
"fastq_1": {
"type": "string",
"pattern": "^\\S+\\.f(ast)?q\\.gz$",
"pattern": "^\\S+\\.f(ast)?q(\\.gz)?$",
"format": "file-path",
"errorMessage": "FastQ file for reads 1 (forward reads) must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'. If this is meant to be a run of mikrokondo with long read data please specify the paths under long_reads",
"dependentRequired": ["fastq_2"],
@@ -24,23 +29,23 @@
},
"fastq_2": {
"type": "string",
"pattern": "^\\S+\\.f(ast)?q\\.gz$",
"pattern": "^\\S+\\.f(ast)?q(\\.gz)?$",
"format": "file-path",
"errorMessage": "FastQ file for reads 2 (reverse reads) cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'",
"meta": ["fastq_2"],
"unique": true
},
"long_reads": {
"type": "string",
"pattern": "^\\S+\\.f(ast)?q\\.gz$",
"pattern": "^\\S+\\.f(ast)?q(\\.gz)?$",
"format": "file-path",
"errorMessage": "FastQ file for long reads must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'.",
"meta": ["long_reads"],
"unique": true
},
"assembly": {
"type": "string",
"pattern": "^\\S+\\.f(ast)?n?a\\.gz$",
"pattern": "^\\S+\\.f(ast)?n?a(\\.gz)?$",
"format": "file-path",
"errorMessage": "Fasta file, cannot contain spaces and must have extension '.fa.gz' or '.fasta.gz'.",
"meta": ["assembly"],
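For reference, the loosened `pattern` values above make the gzip suffix optional for reads and assemblies, matching the changelog entry about allowing non-gzipped inputs. A small sketch of how the new read pattern behaves (file names are hypothetical):

```python
import re

# The relaxed read pattern from schema_input.json: the ".gz" suffix is now optional.
fastq_pattern = re.compile(r"^\S+\.f(ast)?q(\.gz)?$")

for name in ["ABC-0001_R1.fastq.gz", "ABC-0001_R1.fq", "ABC 0001_R1.fastq.gz"]:
    print(name, bool(fastq_pattern.match(name)))
# ABC-0001_R1.fastq.gz -> True
# ABC-0001_R1.fq       -> True
# ABC 0001_R1.fastq.gz -> False (whitespace in a file name is still rejected)
```
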
33 changes: 23 additions & 10 deletions bin/report_summaries.py
@@ -37,9 +37,10 @@ class JsonImport:
__keep_keys = frozenset(__key_order.keys())
__delimiter = "\t"
__key_delimiter = "."
__inx_irida_key = "meta.external_id"

def __init__(self, report_fp, output_name, sample_suffix):
self.tool_data = None # TODO set this in output of group tool fields
self.tool_data = None
self.output_name = output_name
self.output_transposed = os.path.splitext(os.path.basename(self.output_name))[0] + "_transposed.tsv"
self.output_dir = os.path.dirname(self.output_name)
@@ -49,7 +50,7 @@ def __init__(self, report_fp, output_name, sample_suffix):
self.flat_sample_string = sample_suffix
self.data = self.ingest_report(self.report_fp)
self.flat_data, self.common_fields, self.tool_fields, self.table = self.flatten_json(self.data)
self.output_indv_json(self.flat_data)
self.flat_data = self.output_indv_json(self.flat_data)
self.output_flat_json(self.flat_data)
self.write_table(self.table)

@@ -64,7 +65,6 @@ def write_table(self, table_data: Dict[str, Dict[str, str]]):
"""
keys = set([k for k in table_data])
ordered_keys = []

# Get the wanted information to the top of the page
poisoned_keys = set()
for option in self.__key_order:
@@ -79,7 +79,6 @@
ordered_keys.extend(scalar_keys)
ordered_keys.extend(sorted([i for i in keys if i not in ordered_keys and i not in poisoned_keys]))
row_labels = sorted([i for i in next(iter(table_data.values()))])

self.write_tsv(table_data, row_labels, ordered_keys)
self.write_transposed_tsv(table_data, row_labels, ordered_keys)

@@ -233,7 +232,6 @@ def remove_prefix_id_fields(self, flattened_dict):
top_level_keys.add(item_key)
temp[item_key] = v

#self.tool_data = tool_data
return reformatted_data, top_level_keys, tool_keys


@@ -242,7 +240,7 @@ def ingest_report(self, report_fp):
report_fp: File path to the json report to be read in
"""
data = None
with open(report_fp, "r", encoding="utf8") as report:
with open(report_fp, "r") as report:
data = json.load(report)
return data

@@ -262,11 +260,27 @@ def output_indv_json(self, flattened_data):
Args:
flattened_data (json: Dict[sample_id: Dict[tool_info: value]]):
"""
updated_items = dict()
for k, v in flattened_data.items():
with open(os.path.join(self.output_dir, k + self.flat_sample_string), "w") as output:
out_key = k
sample_dir = k
dir_name = v.get(self.__inx_irida_key)
if k != dir_name:
sample_dir = dir_name
#! this field affects the identification of the irida next id being passed out of the pipeline
out_key = sample_dir # this field must be overwritten for iridanext to identify the correct metadata field
out_dir = os.path.join(self.output_dir, sample_dir)
out_path = os.path.join(out_dir, k + self.flat_sample_string)
if not os.path.isdir(out_dir): # Check for directory existence, as it will still exist on pipeline resumes
os.mkdir(out_dir)

with open(out_path, "w") as output:
json_data = json.dumps({k: v}, indent=2)
output.write(json_data)
updated_items[out_key] = v

flattened_data = updated_items
return flattened_data

def to_file(self):
with open(self.output_name, "w") as out_file:
@@ -282,7 +296,6 @@ def to_file(self):
out_file.write(f'"{val_write}"')
else:
out_file.write(val_write)
# out_file.write(str(ii[1][i]).replace('\n', ' \\'))
out_file.write(self.__delimiter)
out_file.write("\n")

Expand All @@ -291,7 +304,7 @@ def to_file(self):



def main_(args_in):
def main(args_in):
default_samp_suffix = "_flat_sample.json"
parser = argparse.ArgumentParser("Table Summary")
parser.add_argument("-f", "--file-in", help="Path to the mikrokondo json summary")
@@ -307,4 +320,4 @@ def main_(args_in):

if __name__ == "__main__":
# pass json file to program to parse it
main_(sys.argv[1:])
main(sys.argv[1:])
2 changes: 1 addition & 1 deletion conf/irida_next.config
@@ -11,7 +11,7 @@ iridanext {
overwrite = true
validate = false
files {
idkey = "sample"
idkey = 'external_id' // Previously sample
global = [
"**/FinalReports/Aggregated/Json/final_report.json",
"**/FinalReports/Aggregated/Tables/final_report.tsv"
11 changes: 7 additions & 4 deletions docs/usage/usage.md
@@ -23,32 +23,33 @@ Mikrokondo requires a sample sheet to be run. This FOFN (file of file names) con
- long_reads
- assembly

> **Note:** Illegal characters (i.e. characters that match the expression `[^A-Za-z0-9_\-]`) in the sample name will be replaced with underscores.
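
A rough sketch of the substitution described in the note, for illustration only (not the pipeline's exact implementation):

```python
import re

def sanitize_sample_name(name: str) -> str:
    """Replace characters outside A-Za-z0-9, '_' and '-' with underscores."""
    return re.sub(r"[^A-Za-z0-9_\-]", "_", name)

print(sanitize_sample_name("isolate 01/b"))  # -> isolate_01_b
```
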
Example layouts for different sample-sheets include:

_Illumina paired-end data_

|sample|fastq_1|fastq_2|
|------|-------|-------|
|sample_name|path_to_forward_reads|path_to_reversed_reads|
|sample|path_to_forward_reads|path_to_reversed_reads|

_Nanopore_

|sample|long_reads|
|------|----------|
|sample_name|path_to_reads|
|sample|path_to_reads|

_Hybrid Assembly_

|sample|fastq_1|fastq_2|long_reads|
|-------|-------|------|----------|
|sample_name|path_to_forward_reads|path_to_reversed_reads|path_to_long_reads|
|sample|path_to_forward_reads|path_to_reversed_reads|path_to_long_reads|

_Starting with assembly only_

|sample|assembly|
|------|--------|
|sample_name|path_to_assembly|
|sample|path_to_assembly|
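
Concretely, a paired-end sample sheet following the layout above could look like the sketch below, assuming a comma-separated file; the sample names and paths are illustrative, and the optional `sample_name` column (added to `schema_input.json` in this release) can, per the schema, override `sample` for tools such as IRIDA-Next:

```csv
sample,sample_name,fastq_1,fastq_2
ABC-0001,ABC-0001-renamed,/data/reads/ABC-0001_R1.fastq.gz,/data/reads/ABC-0001_R2.fastq.gz
ABC-0002,,/data/reads/ABC-0002_R1.fastq.gz,/data/reads/ABC-0002_R2.fastq.gz
```
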

_Example merging paired-end data_

@@ -96,6 +97,8 @@ _Example merging paired-end data_
Numerous steps within mikrokondo can be turned off without compromising the stability of the pipeline. These skip options can reduce the run time of the pipeline or allow it to complete despite errors.
**All of the above options can be turned on by entering `--{skip_option} true` in the command-line arguments to the pipeline (where optional parameters can be added).**


- `--skip_read_merging`: Do not merge reads; if duplicate sample names are present, the names will be made unique.
- `--skip_abricate`: turn off abricate AMR detection
- `--skip_bakta`: turn off bakta annotation pipeline (generally a slow step, requiring a database to be specified).
- `--skip_checkm`: used as part of the contamination detection within mikrokondo, its run time and resource usage can be quite lengthy.
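For instance, a run that skips read merging and Bakta annotation might be launched along the following lines; the profile name, output directory, and file paths are placeholders rather than prescribed values:

```bash
nextflow run phac-nml/mikrokondo \
    -profile singularity \
    --input samplesheet.csv \
    --outdir results \
    --skip_read_merging true \
    --skip_bakta true
```
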
13 changes: 6 additions & 7 deletions main.nf
@@ -42,9 +42,6 @@ if (params.help) {
if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }





/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
NAMED WORKFLOW FOR PIPELINE
@@ -111,15 +108,17 @@ workflow MIKROKONDO {
REPORT_AGGREGATE(REPORT.out.final_report)
ch_versions = ch_versions.mix(REPORT_AGGREGATE.out.versions)


updated_samples = REPORT_AGGREGATE.out.flat_samples.flatten().map{
sample ->
def name_trim = sample.getName()
def trimmed_name = name_trim.substring(0, name_trim.length() - params.report_aggregate.sample_flat_suffix.length())
tuple([
def external_id_name = sample.getParent().getBaseName()
def output_map = [
"id": trimmed_name,
"sample": trimmed_name],
sample)
"sample": trimmed_name,
"external_id": external_id_name]

tuple(output_map, sample)
}

GZIP_FILES(updated_samples)
8 changes: 4 additions & 4 deletions modules/local/combine_data.nf
@@ -20,16 +20,16 @@ process COMBINE_DATA{
def fields_merge = meta.fields_merge

if(fastq_1){
cmd_ << "cat ${meta.fastq_1.join(' ')} > out/${prefix}_R1.merged.fastq.gz;"
cmd_ << "cat ${fastq_1.join(' ')} > out/${prefix}_R1.merged.fastq.gz;"
}
if(fastq_2){
cmd_ << "cat ${meta.fastq_2.join(' ')} > out/${prefix}_R2.merged.fastq.gz;"
cmd_ << "cat ${fastq_2.join(' ')} > out/${prefix}_R2.merged.fastq.gz;"
}
if(long_reads){
cmd_ << "cat ${meta.fastq_2.join(' ')} > out/${prefix}.merged.fastq.gz;"
cmd_ << "cat ${long_reads.join(' ')} > out/${prefix}.merged.fastq.gz;"
}
if(assembly){
cmd_ << "cat ${meta.fastq_2.join(' ')} > out/${prefix}.merged.fastq.gz;"
cmd_ << "cat ${assembly.join(' ')} > out/${prefix}.merged.fastq.gz;"
}
def cmd = cmd_.join("\n")
// creating dummy outputs so that all outputs exist for any scenario
5 changes: 3 additions & 2 deletions modules/local/report.nf
@@ -43,11 +43,13 @@ process REPORT{

if(!sample_data.containsKey(meta_data.sample)){
sample_data[meta_data.sample] = [:]
// TODO add strings to constants file
sample_data[meta_data.sample]["meta"] = [:]
}

update_map_values(sample_data, meta_data, "metagenomic")
update_map_values(sample_data, meta_data, "id")
update_map_values(sample_data, meta_data, "sample")
update_map_values(sample_data, meta_data, "external_id")
update_map_values(sample_data, meta_data, "assembly")
update_map_values(sample_data, meta_data, "hybrid")
update_map_values(sample_data, meta_data, "single_end")
@@ -63,7 +65,6 @@
if(!check_file_params(report_tag, extension)){
continue
}
// TODO pass in report metadata
def output_data = parse_data(report_value, extension, report_tag, headers_list)
if(output_data){
report_value = output_data
2 changes: 1 addition & 1 deletion modules/local/report_aggregate.nf
@@ -14,7 +14,7 @@ process REPORT_AGGREGATE{
path("final_report.tsv"), emit: final_report
path("final_report_transposed.tsv"), emit: final_report_transposed
path("final_report_flattened.json"), emit: flattened_files
path("*${sample_flat_suffix}"), emit: flat_samples
path("*/*${sample_flat_suffix}"), emit: flat_samples
path "versions.yml", emit: versions

script:
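Read together with the per-sample directories now created in `bin/report_summaries.py`, the updated `flat_samples` glob implies an output layout roughly like the following; directory and sample names are hypothetical:

```text
<report aggregation directory>/
├── final_report.tsv
├── final_report_transposed.tsv
├── final_report_flattened.json
└── SAMPLE1-EXTERNAL-ID/                 # directory named from meta.external_id
    └── SAMPLE1_flat_sample.json         # per-sample JSON, matched by path("*/*${sample_flat_suffix}")
```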