From 92f2692a9f7160bbd47ed68f5627a6856b4b204c Mon Sep 17 00:00:00 2001 From: asp8200 Date: Thu, 23 Nov 2023 17:04:18 +0000 Subject: [PATCH 1/4] WIP: Adding sentieon/tnseq modules and subworkflow. Currently only support for no_intervals. --- conf/modules/prepare_genome.config | 6 +- conf/modules/sentieon_tnhaplotyper2.config | 117 +++++++ conf/test.config | 10 + conf/test/cache.config | 12 + conf/test/tools_somatic_tnhaplotyper2.config | 26 ++ modules.json | 10 + modules/nf-core/sentieon/tnfilter/main.nf | 83 +++++ modules/nf-core/sentieon/tnfilter/meta.yml | 85 ++++++ .../nf-core/sentieon/tnhaplotyper2/main.nf | 116 +++++++ .../nf-core/sentieon/tnhaplotyper2/meta.yml | 133 ++++++++ nextflow.config | 54 ++-- nextflow_schema.json | 2 +- .../bam_variant_calling_somatic_all/main.nf | 69 +++-- .../main.nf | 285 ++++++++++++++++++ .../meta.yml | 128 ++++++++ tests/test_sentieon_tnhaplotyper2.yml | 89 ++++++ .../test_sentieon_tnhaplotyper2_manually.yml | 186 ++++++++++++ workflows/sarek.nf | 5 +- 18 files changed, 1366 insertions(+), 50 deletions(-) create mode 100644 conf/modules/sentieon_tnhaplotyper2.config create mode 100644 conf/test/tools_somatic_tnhaplotyper2.config create mode 100644 modules/nf-core/sentieon/tnfilter/main.nf create mode 100644 modules/nf-core/sentieon/tnfilter/meta.yml create mode 100644 modules/nf-core/sentieon/tnhaplotyper2/main.nf create mode 100644 modules/nf-core/sentieon/tnhaplotyper2/meta.yml create mode 100644 subworkflows/local/bam_variant_calling_somatic_sentieon_tnhaplotyper2/main.nf create mode 100644 subworkflows/local/bam_variant_calling_somatic_sentieon_tnhaplotyper2/meta.yml create mode 100644 tests/test_sentieon_tnhaplotyper2.yml create mode 100644 tests/test_sentieon_tnhaplotyper2_manually.yml diff --git a/conf/modules/prepare_genome.config b/conf/modules/prepare_genome.config index 85367196c2..4ff265503a 100644 --- a/conf/modules/prepare_genome.config +++ b/conf/modules/prepare_genome.config @@ -76,7 +76,7 @@ process { } 
withName: 'TABIX_DBSNP' { - ext.when = { !params.dbsnp_tbi && params.dbsnp && ((params.step == "mapping" || params.step == "markduplicates" || params.step == "prepare_recalibration") || params.tools && (params.tools.split(',').contains('controlfreec') || params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') || params.tools.split(',').contains('sentieon_dnascope') || params.tools.split(',').contains('mutect2'))) } + ext.when = { !params.dbsnp_tbi && params.dbsnp && ((params.step == "mapping" || params.step == "markduplicates" || params.step == "prepare_recalibration") || params.tools && (params.tools.split(',').contains('controlfreec') || params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') || params.tools.split(',').contains('sentieon_dnascope') || params.tools.split(',').contains('mutect2') || params.tools.split(',').contains('sentieon_tnhaplotyper2'))) } publishDir = [ enabled: (params.save_reference || params.build_only_index), mode: params.publish_dir_mode, @@ -86,7 +86,7 @@ process { } withName: 'TABIX_GERMLINE_RESOURCE' { - ext.when = { !params.germline_resource_tbi && params.germline_resource && params.tools && params.tools.split(',').contains('mutect2') } + ext.when = { !params.germline_resource_tbi && params.germline_resource && params.tools && (params.tools.split(',').contains('mutect2') || params.tools.split(',').contains('sentieon_tnhaplotyper2')) } publishDir = [ enabled: (params.save_reference || params.build_only_index), mode: params.publish_dir_mode, @@ -116,7 +116,7 @@ process { } withName: 'TABIX_PON' { - ext.when = { !params.pon_tbi && params.pon && params.tools && params.tools.split(',').contains('mutect2') } + ext.when = { !params.pon_tbi && params.pon && params.tools && (params.tools.split(',').contains('mutect2') || params.tools.split(',').contains('sentieon_tnhaplotyper2')) } publishDir = [ enabled: (params.save_reference || 
params.build_only_index), mode: params.publish_dir_mode, diff --git a/conf/modules/sentieon_tnhaplotyper2.config b/conf/modules/sentieon_tnhaplotyper2.config new file mode 100644 index 0000000000..251fe47ed8 --- /dev/null +++ b/conf/modules/sentieon_tnhaplotyper2.config @@ -0,0 +1,117 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// TNHAPLOTYPER2 + +process { + + if (params.tools && (params.tools.split(',').contains('sentieon_tnhaplotyper2'))) { + withName: 'SENTIEON_TNHAPLOTYPER2' { + ext.prefix = { meta.num_intervals <= 1 ? "${meta.id}.tnhaplotyper2" : "${meta.id}.tnhaplotyper2.${intervals.simpleName}" } + ext.when = { params.tools && (params.tools.split(',').contains('sentieon_tnhaplotyper2')) } + // TO-DO: sort out the following options which are options for mutect2 + // ext.args = { params.ignore_soft_clipped_bases ? "--dont-use-soft-clipped-bases true --f1r2-tar-gz ${task.ext.prefix}.f1r2.tar.gz" : "--f1r2-tar-gz ${task.ext.prefix}.f1r2.tar.gz" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/" }, + pattern: "*{vcf.gz,vcf.gz.tbi,stats}", + saveAs: { meta.num_intervals > 1 ? 
null : "tnhaplotyper2/${meta.id}/${it}" } + ] + } + + // PAIR_VARIANT_CALLING + withName: 'SENTIEON_TNHAPLOTYPER2_PAIRED' { + ext.args2 = { params.ignore_soft_clipped_bases ? + "--trim_soft_clip --normal_sample ${meta.patient}_${meta.normal_id} --tumor_sample ${meta.patient}_${meta.tumor_id}" : + "--normal_sample ${meta.patient}_${meta.normal_id} --tumor_sample ${meta.patient}_${meta.tumor_id}" } + ext.args3 = { "--tumor_sample ${meta.patient}_${meta.tumor_id}" } + ext.args4 = { "--normal_sample ${meta.patient}_${meta.normal_id} --tumor_sample ${meta.patient}_${meta.tumor_id}" } + } + + withName: 'MERGE_TNHAPLOTYPER2.*' { + ext.prefix = { "${meta.id}.tnhaplotyper2" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/tnhaplotyper2/${meta.id}" }, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } + + withName: 'SENTIEON_TNFILTER' { + ext.prefix = { "${meta.id}.filtered" } + ext.args2 = { "--normal_sample ${meta.patient}_${meta.normal_id} --tumor_sample ${meta.patient}_${meta.tumor_id}" } + } + + // TO-DO: I guess that kind of prefixing with the name of the subworkflow is necessary since the user may request tnhaplotyper2 and mutect2 in the same run. + // TO-DO: Add a similar prefixing for the mutect2-subworkflow in mutect2.config ` + withName: '.*_SENTIEON_TNHAPLOTYPER2:GATHERPILEUPSUMMARIES.*' { + ext.prefix = { "${meta.id}.tnhaplotyper2" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/tnhaplotyper2/${meta.id}/" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*_SENTIEON_TNHAPLOTYPER2:GETPILEUPSUMMARIES.*' { + ext.prefix = { meta.num_intervals <= 1 ? "${meta.id}.tnhaplotyper2" : "${meta.id}.tnhaplotyper2.${intervals.simpleName}" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/" }, + pattern: "*.table", + saveAs: { meta.num_intervals > 1 ? 
null : "tnhaplotyper2/${meta.id}/${it}" } + ] + } + + withName: '.*_SENTIEON_TNHAPLOTYPER2:CALCULATECONTAMINATION' { + ext.prefix = { "${meta.id}.tnhaplotyper2" } + ext.args = { "-tumor-segmentation ${meta.id}.tnhaplotyper2.segmentation.table" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/tnhaplotyper2/${meta.id}" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + // TO-DO: Don't use joint_mutect2 for tnhaplotyper2 + if (params.joint_mutect2) { + withName: '.*_SENTIEON_TNHAPLOTYPER2:CALCULATECONTAMINATION' { + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/tnhaplotyper2/${meta.patient}" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + + withName: '.*_SENTIEON_TNHAPLOTYPER2:SENTIEON_TNFILTER.*' { + ext.prefix = {"${meta.id}.tnhaplotyper2.filtered"} + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : "tnhaplotyper2/${meta.id}/${filename}" } + ] + } + + // TO-DO: If possible, get rid of MERGEMUTECTSTATS in the tnhaplotyper2-subworkflow and use sentieon version instead. + withName: '.*_SENTIEON_TNHAPLOTYPER2:MERGEMUTECTSTATS' { + ext.prefix = { "${meta.id}.tnhaplotyper2" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/tnhaplotyper2/${meta.id}/" }, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + } +} diff --git a/conf/test.config b/conf/test.config index 2403fc63e9..90eb6dff54 100644 --- a/conf/test.config +++ b/conf/test.config @@ -75,6 +75,16 @@ process { ext.args = { "--f1r2-tar-gz ${task.ext.prefix}.f1r2.tar.gz --normal-sample normal" } } + withName: '.*:SENTIEON_TNHAPLOTYPER2.*'{ + //sample name from when the test data was generated + ext.args2 = { "--normal_sample normal --tumor_sample tumour" } + } + + withName: '.*:SENTIEON_TNFILTER.*' { + ext.prefix = { "${meta.id}.filtered" } + ext.args2 = { "--tumor_sample tumour --normal_sample normal" } + } + withName: '.*:FILTERVARIANTTRANCHES'{ ext.args = { "--info-key CNN_1D --indel-tranche 0" } } diff --git a/conf/test/cache.config b/conf/test/cache.config index 9f51f72354..e1c0fb8db7 100644 --- a/conf/test/cache.config +++ b/conf/test/cache.config @@ -87,6 +87,18 @@ process { ext.args = { "--f1r2-tar-gz ${task.ext.prefix}.f1r2.tar.gz --normal-sample normal" } } + withName: '.*:SENTIEON_TNHAPLOTYPER2.*'{ + //sample name from when the test data was generated + ext.args2 = { "--normal_sample normal --tumor_sample tumour" } + ext.args3 = { "--tumor_sample tumour" } + ext.args4 = { "--normal_sample normal --tumor_sample tumour" } + } + + withName: '.*:SENTIEON_TNFILTER.*' { + ext.prefix = { "${meta.id}.filtered" } + ext.args2 = { "--tumor_sample tumour --normal_sample normal" } + } + withName: '.*:FILTERVARIANTTRANCHES'{ ext.args = { "--info-key CNN_1D --indel-tranche 0" } } diff --git a/conf/test/tools_somatic_tnhaplotyper2.config b/conf/test/tools_somatic_tnhaplotyper2.config new file mode 100644 index 0000000000..8e7514c1ce --- /dev/null +++ b/conf/test/tools_somatic_tnhaplotyper2.config @@ -0,0 +1,26 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required 
to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/sarek -profile test,<extra_test_profiles>,<docker/singularity> --outdir <OUTDIR> +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params { + // TO-DO: Put test-data in repo. + // TO-DO: Figure out which of the following settings are relevant for tnhaplotyper2 + input = "/home/ubuntu/test_data_tnhaplotyper2/fixed_readgroup.csv" + chr_dir = params.test_data['homo_sapiens']['genome']['genome_21_chromosomes_dir'] + dbsnp = params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz'] + fasta = params.test_data['homo_sapiens']['genome']['genome_21_fasta'] + germline_resource = params.test_data['homo_sapiens']['genome']['gnomad_r2_1_1_21_vcf_gz'] + intervals = params.test_data['homo_sapiens']['genome']['genome_21_multi_interval_bed'] + pon = params.test_data['homo_sapiens']['genome']['mills_and_1000g_indels_21_vcf_gz'] + nucleotides_per_second = 20 + step = 'variant_calling' + tools = 'sentieon_tnhaplotyper2' + wes = true +} diff --git a/modules.json b/modules.json index 06ff0b6dab..a69072d17c 100644 --- a/modules.json +++ b/modules.json @@ -408,6 +408,16 @@ "git_sha": "b9172e8c26a3db5009f7872654c44587e254f094", "installed_by": ["modules"] }, + "sentieon/tnfilter": { + "branch": "master", + "git_sha": "388b763df527bb267cde359ded753a05e33fc0aa", + "installed_by": ["modules"] + }, + "sentieon/tnhaplotyper2": { + "branch": "master", + "git_sha": "30e382429b9761f39906982cc2cb1c31e0706ebe", + "installed_by": ["modules"] + }, "sentieon/varcal": { "branch": "master", "git_sha": "6c9c11ee96796e53a01b4719286acce6af14bc3a", "installed_by": ["modules"] }, diff --git a/modules/nf-core/sentieon/tnfilter/main.nf b/modules/nf-core/sentieon/tnfilter/main.nf new file mode 100644 index 0000000000..f1fbbe9b33 --- /dev/null +++ b/modules/nf-core/sentieon/tnfilter/main.nf @@ -0,0 +1,83 @@ +process SENTIEON_TNFILTER { + tag "$meta.id" + label 'process_medium' + label 'sentieon' + + secret 'SENTIEON_LICENSE_BASE64' + + container 
'nf-core/sentieon:202112.06' + + input: + tuple val(meta), path(vcf), path(vcf_tbi), path(stats), path(contamination), path(segments), path(orientation_priors) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + + output: + tuple val(meta), path("*.vcf.gz") , emit: vcf + tuple val(meta), path("*.vcf.gz.tbi") , emit: vcf_tbi + tuple val(meta), path("*.vcf.gz.stats"), emit: stats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sentieon modules do not support Conda. Please use Docker / Singularity / Podman instead." + } + def args = task.ext.args ?: '' // options for the driver + def args2 = task.ext.args2 ?: '' // options for --algo TNfilter + def prefix = task.ext.prefix ?: "${meta.id}" + def sentieon_auth_mech_base64 = task.ext.sentieon_auth_mech_base64 ?: '' + def sentieon_auth_data_base64 = task.ext.sentieon_auth_data_base64 ?: '' + def contamination_command = contamination ? " --contamination ${contamination} " : '' + def segments_command = segments ? segments.collect{"--tumor_segments $it"}.join(' ') : '' + def orientation_priors_command = orientation_priors ? orientation_priors.collect{"--orientation_priors $it"}.join(' ') : '' + + """ + if [ "\${#SENTIEON_LICENSE_BASE64}" -lt "1500" ]; then # If the string SENTIEON_LICENSE_BASE64 is short, then it is an encrypted url. 
+ export SENTIEON_LICENSE=\$(echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d) + else # Localhost license file + # The license file is stored as a nextflow variable like, for instance, this: + # nextflow secrets set SENTIEON_LICENSE_BASE64 \$(cat | base64 -w 0) + export SENTIEON_LICENSE=\$(mktemp) + echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d > \$SENTIEON_LICENSE + fi + + if [ ${sentieon_auth_mech_base64} ] && [ ${sentieon_auth_data_base64} ]; then + # If sentieon_auth_mech_base64 and sentieon_auth_data_base64 are non-empty strings, then Sentieon is mostly likely being run with some test-license. + export SENTIEON_AUTH_MECH=\$(echo -n "${sentieon_auth_mech_base64}" | base64 -d) + export SENTIEON_AUTH_DATA=\$(echo -n "${sentieon_auth_data_base64}" | base64 -d) + echo "Decoded and exported Sentieon test-license system environment variables" + fi + + sentieon driver -r $fasta \\ + $args \\ + --algo TNfilter \\ + $args2 \\ + -v $vcf \\ + $contamination_command \\ + $segments_command \\ + $orientation_priors_command \\ + ${prefix}.vcf.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.vcf.gz + touch ${prefix}.vcf.gz.tbi + touch ${prefix}.vcf.gz.stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/sentieon/tnfilter/meta.yml b/modules/nf-core/sentieon/tnfilter/meta.yml new file mode 100644 index 0000000000..d184dd6b7b --- /dev/null +++ b/modules/nf-core/sentieon/tnfilter/meta.yml @@ -0,0 +1,85 @@ +name: sentieon_tnfilter +description: | + Filters the raw output of sentieon/tnhaplotyper2. 
+keywords: + - tnfilter + - filter + - sentieon + - tnhaplotyper2 + - vcf +tools: + - sentieon: + description: | + Sentieon® provides complete solutions for secondary DNA/RNA analysis for a variety of sequencing platforms, including short and long reads. + Our software improves upon BWA, STAR, Minimap2, GATK, HaplotypeCaller, Mutect, and Mutect2 based pipelines and is deployable on any generic-CPU-based computing system. + homepage: https://www.sentieon.com/ + documentation: https://www.sentieon.com/ +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - meta2: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test' ] + - meta3: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test' ] + - vcf: + type: file + description: compressed vcf file from tnhaplotyper2 + pattern: "*.vcf.gz" + - vcf_tbi: + type: file + description: Tabix index of vcf file + pattern: "*vcf.gz.tbi" + - stats: + type: file + description: Stats file that pairs with output vcf file + pattern: "*vcf.gz.stats" + - contamination: + type: file + description: the location and file name of the file containing the contamination information produced by ContaminationModel. + pattern: "*.contamination_data.tsv" + - segments: + type: file + description: the location and file name of the file containing the tumor segments information produced by ContaminationModel. + pattern: "*.segments" + - orientation_priors: + type: file + description: the location and file name of the file containing the orientation bias information produced by OrientationBias. + pattern: "*.orientation_data.tsv" + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - fai: + type: file + description: Index of reference fasta file + pattern: "*.fasta.fai" + +output: + - vcf: + type: file + description: file containing filtered tnhaplotyper2 calls. 
+ pattern: "*.vcf.gz" + - vcf_tbi: + type: file + description: tbi file that pairs with vcf. + pattern: "*.vcf.gz.tbi" + - stats: + type: file + description: file containing statistics of the tnfilter run. + pattern: "*.stats" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@asp8200" diff --git a/modules/nf-core/sentieon/tnhaplotyper2/main.nf b/modules/nf-core/sentieon/tnhaplotyper2/main.nf new file mode 100644 index 0000000000..36d8bbb1b1 --- /dev/null +++ b/modules/nf-core/sentieon/tnhaplotyper2/main.nf @@ -0,0 +1,116 @@ +process SENTIEON_TNHAPLOTYPER2 { + tag "$meta.id" + label 'process_high' + label 'sentieon' + + secret 'SENTIEON_LICENSE_BASE64' + + container 'nf-core/sentieon:202112.06' + + input: + tuple val(meta), path(input), path(input_index), path(intervals) + tuple val(meta2), path(dict) + tuple val(meta3), path(fasta) + tuple val(meta4), path(fai) + tuple val(meta5), path(germline_resource) + tuple val(meta6), path(germline_resource_tbi) + tuple val(meta7), path(panel_of_normals) + tuple val(meta8), path(panel_of_normals_tbi) + val(emit_orientation_data) + val(emit_contamination_data) + + output: + tuple val(meta), path("*.orientation_data.tsv") , optional:true , emit: orientation_data + tuple val(meta), path("*.contamination_data.tsv"), optional:true , emit: contamination_data + tuple val(meta), path("*.segments") , optional:true , emit: contamination_segments + tuple val(meta), path("*.stats") , emit: stats + tuple val(meta), path("*.vcf.gz") , emit: vcf + tuple val(meta), path("*.vcf.gz.tbi") , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sentieon modules do not support Conda. Please use Docker / Singularity / Podman instead." 
} + def sentieon_auth_mech_base64 = task.ext.sentieon_auth_mech_base64 ?: '' + def sentieon_auth_data_base64 = task.ext.sentieon_auth_data_base64 ?: '' + def args = task.ext.args ?: '' // options for "sentieon driver" + def args2 = task.ext.args2 ?: '' // options for the TNhaplotyper2 algorithm. It could be something like "--tumor_sample <tumor_id> --normal_sample <normal_id>" + def args3 = task.ext.args3 ?: '' // options for the OrientationBias algorithm. It could be something like "--tumor_sample <tumor_id>" + def args4 = task.ext.args4 ?: '' // options for the ContaminationModel algorithm. It could be something like "--tumor_sample <tumor_id> --normal_sample <normal_id>" + def prefix = task.ext.prefix ?: "${meta.id}" + def gr_command = germline_resource ? "--germline_vcf $germline_resource" : "" + def interval_command = intervals ? "--interval $intervals" : "" + def pon_command = panel_of_normals ? "--pon $panel_of_normals" : "" + def inputs = input.collect{ "-i $it"}.join(" ") + def orientation_bias_cmd = "" + def contamination_cmd = "" + + if (emit_orientation_data) { + orientation_bias_cmd = "--algo OrientationBias $args3 ${prefix}.orientation_data.tsv" + } + + if (emit_contamination_data) { + contamination_cmd = "--algo ContaminationModel $args4 --vcf $germline_resource --tumor_segments ${prefix}.segments ${prefix}.contamination_data.tsv" + } + + """ + if [ "\${#SENTIEON_LICENSE_BASE64}" -lt "1500" ]; then # If the string SENTIEON_LICENSE_BASE64 is short, then it is an encrypted url. 
+ export SENTIEON_LICENSE=\$(echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d) + else # Localhost license file + # The license file is stored as a nextflow variable like, for instance, this: + # nextflow secrets set SENTIEON_LICENSE_BASE64 \$(cat | base64 -w 0) + export SENTIEON_LICENSE=\$(mktemp) + echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d > \$SENTIEON_LICENSE + fi + + if [ ${sentieon_auth_mech_base64} ] && [ ${sentieon_auth_data_base64} ]; then + # If sentieon_auth_mech_base64 and sentieon_auth_data_base64 are non-empty strings, then Sentieon is mostly likely being run with some test-license. + export SENTIEON_AUTH_MECH=\$(echo -n "${sentieon_auth_mech_base64}" | base64 -d) + export SENTIEON_AUTH_DATA=\$(echo -n "${sentieon_auth_data_base64}" | base64 -d) + echo "Decoded and exported Sentieon test-license system environment variables" + fi + + sentieon driver \\ + -t $task.cpus \\ + -r $fasta \\ + $args \\ + $inputs \\ + $interval_command \\ + --algo TNhaplotyper2 \\ + $args2 \\ + $gr_command \\ + $pon_command \\ + ${prefix}.vcf.gz \\ + $orientation_bias_cmd \\ + $contamination_cmd + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sentieon modules do not support Conda. Please use Docker / Singularity / Podman instead." 
+ } + """ + touch ${prefix}.vcf.gz + touch ${prefix}.vcf.gz.tbi + touch ${prefix}.contamination_data.tsv + touch ${prefix}.orientation_data.tsv + touch ${prefix}.segments + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/sentieon/tnhaplotyper2/meta.yml b/modules/nf-core/sentieon/tnhaplotyper2/meta.yml new file mode 100644 index 0000000000..0ef8704ad9 --- /dev/null +++ b/modules/nf-core/sentieon/tnhaplotyper2/meta.yml @@ -0,0 +1,133 @@ +name: sentieon_tnhaplotyper2 +description: Tnhaplotyper2 performs somatic variant calling on the tumor-normal matched pairs. +keywords: + - tnseq + - tnhaplotyper2 + - sentieon + - variant_calling +tools: + - sentieon: + description: | + Sentieon® provides complete solutions for secondary DNA/RNA analysis for a variety of sequencing platforms, including short and long reads. + Our software improves upon BWA, STAR, Minimap2, GATK, HaplotypeCaller, Mutect, and Mutect2 based pipelines and is deployable on any generic-CPU-based computing system. + homepage: https://www.sentieon.com/ + documentation: https://www.sentieon.com/ +input: + - meta: + type: map + description: | + Groovy Map containing sample information. + e.g. [ id:'test', single_end:false ] + - meta2: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test' ] + - meta3: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test' ] + - meta4: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test' ] + - meta5: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test' ] + - meta6: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test' ] + - meta7: + type: map + description: | + Groovy Map containing reference information. + e.g. 
[ id:'test' ] + - meta8: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test' ] + - input: + type: file + description: BAM/CRAM file from alignment + pattern: "*.{bam,cram}" + - input_index: + type: file + description: BAI/CRAI file from alignment + pattern: "*.{bai,crai}" + - intervals: + type: file + description: Bed file with the genomic regions included in the library (optional) + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" + - fasta: + type: file + description: Genome fasta file + pattern: "*.{fa,fasta}" + - fai: + type: file + description: Index of the genome fasta file + pattern: "*.fai" + - germline_resource: + type: file + description: Population vcf of germline sequencing, containing allele fractions. + pattern: "*.vcf.gz" + - germline_resource_tbi: + type: file + description: Index file for the germline resource. + pattern: "*.vcf.gz.tbi" + - panel_of_normals: + type: file + description: vcf file to be used as a panel of normals. + pattern: "*.vcf.gz" + - panel_of_normals_tbi: + type: file + description: Index for the panel of normals. + pattern: "*.vcf.gz.tbi" + - emit_orientation_data: + type: boolean + description: If true, the module will run the sentieon algorithm TNhaplotyper2 followed by the sentieon algorithm OrientationBias. + - emit_contamination_data: + type: boolean + description: If true, the module will run the sentieon algorithm TNhaplotyper2 followed by the sentieon algorithm ContaminationModel. + +output: + - meta: + type: map + description: | + Groovy Map containing reference information. + e.g. 
[ id:'test', single_end:false ] + - orientation_data: + type: file + description: TSV file from Sentieon's algorithm OrientationBias + pattern: "*.orientation_data.tsv" + - contamination_data: + type: file + description: TSV file from Sentieon's algorithm ContaminationModel + pattern: "*.contamination_data.tsv" + - contamination_segments: + type: file + description: Tumour segments file from Sentieon's algorithm ContaminationModel + pattern: "*.segments" + - vcf: + type: file + description: VCF file + pattern: "*.{vcf.gz}" + - index: + type: file + description: Index of the VCF file + pattern: "*.vcf.gz.tbi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@asp8200" diff --git a/nextflow.config b/nextflow.config index ca6e2ed00a..217bbf5842 100644 --- a/nextflow.config +++ b/nextflow.config @@ -252,32 +252,33 @@ profiles { test_full { includeConfig 'conf/test_full.config' } test_full_germline { includeConfig 'conf/test_full_germline.config' } // Extra test profiles for more complete CI - alignment_to_fastq { includeConfig 'conf/test/alignment_to_fastq.config' } - annotation { includeConfig 'conf/test/annotation.config' } - markduplicates_bam { includeConfig 'conf/test/markduplicates_bam.config' } - markduplicates_cram { includeConfig 'conf/test/markduplicates_cram.config' } - no_intervals { includeConfig 'conf/test/no_intervals.config' } - pair { includeConfig 'conf/test/pair.config' } - prepare_recalibration_bam { includeConfig 'conf/test/prepare_recalibration_bam.config' } - prepare_recalibration_cram { includeConfig 'conf/test/prepare_recalibration_cram.config' } - recalibrate_bam { includeConfig 'conf/test/recalibrate_bam.config' } - recalibrate_cram { includeConfig 'conf/test/recalibrate_cram.config' } - save_bam_mapped { includeConfig 'conf/test/save_bam_mapped.config' } - sentieon_dedup_bam { includeConfig 'conf/test/sentieon_dedup_bam.config' } - sentieon_dedup_cram { includeConfig 
'conf/test/sentieon_dedup_cram.config' } - skip_bqsr { includeConfig 'conf/test/skip_bqsr.config' } - skip_markduplicates { includeConfig 'conf/test/skip_markduplicates.config' } - split_fastq { includeConfig 'conf/test/split_fastq.config' } - targeted { includeConfig 'conf/test/targeted.config' } - tools { includeConfig 'conf/test/tools.config' } - tools_germline { includeConfig 'conf/test/tools_germline.config' } - tools_somatic { includeConfig 'conf/test/tools_somatic.config' } - tools_somatic_ascat { includeConfig 'conf/test/tools_somatic_ascat.config' } - tools_tumoronly { includeConfig 'conf/test/tools_tumoronly.config' } - trimming { includeConfig 'conf/test/trimming.config' } - umi { includeConfig 'conf/test/umi.config' } - use_gatk_spark { includeConfig 'conf/test/use_gatk_spark.config' } - variantcalling_channels { includeConfig 'conf/test/variantcalling_channels.config' } + alignment_to_fastq { includeConfig 'conf/test/alignment_to_fastq.config' } + annotation { includeConfig 'conf/test/annotation.config' } + markduplicates_bam { includeConfig 'conf/test/markduplicates_bam.config' } + markduplicates_cram { includeConfig 'conf/test/markduplicates_cram.config' } + no_intervals { includeConfig 'conf/test/no_intervals.config' } + pair { includeConfig 'conf/test/pair.config' } + prepare_recalibration_bam { includeConfig 'conf/test/prepare_recalibration_bam.config' } + prepare_recalibration_cram { includeConfig 'conf/test/prepare_recalibration_cram.config' } + recalibrate_bam { includeConfig 'conf/test/recalibrate_bam.config' } + recalibrate_cram { includeConfig 'conf/test/recalibrate_cram.config' } + save_bam_mapped { includeConfig 'conf/test/save_bam_mapped.config' } + sentieon_dedup_bam { includeConfig 'conf/test/sentieon_dedup_bam.config' } + sentieon_dedup_cram { includeConfig 'conf/test/sentieon_dedup_cram.config' } + skip_bqsr { includeConfig 'conf/test/skip_bqsr.config' } + skip_markduplicates { includeConfig 'conf/test/skip_markduplicates.config' } + 
split_fastq { includeConfig 'conf/test/split_fastq.config' } + targeted { includeConfig 'conf/test/targeted.config' } + tools { includeConfig 'conf/test/tools.config' } + tools_germline { includeConfig 'conf/test/tools_germline.config' } + tools_somatic { includeConfig 'conf/test/tools_somatic.config' } + tools_somatic_ascat { includeConfig 'conf/test/tools_somatic_ascat.config' } + tools_somatic_tnhaplotyper2 { includeConfig 'conf/test/tools_somatic_tnhaplotyper2.config' } + tools_tumoronly { includeConfig 'conf/test/tools_tumoronly.config' } + trimming { includeConfig 'conf/test/trimming.config' } + umi { includeConfig 'conf/test/umi.config' } + use_gatk_spark { includeConfig 'conf/test/use_gatk_spark.config' } + variantcalling_channels { includeConfig 'conf/test/variantcalling_channels.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile @@ -384,6 +385,7 @@ includeConfig 'conf/modules/sentieon_dnascope.config' includeConfig 'conf/modules/sentieon_dnascope_joint_germline.config' includeConfig 'conf/modules/sentieon_haplotyper.config' includeConfig 'conf/modules/sentieon_haplotyper_joint_germline.config' +includeConfig 'conf/modules/sentieon_tnhaplotyper2.config' includeConfig 'conf/modules/strelka.config' includeConfig 'conf/modules/tiddit.config' diff --git a/nextflow_schema.json b/nextflow_schema.json index 68c6b77146..15a90d8815 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -100,7 +100,7 @@ "fa_icon": "fas fa-toolbox", "description": "Tools to use for duplicate marking, variant calling and/or for annotation.", "help_text": "Multiple tools separated with commas.\n\n**Variant Calling:**\n\nGermline variant calling can currently be performed with the following variant callers:\n- SNPs/Indels: DeepVariant, FreeBayes, GATK HaplotypeCaller, mpileup, Sentieon Haplotyper, Strelka\n- Structural Variants: Manta, TIDDIT\n- Copy-number: CNVKit\n\nTumor-only somatic variant calling can currently be 
performed with the following variant callers:\n- SNPs/Indels: FreeBayes, mpileup, Mutect2, Strelka\n- Structural Variants: Manta, TIDDIT\n- Copy-number: CNVKit, ControlFREEC\n\nSomatic variant calling can currently only be performed with the following variant callers:\n- SNPs/Indels: FreeBayes, Mutect2, Strelka2\n- Structural variants: Manta, TIDDIT\n- Copy-Number: ASCAT, CNVKit, Control-FREEC\n- Microsatellite Instability: MSIsensorpro\n\n> **NB** Mutect2 for somatic variant calling cannot be combined with `--no_intervals`\n\n**Annotation:**\n \n- snpEff, VEP, merge (both consecutively).\n\n> **NB** As Sarek will use bgzip and tabix to compress and index VCF files annotated, it expects VCF files to be sorted when starting from `--step annotate`.", - "pattern": "^((ascat|cnvkit|controlfreec|deepvariant|freebayes|haplotypecaller|sentieon_dnascope|sentieon_haplotyper|manta|merge|mpileup|msisensorpro|mutect2|sentieon_dedup|snpeff|strelka|tiddit|vep)?,?)*(? + joint_mutect2 ? + //we need to keep all fields and then remove on a per-tool-basis to ensure proper joining at the filtering step + [ meta + [ id:meta.patient ], [ normal_cram, tumor_cram ], [ normal_crai, tumor_crai ] ] : + [ meta, [ normal_cram, tumor_cram ], [ normal_crai, tumor_crai ] ] + }, + // Remap channel to match module/subworkflow + fasta.map{ it -> [ [ id:'fasta' ], it ] }, + // Remap channel to match module/subworkflow + fasta_fai.map{ it -> [ [ id:'fasta_fai' ], it ] }, + dict, + germline_resource, + germline_resource_tbi, + panel_of_normals, + panel_of_normals_tbi, + intervals, + joint_mutect2 + ) + + vcf_sentieon_tnhaplotyper2 = BAM_VARIANT_CALLING_SOMATIC_SENTIEON_TNHAPLOTYPER2.out.vcf_filtered + versions = versions.mix(BAM_VARIANT_CALLING_SOMATIC_SENTIEON_TNHAPLOTYPER2.out.versions) + } + // MANTA if (tools.split(',').contains('manta')) { BAM_VARIANT_CALLING_SOMATIC_MANTA( @@ -230,7 +261,8 @@ workflow BAM_VARIANT_CALLING_SOMATIC_ALL { vcf_manta, vcf_mutect2, vcf_strelka, - vcf_tiddit + 
vcf_tiddit, + vcf_sentieon_tnhaplotyper2 ) emit: @@ -241,6 +273,7 @@ workflow BAM_VARIANT_CALLING_SOMATIC_ALL { vcf_mutect2 vcf_strelka vcf_tiddit + vcf_sentieon_tnhaplotyper2 versions } diff --git a/subworkflows/local/bam_variant_calling_somatic_sentieon_tnhaplotyper2/main.nf b/subworkflows/local/bam_variant_calling_somatic_sentieon_tnhaplotyper2/main.nf new file mode 100644 index 0000000000..e9d268e251 --- /dev/null +++ b/subworkflows/local/bam_variant_calling_somatic_sentieon_tnhaplotyper2/main.nf @@ -0,0 +1,285 @@ + +include { GATK4_CALCULATECONTAMINATION as CALCULATECONTAMINATION } from '../../../modules/nf-core/gatk4/calculatecontamination/main' +include { GATK4_GATHERPILEUPSUMMARIES as GATHERPILEUPSUMMARIES_NORMAL } from '../../../modules/nf-core/gatk4/gatherpileupsummaries/main' +include { GATK4_GATHERPILEUPSUMMARIES as GATHERPILEUPSUMMARIES_TUMOR } from '../../../modules/nf-core/gatk4/gatherpileupsummaries/main' +include { GATK4_GETPILEUPSUMMARIES as GETPILEUPSUMMARIES_NORMAL } from '../../../modules/nf-core/gatk4/getpileupsummaries/main' +include { GATK4_GETPILEUPSUMMARIES as GETPILEUPSUMMARIES_TUMOR } from '../../../modules/nf-core/gatk4/getpileupsummaries/main' +// TO-DO: Remove the following out-commented include-statement - if it is not needed. 
+// include { GATK4_LEARNREADORIENTATIONMODEL as LEARNREADORIENTATIONMODEL } from '../../../modules/nf-core/gatk4/learnreadorientationmodel/main' +include { GATK4_MERGEMUTECTSTATS as MERGEMUTECTSTATS } from '../../../modules/nf-core/gatk4/mergemutectstats/main' +include { GATK4_MERGEVCFS as MERGE_TNHAPLOTYPER2 } from '../../../modules/nf-core/gatk4/mergevcfs/main' +include { SENTIEON_TNFILTER } from '../../../modules/nf-core/sentieon/tnfilter/main' +include { SENTIEON_TNHAPLOTYPER2 as SENTIEON_TNHAPLOTYPER2_PAIRED } from '../../../modules/nf-core/sentieon/tnhaplotyper2/main' + +workflow BAM_VARIANT_CALLING_SOMATIC_SENTIEON_TNHAPLOTYPER2 { + take: + input // channel: [ meta, [ input ], [ input_index ] ] + fasta // channel: /path/to/reference/fasta + fai // channel: /path/to/reference/fasta/index + dict // channel: /path/to/reference/fasta/dictionary + germline_resource // channel: /path/to/germline/resource + germline_resource_tbi // channel: /path/to/germline/index + panel_of_normals // channel: /path/to/panel/of/normals + panel_of_normals_tbi // channel: /path/to/panel/of/normals/index + intervals // channel: [mandatory] [ intervals, num_intervals ] or [ [], 0 ] if no intervals + joint_mutect2 // boolean: [mandatory] [default: false] run mutect2 in joint mode + + main: + versions = Channel.empty() + + //If no germline resource is provided, then create an empty channel to avoid GetPileupsummaries from being run + germline_resource_pileup = germline_resource_tbi ? 
germline_resource : Channel.empty() + germline_resource_pileup_tbi = germline_resource_tbi ?: Channel.empty() + + // Combine input and intervals for spread and gather strategy + input_intervals = input.combine(intervals) + // Move num_intervals to meta map and reorganize channel for SENTIEON_TNHAPLOTYPER2 module + .map{ meta, input_list, input_index_list, intervals, num_intervals -> [ meta + [ num_intervals:num_intervals ], input_list, input_index_list, intervals ] } + + // TO-DO: Figure out if the variable joint_mutect2 should be (re)used for the tnhaplotyper2-subworkflow + // or perhaps we should introduce joint_tnhaplotyper2 + if (joint_mutect2) { + // Separate normal cram files + // Extract tumor cram files + ch_cram = input.multiMap{ meta, cram, crai -> + normal: [ meta - meta.subMap('tumor_id') , cram[0], crai[0] ] + tumor: [ meta - meta.subMap('tumor_id') , cram[1], crai[1] ] + } + + // Remove duplicates from normal channel and merge normal and tumor crams by patient + ch_tn_cram = ch_cram.normal.unique().mix(ch_cram.tumor).groupTuple() + // Combine input and intervals for scatter and gather strategy + ch_tn_intervals = ch_tn_cram.combine(intervals) + // Move num_intervals to meta map and reorganize channel for SENTIEON_TNHAPLOTYPER2 module + // meta: [id:patient_id, num_intervals, patient, sex] + .map{ meta, cram, crai, intervals, num_intervals -> [ meta + [ num_intervals:num_intervals ], cram, crai, intervals ] } + + SENTIEON_TNHAPLOTYPER2_PAIRED( + ch_tn_intervals, + dict, + fasta, + fai, + germline_resource.map{ it -> [ [ id:it.baseName ], it ] }, + germline_resource_tbi.map{ it -> [ [ id:it.baseName ], it ] }, + panel_of_normals.map{ it -> [ [ id:it.baseName ], it ] }, + panel_of_normals_tbi.map{ it -> [ [ id:it.baseName ], it ] }, + true, // TO-DO: These things shouldn't be hardcoded + true + ) + } else { + // Perform variant calling using mutect2 module pair mode + // meta: [id:tumor_id_vs_normal_id, normal_id, num_intervals, patient, sex, tumor_id] + 
SENTIEON_TNHAPLOTYPER2_PAIRED( + input_intervals, + dict, + fasta, + fai, + germline_resource.map{ it -> [ [ id:it.baseName ], it ] }, + germline_resource_tbi.map{ it -> [ [ id:it.baseName ], it ] }, + panel_of_normals.map{ it -> [ [ id:it.baseName ], it ] }, + panel_of_normals_tbi.map{ it -> [ [ id:it.baseName ], it ] }, + true, // TO-DO: These things shouldn't be hardcoded + true + ) + } + + vcf_to_filter = SENTIEON_TNHAPLOTYPER2_PAIRED.out.vcf + .join(SENTIEON_TNHAPLOTYPER2_PAIRED.out.index, failOnDuplicate: true, failOnMismatch: true) + .join(SENTIEON_TNHAPLOTYPER2_PAIRED.out.stats, failOnDuplicate: true, failOnMismatch: true) + .join(SENTIEON_TNHAPLOTYPER2_PAIRED.out.contamination_data, failOnDuplicate: true, failOnMismatch: true) + .join(SENTIEON_TNHAPLOTYPER2_PAIRED.out.contamination_segments, failOnDuplicate: true, failOnMismatch: true) + .join(SENTIEON_TNHAPLOTYPER2_PAIRED.out.orientation_data, failOnDuplicate: true, failOnMismatch: true) + // .map{ meta, vcf, tbi, stats, seg, cont -> [ meta, vcf, tbi, stats, [], seg, cont, [] ] } + +/* + TO-DO: Clean up the following + // Figuring out if there is one or more vcf(s) from the same sample + vcf_branch = SENTIEON_TNHAPLOTYPER2_PAIRED.out.vcf.branch{ + // Use meta.num_intervals to asses number of intervals + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + // Figuring out if there is one or more tbi(s) from the same sample + tbi_branch = SENTIEON_TNHAPLOTYPER2_PAIRED.out.index.branch{ + // Use meta.num_intervals to asses number of intervals + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + // Figuring out if there is one or more vcf(s) from the same sample + stats_branch = SENTIEON_TNHAPLOTYPER2_PAIRED.out.stats.branch{ + // Use meta.num_intervals to asses number of intervals + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + /* TO-DO: Doesn't seem relevant for tnhaplotyper2 + // Figuring out if there is 
one or more vcf(s) from the same sample + f1r2_branch = SENTIEON_TNHAPLOTYPER2.out.f1r2.branch{ + // Use meta.num_intervals to asses number of intervals + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + */ + +/* + // Only when using intervals + vcf_to_merge = vcf_branch.intervals.map{ meta, vcf -> [ groupKey(meta, meta.num_intervals), vcf ] }.groupTuple() + stats_to_merge = stats_branch.intervals.map{ meta, stats -> [ groupKey(meta, meta.num_intervals), stats ] }.groupTuple() + // TO-DO: Doesn't seem relevant for tnhaplotyper2 + // f1r2_to_merge = f1r2_branch.intervals.map{ meta, f1r2 -> [ groupKey(meta, meta.num_intervals), f1r2 ] }.groupTuple() + + MERGE_TNHAPLOTYPER2(vcf_to_merge, dict) + MERGEMUTECTSTATS(stats_to_merge) + + // Mix intervals and no_intervals channels together and remove no longer necessary field: normal_id, tumor_id, num_intervals + vcf = Channel.empty().mix(MERGE_TNHAPLOTYPER2.out.vcf, vcf_branch.no_intervals).map{ meta, vcf -> + [ joint_mutect2 ? meta - meta.subMap('normal_id', 'num_intervals') : meta - meta.subMap('num_intervals') , vcf ] + } + tbi = Channel.empty().mix(MERGE_TNHAPLOTYPER2.out.tbi, tbi_branch.no_intervals).map{ meta, tbi-> + [ joint_mutect2 ? meta - meta.subMap('normal_id', 'num_intervals') : meta - meta.subMap('num_intervals'), tbi ] + } + stats = Channel.empty().mix(MERGEMUTECTSTATS.out.stats, stats_branch.no_intervals).map{ meta, stats -> + [ joint_mutect2 ? meta - meta.subMap('normal_id', 'num_intervals') : meta - meta.subMap('num_intervals'), stats ] + } + // TO-DO: Doesn't seem relevant for tnhaplotyper2 + /* + f1r2 = Channel.empty().mix(f1r2_to_merge, f1r2_branch.no_intervals).map{ meta, f1r2-> + [ joint_mutect2 ? 
meta - meta.subMap('normal_id', 'num_intervals') : meta - meta.subMap('num_intervals') , f1r2 ] + } + */ + + // TO-DO: Doesn't seem relevant for tnhaplotyper2 + // Generate artifactpriors using learnreadorientationmodel on the f1r2 output of mutect2 + // LEARNREADORIENTATIONMODEL(f1r2) +/* + pileup = input_intervals.multiMap{ meta, input_list, input_index_list, intervals -> + tumor: [ meta, input_list[1], input_index_list[1], intervals ] + normal: [ meta, input_list[0], input_index_list[0], intervals ] + } + + // Prepare input channel for normal pileup summaries. + // Remember, the input channel contains tumor-normal pairs, so there will be multiple copies of the normal sample for each tumor for a given patient. + // Therefore, we use unique function to generate normal pileup summaries once for each patient for better efficiency. + pileup_normal = pileup.normal.map{ meta, cram, crai, intervals -> [ meta - meta.subMap('tumor_id') + [ id:meta.normal_id ], cram, crai, intervals] }.unique() + // Prepare input channel for tumor pileup summaries. + pileup_tumor = pileup.tumor.map{ meta, cram, crai, intervals -> [ meta - meta.subMap('normal_id') + [ id:meta.tumor_id ], cram, crai, intervals ] } + + // Generate pileup summary tables using getepileupsummaries. 
tumor sample should always be passed in as the first input and input list entries of vcf_to_filter, + GETPILEUPSUMMARIES_NORMAL(pileup_normal, fasta, fai, dict, germline_resource_pileup, germline_resource_pileup_tbi) + GETPILEUPSUMMARIES_TUMOR(pileup_tumor, fasta, fai, dict, germline_resource_pileup, germline_resource_pileup_tbi) + + // Figuring out if there is one or more table(s) from the same sample + pileup_table_normal_branch = GETPILEUPSUMMARIES_NORMAL.out.table.branch{ + // Use meta.num_intervals to asses number of intervals + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + // Figuring out if there is one or more table(s) from the same sample + pileup_table_tumor_branch = GETPILEUPSUMMARIES_TUMOR.out.table.branch{ + // Use meta.num_intervals to asses number of intervals + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + // Only when using intervals + pileup_table_normal_to_merge = pileup_table_normal_branch.intervals.map{ meta, table -> [ groupKey(meta, meta.num_intervals), table ] }.groupTuple() + pileup_table_tumor_to_merge = pileup_table_tumor_branch.intervals.map{ meta, table -> [ groupKey(meta, meta.num_intervals), table ] }.groupTuple() + + // Merge Pileup Summaries + GATHERPILEUPSUMMARIES_NORMAL(pileup_table_normal_to_merge, dict.map{ meta, dict -> [ dict ] }) + GATHERPILEUPSUMMARIES_TUMOR(pileup_table_tumor_to_merge, dict.map{ meta, dict -> [ dict ] }) + + // Do some channel magic to generate tumor-normal pairs again. + // This is necessary because we generated one normal pileup summary for each patient but we need run calculate contamination for each tumor-normal pair. 
+ pileup_table_tumor = Channel.empty().mix(GATHERPILEUPSUMMARIES_TUMOR.out.table, pileup_table_tumor_branch.no_intervals).map{meta, table -> [ meta - meta.subMap('normal_id', 'tumor_id', 'num_intervals') + [id:meta.patient], meta.id, table ] } + pileup_table_normal = Channel.empty().mix(GATHERPILEUPSUMMARIES_NORMAL.out.table, pileup_table_normal_branch.no_intervals).map{meta, table -> [ meta - meta.subMap('normal_id', 'tumor_id', 'num_intervals') + [id:meta.patient], meta.id, table ] } + + ch_calculatecontamination_in_tables = pileup_table_tumor.combine( + pileup_table_normal, by:0).map{ + meta, tumor_id, tumor_table, normal_id, normal_table -> + if(joint_mutect2){ + [ meta + [ id: tumor_id + "_vs_" + normal_id], tumor_table, normal_table] + }else{ + // we need tumor and normal ID for further post processing + [ meta + [ id: tumor_id + "_vs_" + normal_id, normal_id:normal_id, tumor_id:tumor_id ], tumor_table, normal_table] + } + } + + CALCULATECONTAMINATION(ch_calculatecontamination_in_tables) +*/ + // Initialize empty channel: Contamination calculation is run on pileup table, pileup is not run if germline resource is not provided + calculatecontamination_out_seg = Channel.empty() + calculatecontamination_out_cont = Channel.empty() +/* + if (joint_mutect2) { + // Reduce the meta to only patient name + calculatecontamination_out_seg = CALCULATECONTAMINATION.out.segmentation.map{ meta, seg -> [ meta + [id: meta.patient], seg]}.groupTuple() + calculatecontamination_out_cont = CALCULATECONTAMINATION.out.contamination.map{ meta, cont -> [ meta + [id: meta.patient], cont]}.groupTuple() + } + else { + // Keep tumor_vs_normal ID + calculatecontamination_out_seg = CALCULATECONTAMINATION.out.segmentation + calculatecontamination_out_cont = CALCULATECONTAMINATION.out.contamination + } + + // Mutect2 calls filtered by filtermutectcalls using the artifactpriors, contamination and segmentation tables + // meta joint calling: [id:patient_id, patient, sex] + // meta paired 
calling: [id:tumorID_vs_normalID, normal_ID, patient, sex, tumorID] + vcf_to_filter = vcf.join(tbi, failOnDuplicate: true, failOnMismatch: true) + .join(stats, failOnDuplicate: true, failOnMismatch: true) + // .join(LEARNREADORIENTATIONMODEL.out.artifactprior, failOnDuplicate: true, failOnMismatch: true) + .join(calculatecontamination_out_seg) + .join(calculatecontamination_out_cont) + // .map{ meta, vcf, tbi, stats, orientation, seg, cont -> [ meta, vcf, tbi, stats, orientation, seg, cont, [] ] } + .map{ meta, vcf, tbi, stats, seg, cont -> [ meta, vcf, tbi, stats, [], seg, cont, [] ] } + +*/ + + SENTIEON_TNFILTER(vcf_to_filter, fasta, fai) + + vcf_filtered = SENTIEON_TNFILTER.out.vcf + // add variantcaller to meta map + .map{ meta, vcf -> [ meta + [ variantcaller:'sentieon_tnhaplotyper2' ], vcf ] } + + /* INFO: Some of the following may be needed later... + TO-DO: Clean up the following + versions = versions.mix(MERGE_TNHAPLOTYPER2.out.versions) + versions = versions.mix(CALCULATECONTAMINATION.out.versions) + */ + versions = versions.mix(SENTIEON_TNFILTER.out.versions) + /* + versions = versions.mix(GETPILEUPSUMMARIES_NORMAL.out.versions) + versions = versions.mix(GETPILEUPSUMMARIES_TUMOR.out.versions) + versions = versions.mix(GATHERPILEUPSUMMARIES_NORMAL.out.versions) + versions = versions.mix(GATHERPILEUPSUMMARIES_TUMOR.out.versions) + */ + // TO-DO: Doesn't seem relevant for tnhaplotyper2 + // versions = versions.mix(LEARNREADORIENTATIONMODEL.out.versions) + // INFO: May be needed later... + // versions = versions.mix(MERGEMUTECTSTATS.out.versions) + versions = versions.mix(SENTIEON_TNHAPLOTYPER2_PAIRED.out.versions) + + emit: + // INFO: vcf and stats before came from vcf = Channel.empty().mix(MERGE_TNHAPLOTYPER2.out.vcf etc. 
+ vcf = SENTIEON_TNHAPLOTYPER2_PAIRED.out.vcf // channel: [ meta, vcf ] + stats = SENTIEON_TNHAPLOTYPER2_PAIRED.out.stats // channel: [ meta, stats ] + + vcf_filtered // channel: [ meta, vcf ] + index_filtered = SENTIEON_TNFILTER.out.vcf_tbi // channel: [ meta, tbi ] + stats_filtered = SENTIEON_TNFILTER.out.stats // channel: [ meta, stats ] + + // TO-DO: Doesn't seem relevant for tnhaplotyper2 + // artifact_priors = LEARNREADORIENTATIONMODEL.out.artifactprior // channel: [ meta, artifactprior ] + + // INFO: Temporarily just set to empty channels + pileup_table_normal = Channel.empty() // channel: [ meta, table_normal ] + pileup_table_tumor = Channel.empty() // channel: [ meta, table_tumor ] + + contamination_table = calculatecontamination_out_cont // channel: [ meta, contamination ] + segmentation_table = calculatecontamination_out_seg // channel: [ meta, segmentation ] + + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/bam_variant_calling_somatic_sentieon_tnhaplotyper2/meta.yml b/subworkflows/local/bam_variant_calling_somatic_sentieon_tnhaplotyper2/meta.yml new file mode 100644 index 0000000000..b6ce6980fb --- /dev/null +++ b/subworkflows/local/bam_variant_calling_somatic_sentieon_tnhaplotyper2/meta.yml @@ -0,0 +1,128 @@ +name: gatk_tumor_normal_somatic_variant_calling +description: | + TO-DO: Update the docs. Mutect2 -> Tnhaplotyper2 + Perform variant calling on a paired tumor normal set of samples using mutect2 tumor normal mode. + f1r2 output of mutect2 is run through learnreadorientationmodel to get the artifact priors. + Run the input bam files through getpileupsummarries and then calculatecontamination to get the contamination and segmentation tables. + Filter the mutect2 output vcf using filtermutectcalls, artifact priors and the contamination & segmentation tables for additional filtering. 
+keywords: + - gatk4 + - mutect2 + - learnreadorientationmodel + - getpileupsummaries + - calculatecontamination + - filtermutectcalls + - variant_calling + - tumor_only + - filtered_vcf +modules: + - gatk4/mutect2 + - gatk4/learnreadorientationmodel + - gatk4/getpileupsummaries + - gatk4/calculatecontamination + - gatk4/filtermutectcalls +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - input: + type: list + description: list containing the tumor and normal BAM files, in that order, also able to take CRAM as an input + pattern: "[ *.{bam/cram} ]" + - input_index: + type: list + description: list containing the tumor and normal BAM file indexes, in that order, also able to take CRAM index as an input + pattern: "[ *.{bam.bai/cram.crai} ]" + - which_norm: + type: list + description: optional list of sample headers contained in the normal sample input file. + pattern: "testN" + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - fai: + type: file + description: Index of reference fasta file + pattern: "*.fasta.fai" + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" + - germline_resource: + type: file + description: Population vcf of germline sequencing, containing allele fractions. + pattern: "*.vcf.gz" + - germline_resource_tbi: + type: file + description: Index file for the germline resource. + pattern: "*.vcf.gz.tbi" + - panel_of_normals: + type: file + description: vcf file to be used as a panel of normals. + pattern: "*.vcf.gz" + - panel_of_normals_tbi: + type: file + description: Index for the panel of normals. + pattern: "*.vcf.gz.tbi" + - interval_file: + type: file + description: File containing intervals. 
+ pattern: "*.interval_list" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - mutect2_vcf: + type: file + description: Compressed vcf file to be used for variant_calling. + pattern: "[ *.vcf.gz ]" + - mutect2_tbi: + type: file + description: Indexes of the mutect2_vcf file + pattern: "[ *vcf.gz.tbi ]" + - mutect2_stats: + type: file + description: Stats files for the mutect2 vcf + pattern: "[ *vcf.gz.stats ]" + - mutect2_f1r2: + type: file + description: file containing information to be passed to LearnReadOrientationModel. + pattern: "*.f1r2.tar.gz" + - artifact_priors: + type: file + description: file containing artifact-priors to be used by filtermutectcalls. + pattern: "*.tar.gz" + - pileup_table_tumor: + type: file + description: File containing the tumor pileup summary table, kept separate as calculatecontamination needs them individually specified. + pattern: "*_tumor.pileups.table" + - pileup_table_normal: + type: file + description: File containing the normal pileup summary table, kept separate as calculatecontamination needs them individually specified. + pattern: "*_normal.pileups.table" + - contamination_table: + type: file + description: File containing the contamination table. + pattern: "*.contamination.table" + - segmentation_table: + type: file + description: Output table containing segmentation of tumor minor allele fractions. + pattern: "*.segmentation.table" + - filtered_vcf: + type: file + description: file containing filtered mutect2 calls. + pattern: "*.vcf.gz" + - filtered_tbi: + type: file + description: tbi file that pairs with filtered vcf. + pattern: "*.vcf.gz.tbi" + - filtered_stats: + type: file + description: file containing statistics of the filtermutectcalls run. 
+ pattern: "*.filteringStats.tsv" +authors: + - "@GCJMackenzie" diff --git a/tests/test_sentieon_tnhaplotyper2.yml b/tests/test_sentieon_tnhaplotyper2.yml new file mode 100644 index 0000000000..c3ac7c80b3 --- /dev/null +++ b/tests/test_sentieon_tnhaplotyper2.yml @@ -0,0 +1,89 @@ +# TO-DO: Adjust these tests +# - name: Run variant calling on tumor only sample with mutect2 +# command: nextflow run main.nf -profile test_cache,tools_tumoronly --tools mutect2 --outdir results +# tags: +# - mutect2 +# - tumor_only +# - variant_calling +# files: +# - path: results/csv/variantcalled.csv +# md5sum: d57c1beba9005e9790a573bd93398b72 +# - path: results/multiqc +# - path: results/reports/bcftools/mutect2/sample2/sample2.mutect2.filtered.bcftools_stats.txt +# # conda changes md5sums for test +# - path: results/reports/vcftools/mutect2/sample2/sample2.mutect2.filtered.FILTER.summary +# md5sum: ef9bd9a2f41d8872ba25e5616e4c2a5e +# - path: results/reports/vcftools/mutect2/sample2/sample2.mutect2.filtered.TsTv.count +# md5sum: fe3ff1f0c2ead72f037552727438e00a +# - path: results/reports/vcftools/mutect2/sample2/sample2.mutect2.filtered.TsTv.qual +# # conda changes md5sums for test +# - path: results/variant_calling/mutect2/sample2/sample2.mutect2.artifactprior.tar.gz +# # binary changes md5sums on reruns +# - path: results/variant_calling/mutect2/sample2/sample2.mutect2.contamination.table +# md5sum: 46c708c943b453da89a3da08acfdb2a7 +# - path: results/variant_calling/mutect2/sample2/sample2.mutect2.filtered.vcf.gz +# # binary changes md5sums on reruns +# - path: results/variant_calling/mutect2/sample2/sample2.mutect2.filtered.vcf.gz.filteringStats.tsv +# md5sum: 9a8439d0bb5875f1e673cf592af85ffb +# - path: results/variant_calling/mutect2/sample2/sample2.mutect2.filtered.vcf.gz.tbi +# # binary changes md5sums on reruns +# - path: results/variant_calling/mutect2/sample2/sample2.mutect2.pileups.table +# md5sum: 9afe42339f590937166edcf4746c22ec +# - path: 
results/variant_calling/mutect2/sample2/sample2.mutect2.segmentation.table +# md5sum: f4643d9319bde4efbfbe516d6fb13052 +# - path: results/variant_calling/mutect2/sample2/sample2.mutect2.vcf.gz +# # binary changes md5sums on reruns +# - path: results/variant_calling/mutect2/sample2/sample2.mutect2.vcf.gz.stats +# md5sum: 3cc40a35727af6c5223fb45678f3f172 +# - path: results/variant_calling/mutect2/sample2/sample2.mutect2.vcf.gz.tbi +# # binary changes md5sums on reruns +# - path: results/mutect2 +# should_exist: false +- name: Run variant calling on tumor only sample with sentieon tnhaplotyper2 without intervals + command: nextflow run main.nf -profile test_cache,tools_tumoronly,software_license --sentieon_extension --tools sentieon_tnhaplotyper2 --no_intervals --outdir results + tags: + - tnhaplotyper2 + - sentieon + - no_intervals + - tumor_only + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: d57c1beba9005e9790a573bd93398b72 + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/reports/bcftools/mutect2/sample2/sample2.mutect2.filtered.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/mutect2/sample2/sample2.mutect2.filtered.FILTER.summary + md5sum: 5a833fd50e6efb26d1df2336eb0caf5e + - path: results/reports/vcftools/mutect2/sample2/sample2.mutect2.filtered.TsTv.count + md5sum: f5295a61da80f12babae74fe4e104aad + - path: results/reports/vcftools/mutect2/sample2/sample2.mutect2.filtered.TsTv.qual + # conda changes md5sums for test + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.artifactprior.tar.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.contamination.table + md5sum: 
46c708c943b453da89a3da08acfdb2a7 + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.filtered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.filtered.vcf.gz.filteringStats.tsv + md5sum: e4eac0c602dd25aa61a6dc26a2b61844 + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.filtered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.pileups.table + md5sum: fe35b6bc041f2df8bd1f23420af3ddf9 + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.segmentation.table + md5sum: f4643d9319bde4efbfbe516d6fb13052 + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.vcf.gz.stats + md5sum: 55ed641e16089afb33cdbc478e202d3d + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/mutect2 + should_exist: false diff --git a/tests/test_sentieon_tnhaplotyper2_manually.yml b/tests/test_sentieon_tnhaplotyper2_manually.yml new file mode 100644 index 0000000000..db1075bdff --- /dev/null +++ b/tests/test_sentieon_tnhaplotyper2_manually.yml @@ -0,0 +1,186 @@ +- name: Run variant calling on somatic sample with tnhaplotyper2 without intervals + command: nextflow run main.nf -profile test_cache,tools_somatic_tnhaplotyper2,software_license --sentieon_extension --no_intervals --outdir results + tags: + - tnhaplotyper2_manual + - sentieon + - manual + - no_intervals + - somatic + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: d3c9f0559d48696c54f3c463b1606586 + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: 
f3dac01ea66b95fe477446fde2d31489 + - path: results/reports/bcftools/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.bcftools_stats.txt + md5sum: 9876607145d11c6b8492264936d7a82c + - path: results/reports/vcftools/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.FILTER.summary + md5sum: b25d4d2a64f9590d0ffb119fd3adb06e + - path: results/reports/vcftools/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.TsTv.count + md5sum: 3739f24da2d2019cc4bc2821e30658eb + - path: results/reports/vcftools/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.TsTv.qual + md5sum: 05c0cbb017d9232bc728d48f9d0c7afd + - path: results/variant_calling/mutect2/sample3/sample3.mutect2.pileups.table + md5sum: 8e0ca6f66e112bd2f7ec1d31a2d62469 + - path: results/variant_calling/mutect2/sample4/sample4.mutect2.pileups.table + md5sum: fe35b6bc041f2df8bd1f23420af3ddf9 + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.artifactprior.tar.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.contamination.table + md5sum: 46c708c943b453da89a3da08acfdb2a7 + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.vcf.gz.filteringStats.tsv + md5sum: 9ae27fbd04af1a2ea574e2ff1c3a683b + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.segmentation.table + md5sum: f4643d9319bde4efbfbe516d6fb13052 + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.vcf.gz + # binary changes md5sums on reruns + - path: 
results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.vcf.gz.stats + md5sum: 17d2091015d04cbd4a26b7a67dc659e6 + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.vcf.gz.tbi + # binary changes md5sums on reruns +# All the following needs to be changed to tests of sentieon/tnhaplotyper2 +# - name: Run variant calling on somatic sample with mutect2 +# command: nextflow run main.nf -profile test_cache,tools_somatic --tools mutect2 --outdir results +# tags: +# - mutect2_manual +# - manual +# - somatic +# - variant_calling +# files: +# - path: results/csv/variantcalled.csv +# md5sum: d3c9f0559d48696c54f3c463b1606586 +# - path: results/multiqc +# - path: results/reports/bcftools/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.bcftools_stats.txt +# md5sum: 9876607145d11c6b8492264936d7a82c +# - path: results/reports/vcftools/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.FILTER.summary +# md5sum: b25d4d2a64f9590d0ffb119fd3adb06e +# - path: results/reports/vcftools/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.TsTv.count +# md5sum: 3739f24da2d2019cc4bc2821e30658eb +# - path: results/reports/vcftools/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.TsTv.qual +# md5sum: 05c0cbb017d9232bc728d48f9d0c7afd +# - path: results/variant_calling/mutect2/sample3/sample3.mutect2.pileups.table +# md5sum: 16077fdb885a8afe64c7669477471354 +# - path: results/variant_calling/mutect2/sample4/sample4.mutect2.pileups.table +# md5sum: 9afe42339f590937166edcf4746c22ec +# - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.artifactprior.tar.gz +# # binary changes md5sums on reruns +# - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.contamination.table +# md5sum: 46c708c943b453da89a3da08acfdb2a7 +# - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.vcf.gz +# # binary 
changes md5sums on reruns +# - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.vcf.gz.filteringStats.tsv +# md5sum: 9ae27fbd04af1a2ea574e2ff1c3a683b +# - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.vcf.gz.tbi +# # binary changes md5sums on reruns +# - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.segmentation.table +# md5sum: f4643d9319bde4efbfbe516d6fb13052 +# - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.vcf.gz +# # binary changes md5sums on reruns +# - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.vcf.gz.stats +# md5sum: c09dff3f145d77d4848992e244811c08 +# - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.vcf.gz.tbi +# # binary changes md5sums on reruns +# - name: Run joint calling on tumor only samples with mutect2 +# command: nextflow run main.nf -profile test_cache,tools_tumoronly --input tests/csv/3.0/recalibrated_tumoronly_joint.csv --tools mutect2 --joint_mutect2 --outdir results +# tags: +# - mutect2_manual +# - manual +# - tumor_only +# - variant_calling +# - multi_sample +# - joint_tumoronly +# files: +# - path: results/csv/variantcalled.csv +# md5sum: f87290ce1c6ea523e08354ed6c258b0b +# - path: results/multiqc +# - path: results/reports/bcftools/mutect2/test/test.mutect2.filtered.bcftools_stats.txt +# md5sum: a0cdc26fb7d8c446dd0283fed71a24d5 +# - path: results/reports/vcftools/mutect2/test/test.mutect2.filtered.FILTER.summary +# md5sum: e1e42b6f65cbdba116cff72a56e40c4b +# - path: results/reports/vcftools/mutect2/test/test.mutect2.filtered.TsTv.count +# md5sum: c00e1639a41deb107099487676a6cf37 +# - path: results/reports/vcftools/mutect2/test/test.mutect2.filtered.TsTv.qual +# md5sum: a21016aa99e5cbf32eeae1b405ca6d8d +# - path: results/variant_calling/mutect2/test/sample2.mutect2.contamination.table +# md5sum: 
46c708c943b453da89a3da08acfdb2a7 +# - path: results/variant_calling/mutect2/sample2/sample2.mutect2.pileups.table +# md5sum: 9afe42339f590937166edcf4746c22ec +# - path: results/variant_calling/mutect2/test/sample2.mutect2.segmentation.table +# md5sum: f4643d9319bde4efbfbe516d6fb13052 +# - path: results/variant_calling/mutect2/test/sample3.mutect2.contamination.table +# md5sum: 11440fe64b5b953d7efb9cf47e330364 +# - path: results/variant_calling/mutect2/sample3/sample3.mutect2.pileups.table +# md5sum: fd0c1f7819717b7f94e52f6611f4b2e0 +# - path: results/variant_calling/mutect2/test/sample3.mutect2.segmentation.table +# md5sum: 38f83e2f98b206640644dd93d5e96f4e +# - path: results/variant_calling/mutect2/test/test.mutect2.artifactprior.tar.gz +# # binary changes md5sums on reruns +# - path: results/variant_calling/mutect2/test/test.mutect2.filtered.vcf.gz +# # binary changes md5sums on reruns +# - path: results/variant_calling/mutect2/test/test.mutect2.filtered.vcf.gz.filteringStats.tsv +# md5sum: f237666ae325fde0c06b8bc62d2846fc +# - path: results/variant_calling/mutect2/test/test.mutect2.filtered.vcf.gz.tbi +# # binary changes md5sums on reruns +# - path: results/variant_calling/mutect2/test/test.mutect2.vcf.gz +# # binary changes md5sums on reruns +# - path: results/variant_calling/mutect2/test/test.mutect2.vcf.gz.stats +# md5sum: 22e58aef3b14b335fa487d40b590ffeb +# - path: results/variant_calling/mutect2/test/test.mutect2.vcf.gz.tbi +# # binary changes md5sums on reruns +# - name: Run joint calling on somatic samples with mutect2 +# command: nextflow run main.nf -profile test_cache,tools_somatic --input tests/csv/3.0/recalibrated_somatic_joint.csv --tools mutect2 --joint_mutect2 --outdir results +# tags: +# - mutect2_manual +# - somatic +# - variant_calling +# - multi_sample +# - joint_somatic +# files: +# - path: results/csv/variantcalled.csv +# md5sum: f87290ce1c6ea523e08354ed6c258b0b +# - path: results/multiqc +# - path: 
results/reports/bcftools/mutect2/test/test.mutect2.filtered.bcftools_stats.txt +# md5sum: d75da410d57960944f54d02b2b5cdcac +# - path: results/reports/vcftools/mutect2/test/test.mutect2.filtered.FILTER.summary +# md5sum: e0eb3e34fc15f3b452bfc43f032cc8be +# - path: results/reports/vcftools/mutect2/test/test.mutect2.filtered.TsTv.count +# md5sum: aa51bde6080c015c6aa6c8254977dd11 +# - path: results/reports/vcftools/mutect2/test/test.mutect2.filtered.TsTv.qual +# md5sum: 262f843f68d072c457ca28b56da3ede8 +# - path: results/variant_calling/mutect2/sample1/sample1.mutect2.pileups.table +# md5sum: 16077fdb885a8afe64c7669477471354 +# - path: results/variant_calling/mutect2/sample2/sample2.mutect2.pileups.table +# md5sum: 9afe42339f590937166edcf4746c22ec +# - path: results/variant_calling/mutect2/sample3/sample3.mutect2.pileups.table +# md5sum: fd0c1f7819717b7f94e52f6611f4b2e0 +# - path: results/variant_calling/mutect2/test/sample2_vs_sample1.mutect2.contamination.table +# md5sum: 46c708c943b453da89a3da08acfdb2a7 +# - path: results/variant_calling/mutect2/test/sample3_vs_sample1.mutect2.contamination.table +# md5sum: 11440fe64b5b953d7efb9cf47e330364 +# - path: results/variant_calling/mutect2/test/sample2_vs_sample1.mutect2.segmentation.table +# md5sum: f4643d9319bde4efbfbe516d6fb13052 +# - path: results/variant_calling/mutect2/test/sample3_vs_sample1.mutect2.segmentation.table +# md5sum: 38f83e2f98b206640644dd93d5e96f4e +# - path: results/variant_calling/mutect2/test/test.mutect2.artifactprior.tar.gz +# # binary changes md5sums on reruns +# - path: results/variant_calling/mutect2/test/test.mutect2.filtered.vcf.gz +# # binary changes md5sums on reruns +# - path: results/variant_calling/mutect2/test/test.mutect2.filtered.vcf.gz.filteringStats.tsv +# md5sum: dee72b4c5c9bbda01d44fd3e00f1b404 +# - path: results/variant_calling/mutect2/test/test.mutect2.filtered.vcf.gz.tbi +# # binary changes md5sums on reruns +# - path: results/variant_calling/mutect2/test/test.mutect2.vcf.gz +# # 
binary changes md5sums on reruns +# - path: results/variant_calling/mutect2/test/test.mutect2.vcf.gz.stats +# md5sum: 094cb75b0bda28e92b6718ff33d136e2 +# - path: results/variant_calling/mutect2/test/test.mutect2.vcf.gz.tbi +# # binary changes md5sums on reruns diff --git a/workflows/sarek.nf b/workflows/sarek.nf index a6ed5b2e50..2b1045b020 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -310,12 +310,13 @@ if (params.tools && error("When using Sentieon Haplotyper for joint-germline variant-calling the option `--sentieon_haplotyper_emit_mode` has to include `gvcf`.") } - +/* INFO: Temporarily disabled + TO-DO: Clean this up! // Fails when --joint_mutect2 is used without enabling mutect2 if (params.joint_mutect2 && (!params.tools || !params.tools.split(',').contains('mutect2'))) { error("The mutect2 should be specified as one of the tools when doing joint somatic variant calling with Mutect2. (The mutect2 could be specified by adding `--tools mutect2` to the nextflow command.)") } - +*/ // Fails when missing tools for variant_calling or annotate if ((params.step == 'variant_calling' || params.step == 'annotate') && !params.tools) { error("Please specify at least one tool when using `--step ${params.step}`.\nhttps://nf-co.re/sarek/parameters#tools") From 12114748c3fa3dbfdecc1f04994f62dfbcecb2eb Mon Sep 17 00:00:00 2001 From: asp8200 Date: Fri, 24 Nov 2023 10:19:56 +0000 Subject: [PATCH 2/4] Introducing option joint_tnhaplotyper2 as replacement of joint_mutect2 in tnhaplotyper-subworkflow --- conf/modules/sentieon_tnhaplotyper2.config | 3 +-- nextflow.config | 3 ++- nextflow_schema.json | 5 +++++ subworkflows/local/bam_variant_calling_somatic_all/main.nf | 7 ++++--- .../main.nf | 6 ++---- workflows/sarek.nf | 1 + 6 files changed, 15 insertions(+), 10 deletions(-) diff --git a/conf/modules/sentieon_tnhaplotyper2.config b/conf/modules/sentieon_tnhaplotyper2.config index 251fe47ed8..6b43914e86 100644 --- a/conf/modules/sentieon_tnhaplotyper2.config +++ 
b/conf/modules/sentieon_tnhaplotyper2.config @@ -83,8 +83,7 @@ process { ] } - // TO-DO: Don't use joint_mutect2 for tnhaplotyper2 - if (params.joint_mutect2) { + if (params.joint_tnhaplotyper2) { withName: '.*_SENTIEON_TNHAPLOTYPER2:CALCULATECONTAMINATION' { publishDir = [ mode: params.publish_dir_mode, diff --git a/nextflow.config b/nextflow.config index ea9c34d807..327c3d33ae 100644 --- a/nextflow.config +++ b/nextflow.config @@ -69,7 +69,8 @@ params { concatenate_vcfs = false // by default we don't concatenate the germline-vcf-files ignore_soft_clipped_bases = false // no --dont-use-soft-clipped-bases for GATK Mutect2 joint_germline = false // g.vcf & joint germline calling are not run by default if HaplotypeCaller is selected - joint_mutect2 = false // if true, enables patient-wise multi-sample somatic variant calling + joint_mutect2 = false // if true, enables patient-wise multi-sample somatic variant calling with mutect2 + joint_tnhaplotyper2 = false // if true, enables patient-wise multi-sample somatic variant calling with tnhaplotyper2 only_paired_variant_calling = false // if true, skips germline variant calling for normal-paired sample sentieon_dnascope_emit_mode = 'variant' // default value for Sentieon dnascope sentieon_dnascope_pcr_indel_model = 'CONSERVATIVE' diff --git a/nextflow_schema.json b/nextflow_schema.json index e806139f1d..eb061d52b1 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -254,6 +254,11 @@ "description": "Turn on the joint germline variant calling for GATK haplotypecaller", "help_text": "Uses all normal germline samples (as designated by `status` in the input csv) in the joint germline variant calling process." }, + "joint_tnhyplotyper2": { + "type": "boolean", + "fa_icon": "fas fa-angle-double-right", + "description": "TO-DO: Check if the following is OK: Runs tnhaplotyper2 in joint (multi-sample) mode for better concordance among variant calls of tumor samples from the same patient. 
tnhaplotyper2 outputs will be stored in a subfolder named with patient ID under `variant_calling/tnhaplotyper2/` folder. Only a single normal sample per patient is allowed. Tumor-only mode is also supported." + }, "joint_mutect2": { "type": "boolean", "fa_icon": "fas fa-angle-double-right", diff --git a/subworkflows/local/bam_variant_calling_somatic_all/main.nf b/subworkflows/local/bam_variant_calling_somatic_all/main.nf index 68fa768c88..e59d895da2 100644 --- a/subworkflows/local/bam_variant_calling_somatic_all/main.nf +++ b/subworkflows/local/bam_variant_calling_somatic_all/main.nf @@ -42,6 +42,7 @@ workflow BAM_VARIANT_CALLING_SOMATIC_ALL { gc_file // channel: [optional] ascat gc content file rt_file // channel: [optional] ascat rt file joint_mutect2 // boolean: [mandatory] [default: false] run mutect2 in joint mode + joint_tnhaplotyper2 // boolean: [mandatory] [default: false] run tnhaplotyper2 in joint mode wes // boolean: [mandatory] [default: false] whether targeted data is processed main: @@ -148,9 +149,9 @@ workflow BAM_VARIANT_CALLING_SOMATIC_ALL { BAM_VARIANT_CALLING_SOMATIC_SENTIEON_TNHAPLOTYPER2( // Remap channel to match module/subworkflow // Adjust meta.map to simplify joining channels - // joint_mutect2 mode needs different meta.map than regular mode + // joint_tnhaplotyper2 mode needs different meta.map than regular mode cram.map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai -> - joint_mutect2 ? + joint_tnhaplotyper2 ? 
//we need to keep all fields and then remove on a per-tool-basis to ensure proper joining at the filtering step [ meta + [ id:meta.patient ], [ normal_cram, tumor_cram ], [ normal_crai, tumor_crai ] ] : [ meta, [ normal_cram, tumor_cram ], [ normal_crai, tumor_crai ] ] @@ -165,7 +166,7 @@ workflow BAM_VARIANT_CALLING_SOMATIC_ALL { panel_of_normals, panel_of_normals_tbi, intervals, - joint_mutect2 + joint_tnhaplotyper2 ) vcf_sentieon_tnhaplotyper2 = BAM_VARIANT_CALLING_SOMATIC_SENTIEON_TNHAPLOTYPER2.out.vcf_filtered diff --git a/subworkflows/local/bam_variant_calling_somatic_sentieon_tnhaplotyper2/main.nf b/subworkflows/local/bam_variant_calling_somatic_sentieon_tnhaplotyper2/main.nf index e9d268e251..c38b4bc9c4 100644 --- a/subworkflows/local/bam_variant_calling_somatic_sentieon_tnhaplotyper2/main.nf +++ b/subworkflows/local/bam_variant_calling_somatic_sentieon_tnhaplotyper2/main.nf @@ -22,7 +22,7 @@ workflow BAM_VARIANT_CALLING_SOMATIC_SENTIEON_TNHAPLOTYPER2 { panel_of_normals // channel: /path/to/panel/of/normals panel_of_normals_tbi // channel: /path/to/panel/of/normals/index intervals // channel: [mandatory] [ intervals, num_intervals ] or [ [], 0 ] if no intervals - joint_mutect2 // boolean: [mandatory] [default: false] run mutect2 in joint mode + joint_tnhaplotyper2 // boolean: [mandatory] [default: false] run tnhaplotyper2 in joint mode main: versions = Channel.empty() @@ -36,9 +36,7 @@ workflow BAM_VARIANT_CALLING_SOMATIC_SENTIEON_TNHAPLOTYPER2 { // Move num_intervals to meta map and reorganize channel for SENTIEON_TNHAPLOTYPER2 module .map{ meta, input_list, input_index_list, intervals, num_intervals -> [ meta + [ num_intervals:num_intervals ], input_list, input_index_list, intervals ] } - // TO-DO: Figure out if the variable joint_mutect2 should be (re)used for the tnhaplotyper2-subworkflow - // or perhaps we should introduce joint_tnhaplotyper2 - if (joint_mutect2) { + if (joint_tnhaplotyper2) { // Separate normal cram files // Extract tumor cram files ch_cram
= input.multiMap{ meta, cram, crai -> diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 5b0a9dcde9..b130c2f28b 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -1002,6 +1002,7 @@ workflow SAREK { gc_file, rt_file, params.joint_mutect2, + params.joint_tnhaplotyper2, params.wes ) From 9ddbcfbd02118c1f8c85db403a53dabc9eccd863 Mon Sep 17 00:00:00 2001 From: Simon Pearce <24893913+SPPearce@users.noreply.github.com> Date: Tue, 7 May 2024 10:24:38 +0000 Subject: [PATCH 3/4] Update subworkflow calling --- subworkflows/local/bam_variant_calling_somatic_all/main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/bam_variant_calling_somatic_all/main.nf b/subworkflows/local/bam_variant_calling_somatic_all/main.nf index 7f2bd02281..33e4e4a64f 100644 --- a/subworkflows/local/bam_variant_calling_somatic_all/main.nf +++ b/subworkflows/local/bam_variant_calling_somatic_all/main.nf @@ -157,9 +157,9 @@ workflow BAM_VARIANT_CALLING_SOMATIC_ALL { [ meta, [ normal_cram, tumor_cram ], [ normal_crai, tumor_crai ] ] }, // Remap channel to match module/subworkflow - fasta.map{ it -> [ [ id:'fasta' ], it ] }, + fasta, // Remap channel to match module/subworkflow - fasta_fai.map{ it -> [ [ id:'fasta_fai' ], it ] }, + fasta_fai, dict, germline_resource, germline_resource_tbi, From 2d8e078df1144e6ee788d80eef0662f5302cc8a5 Mon Sep 17 00:00:00 2001 From: Simon Pearce <24893913+SPPearce@users.noreply.github.com> Date: Tue, 7 May 2024 10:34:53 +0000 Subject: [PATCH 4/4] Fix typo --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 921878cf61..5d2da46d98 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -265,7 +265,7 @@ "description": "Turn on the joint germline variant calling for GATK haplotypecaller", "help_text": "Uses all normal germline samples (as designated by `status` in the input csv) in the joint germline variant calling 
process." }, - "joint_tnhyplotyper2": { + "joint_tnhaplotyper2": { "type": "boolean", "fa_icon": "fas fa-angle-double-right", "description": "TO-DO: Check if the following is OK: Runs tnhaplotyper2 in joint (multi-sample) mode for better concordance among variant calls of tumor samples from the same patient. tnhaplotyper2 outputs will be stored in a subfolder named with patient ID under `variant_calling/tnhaplotyper2/` folder. Only a single normal sample per patient is allowed. Tumor-only mode is also supported."