Merge pull request #80 from aertslab/develop

Develop Former-commit-id: c25c31f
vib-singlecell-nf · Dec 23, 2019 · 1ae363c · 1ae363c
2 parents b957eb1 + 1aee2e1
commit 1ae363c
Show file tree

Hide file tree

Showing 64 changed files with 1,366 additions and 454 deletions.
diff --git a/LICENSE b/LICENSE
diff --git a/README.md b/README.md
@@ -26,11 +26,11 @@ This will take only **~3min** to run.
 In your working directory, run `nextflow config ...` with the appropriate profiles:
 ```bash
 nextflow config aertslab/SingleCellTxBenchmark \
-    -profile singularity,single_sample > single_sample.config
+    -profile tenx,singularity,single_sample > single_sample.config
 ```
 Now, edit `single_sample.config`.
 Most of the default values are already set for the test dataset, but certain variables (e.g. container links) may need to be changed.
-In particular, `params.global.tenx_folder` should point to the `outs/` folder in the 10x data, and
+In particular, `params.data.tenx.cellranger_outs_dir_path` should point to the `outs/` folder generated by CellRanger, and
     `params.sc.file_converter` should be a path to the sample metadata file.
 
 3. The pipeline can be run using the config file just generated (`-C ...`), and specifying the `single_sample` workflow as an entrypoint:
@@ -77,7 +77,7 @@ For example, to run the `single_sample` workflow in a new working directory usin
 mkdir single_sample_test && cd single_sample_test
 
 nextflow config aertslab/SingleCellTxBenchmark \
-    -profile singularity,single_sample > single_sample.config
+    -profile tenx,singularity,single_sample > single_sample.config
 ```
 2. Now run the workflow using the new config file (using `-C` to use **only** this file), specifying the proper workflow as the entry point:
 ```bash
@@ -189,10 +189,10 @@ Let's say the file structure of your data looks like this,
             └── ...
 ```
 
-Setting the input directory appropriately will collect all the samples listed in the `filtered_feature_bc_matrix` directories listed above.
-For example, in `params.global`, setting:
+Setting the input directory appropriately will collect all the samples found in the `filtered_[feature|gene]_bc_matrix` directories shown above.
+For example, in `params.data.tenx`, setting:
 ```
-tenx_folder = "/home/data/cellranger/Sample*/outs/"
+cellranger_outs_dir_path = "/home/data/cellranger/Sample*/outs/"
 ```
 will recursively find all 10x samples in that directory.
 

diff --git a/conf/generic.config b/conf/generic.config
@@ -0,0 +1,17 @@
+params {
+    // parseConfig(sample, paramsGlobal, paramsLocal): resolves sample-specific parameters.
+    //   sample       - sample identifier used as the lookup key
+    //   paramsGlobal - passed through unchanged under the 'global' key
+    //   paramsLocal  - map of parameters; every value that is itself a Map is
+    //                  treated as a per-sample table and replaced by its entry
+    //                  for `sample`, or by its 'default' entry when the sample
+    //                  has no entry of its own
+    // Returns [global: paramsGlobal, local: <resolved copy of paramsLocal>].
+    // Throws Exception when a per-sample table contains neither the sample
+    // nor a 'default' entry.
+    parseConfig = { sample, paramsGlobal, paramsLocal ->
+        // A GString does not match an equal-looking String map key (different
+        // equals/hashCode), so normalise textual identifiers before the lookup.
+        def sampleKey = (sample instanceof CharSequence) ? sample.toString() : sample
+        def pL = paramsLocal.collectEntries { k, v ->
+            // Plain (non-Map) values apply to every sample as-is.
+            if (!(v instanceof Map))
+                return [k, v]
+            if (v.containsKey(sampleKey))
+                return [k, v[sampleKey]]
+            if (v.containsKey('default'))
+                return [k, v['default']]
+            throw new Exception("Not a valid entry in " + k + ". The sample " + sample + " is not found in " + v +" ; Make sure your samples are correctly specified when using the multi-sample feature.")
+        }
+        return [global: paramsGlobal, local: pL]
+    }
+}
diff --git a/conf/test__bbknn.config b/conf/test__bbknn.config
@@ -2,7 +2,11 @@
 params {
     global {
         project_name = 'bbknn_CI'
-        tenx_folder = "testdata/*/outs/"
+    }
+    data {
+        tenx {
+            cellranger_outs_dir_path = "testdata/*/outs/"
+        }
     }
     sc {
         file_annotator {

diff --git a/conf/test__scenic.config b/conf/test__scenic.config
@@ -2,7 +2,6 @@
 params {
     global {
         project_name = 'scenic_CI'
-        tenx_folder = ''
     }
     sc {
         file_annotator {

diff --git a/conf/test__single_sample.config b/conf/test__single_sample.config
@@ -2,7 +2,11 @@
 params {
     global {
         project_name = 'single_sample_CI'
-        tenx_folder = 'sample_data/outs/filtered_feature_bc_matrix/'
+    }
+    data {
+        tenx {
+            cellranger_outs_dir_path = 'sample_data/outs'
+        }
     }
     sc {
         file_annotator {

diff --git a/conf/test__single_sample_scenic.config b/conf/test__single_sample_scenic.config
@@ -2,7 +2,11 @@
 params {
     global {
         project_name = 'single_sample_scenic_CI'
-        tenx_folder = 'sample_data/outs/filtered_feature_bc_matrix/'
+    }
+    data {
+        tenx {
+            cellranger_outs_dir_path = 'sample_data/outs'
+        }
     }
     sc {
         file_annotator {

diff --git a/nextflow.config b/nextflow.config
@@ -3,7 +3,7 @@ manifest {
     name = 'aertslab/SingleCellTxBenchmark'
     description = 'A repository of pipelines for single-cell data in Nextflow DSL2'
     homePage = 'https://github.com/aertslab/SingleCellTxBenchmark'
-    version = '0.5.0'
+    version = '0.6.0'
     mainScript = 'main.nf'
     defaultBranch = 'master'
     nextflowVersion = '!19.10.0' // with ! prefix, stop execution if current version does not match required version.
@@ -14,7 +14,6 @@ params {
     global {
         project_name = '10x_PBMC'
         outdir = 'out'
-        tenx_folder = "data/10x/1k_pbmc/1k_pbmc_*/outs/"
         qsubaccount = ''
     }
 }
@@ -31,8 +30,6 @@ process {
     }
 }
 
-includeConfig 'src/utils/utils.config' // utilities config
-
 profiles {
 
     standard {
@@ -104,14 +101,19 @@ profiles {
         includeConfig 'src/star/star.config'
         includeConfig 'src/dropletutils/dropletutils.config'
     }
+    cellranger {
+        includeConfig 'src/cellranger/cellranger.config'
+    }
+
+    // data profiles
+    tenx {
+        includeConfig 'src/channels/conf/tenx.config'
+    }
     sra {
         includeConfig 'src/channels/conf/sra.config'
         includeConfig 'src/utils/conf/sra_metadata.config'
         includeConfig 'src/sratoolkit/sratoolkit.config'
     }
-    cellranger {
-        includeConfig 'src/cellranger/cellranger.config'
-    }
 
     // utility profiles
     utils_sample_annotate {
@@ -140,6 +142,11 @@ profiles {
 
 }
 
+includeConfig 'src/utils/conf/scope.config'
+includeConfig 'src/utils/utils.config' // utilities config
+// Load the sample-specific parameter helper (params.parseConfig) from the copy
+// shipped in this repository rather than from a remote feature-branch URL with
+// an embedded access token, which would stop resolving once the token expires.
+includeConfig 'conf/generic.config'
+
+
 timeline {
     enabled = true
     file = "${params.global.outdir}/nextflow_reports/execution_timeline.html"

diff --git a/setup.py b/setup.py
diff --git a/src/cellranger/main.nf b/src/cellranger/main.nf
@@ -1,12 +1,3 @@
-//
-// Version: 0.1.0
-// Test: passed
-// Command: 
-//  nextflow run src/singlecelltxbenchmark/pipelines/bec__bbknn -profile singularity --tenx_folder data/01.count/**/filtered_feature_bc_matrix --project_name tiny
-//
-/*
- */ 
-
 nextflow.preview.dsl=2
 
 // include groupParams from '../../utils/utils.nf'

diff --git a/src/cellranger/processes/count.nf b/src/cellranger/processes/count.nf
@@ -1,41 +1,45 @@
 nextflow.preview.dsl=2
 
+toolParams = params.sc.cellranger
+
 process SC__CELLRANGER__COUNT {
 
+	  // Runs `cellranger count` once per (sampleId, fastqs) tuple against the
+	  // given transcriptome and emits (sampleId, "<sampleId>/outs").
+	  // Optional command-line flags are only added when the corresponding key
+	  // exists in the parameters resolved for that sample via params.parseConfig.
-	label params.sc.cellranger.labels.processExecutor
-    cache 'deep'
-    container params.sc.cellranger.container
-    publishDir "${params.global.outdir}/counts", mode: 'link', overwrite: true
-    clusterOptions "-l nodes=1:ppn=${params.sc.cellranger.count.ppn} -l pmem=${params.sc.cellranger.count.pmem} -l walltime=24:00:00 -A ${params.global.qsubaccount}"
-	maxForks = params.sc.cellranger.count.maxForks
+	  label toolParams.labels.processExecutor
+	  cache 'deep'
+	  container toolParams.container
+	  publishDir "${params.global.outdir}/counts", mode: 'link', overwrite: true
+	  clusterOptions "-l nodes=1:ppn=${toolParams.count.ppn} -l pmem=${toolParams.count.pmem} -l walltime=24:00:00 -A ${params.global.qsubaccount}"
+	  maxForks = toolParams.count.maxForks
 
-  	input:
+    input:
 		file(transcriptome)
 		tuple val(sampleId), file(fastqs)
 
   	output:
     	tuple val(sampleId), file("${sampleId}/outs")
 
   	script:
+	  	// Resolve per-sample overrides of the `count` parameters for this sampleId.
+	  	def sampleParams = params.parseConfig(sampleId, params.global, toolParams.count)
+		  processParams = sampleParams.local
+		// NOTE: the config key keeps its existing spelling ('indicies') so current
+		// configs still work; the flag passed to cellranger is spelled `--indices`.
 		"""
 		cellranger count \
 			--id=${sampleId} \
 			--sample=${sampleId} \
 			--fastqs=${fastqs.join(",")} \
 			--transcriptome=${transcriptome} \
-			${(params.sc.cellranger.count.containsKey('libraries')) ? '--libraries ' + params.sc.cellranger.count.libraries: ''} \
-			${(params.sc.cellranger.count.containsKey('featureRef')) ? '--feature-ref ' + params.sc.cellranger.count.featureRef: ''} \
-			${(params.sc.cellranger.count.containsKey('expectCells')) ? '--expect-cells ' + params.sc.cellranger.count.expectCells: ''} \
-			${(params.sc.cellranger.count.containsKey('forceCells')) ? '--force-cells ' + params.sc.cellranger.count.forceCells: ''} \
-			${(params.sc.cellranger.count.containsKey('nosecondary')) ? '--nosecondary ' + params.sc.cellranger.count.nosecondary: ''} \
-			${(params.sc.cellranger.count.containsKey('noLibraries')) ? '--no-libraries ' + params.sc.cellranger.count.noLibraries: ''} \
-			${(params.sc.cellranger.count.containsKey('chemistry')) ? '--chemistry ' + params.sc.cellranger.count.chemistry: ''} \
-			${(params.sc.cellranger.count.containsKey('r1Length')) ? '--r1-length ' + params.sc.cellranger.count.r1Length: ''} \
-			${(params.sc.cellranger.count.containsKey('r2Length')) ? '--r2-length ' + params.sc.cellranger.count.r2Length: ''} \
-			${(params.sc.cellranger.count.containsKey('lanes')) ? '--lanes ' + params.sc.cellranger.count.lanes: ''} \
-			${(params.sc.cellranger.count.containsKey('localCores')) ? '--localcores ' + params.sc.cellranger.count.localCores: ''} \
-			${(params.sc.cellranger.count.containsKey('localMem')) ? '--localmem ' + params.sc.cellranger.count.localMem: ''} \
-			${(params.sc.cellranger.count.containsKey('indicies')) ? '--indicies ' + params.sc.cellranger.count.indicies: ''} 
+			${(processParams.containsKey('libraries')) ? '--libraries ' + processParams.libraries: ''} \
+			${(processParams.containsKey('featureRef')) ? '--feature-ref ' + processParams.featureRef: ''} \
+			${(processParams.containsKey('expectCells')) ? '--expect-cells ' + processParams.expectCells: ''} \
+			${(processParams.containsKey('forceCells')) ? '--force-cells ' + processParams.forceCells: ''} \
+			${(processParams.containsKey('nosecondary')) ? '--nosecondary ' + processParams.nosecondary: ''} \
+			${(processParams.containsKey('noLibraries')) ? '--no-libraries ' + processParams.noLibraries: ''} \
+			${(processParams.containsKey('chemistry')) ? '--chemistry ' + processParams.chemistry: ''} \
+			${(processParams.containsKey('r1Length')) ? '--r1-length ' + processParams.r1Length: ''} \
+			${(processParams.containsKey('r2Length')) ? '--r2-length ' + processParams.r2Length: ''} \
+			${(processParams.containsKey('lanes')) ? '--lanes ' + processParams.lanes: ''} \
+			${(processParams.containsKey('localCores')) ? '--localcores ' + processParams.localCores: ''} \
+			${(processParams.containsKey('localMem')) ? '--localmem ' + processParams.localMem: ''} \
+			${(processParams.containsKey('indicies')) ? '--indices ' + processParams.indicies: ''} 
 		"""
 
 }
diff --git a/src/cellranger/processes/mkfastq.nf b/src/cellranger/processes/mkfastq.nf
@@ -1,9 +1,11 @@
 nextflow.preview.dsl=2
 
+toolParams = params.sc.cellranger
+
 process SC__CELLRANGER__MKFASTQ {
 
 	publishDir "${params.global.outdir}/fastqs", saveAs: { filename -> dirname = filename =~ /(.*)_fastqOut/; "${dirname[0][1]}" }, mode: 'link', overwrite: true
-  	container params.sc.cellranger.container
+  	container toolParams.container
 
   	input:
 		file(csv)
@@ -17,18 +19,18 @@ process SC__CELLRANGER__MKFASTQ {
 		cellranger mkfastq \
 			--run=${runFolder} \
 			--csv=${csv} \
-			${(params.sc.cellranger.mkfastq.containsKey('runID')) ? '--id ' + params.sc.cellranger.mkfastq.runID: ''} \
-			${(params.sc.cellranger.mkfastq.containsKey('samplesheet')) ? '--samplesheet ' + params.sc.cellranger.mkfastq.samplesheet: ''} \
-			${(params.sc.cellranger.mkfastq.containsKey('ignoreDualIndex')) ? '--ignore-dual-index ' + params.sc.cellranger.mkfastq.ignoreDualIndex: ''} \
-			${(params.sc.cellranger.mkfastq.containsKey('qc')) ? '--qc ' + params.sc.cellranger.mkfastq.qc: ''} \
-			${(params.sc.cellranger.mkfastq.containsKey('lanes')) ? '--lanes ' + params.sc.cellranger.mkfastq.lanes: ''} \
-			${(params.sc.cellranger.mkfastq.containsKey('useBasesMask')) ? '--use-bases-mask ' + params.sc.cellranger.mkfastq.useBasesMask: ''} \
-			${(params.sc.cellranger.mkfastq.containsKey('deleteUndetermined')) ? '--delete-undetermined ' + params.sc.cellranger.mkfastq.deleteUndetermined: ''} \
-			${(params.sc.cellranger.mkfastq.containsKey('outputDir')) ? '--output-dir ' + params.sc.cellranger.mkfastq.outputDir: ''} \
-			${(params.sc.cellranger.mkfastq.containsKey('project')) ? '--project ' + params.sc.cellranger.mkfastq.project: ''} \
-			${(params.sc.cellranger.mkfastq.containsKey('jobMode')) ? '--jobmode ' + params.sc.cellranger.mkfastq.jobMode: ''} \
-			${(params.sc.cellranger.mkfastq.containsKey('localCores')) ? '--localcores ' + params.sc.cellranger.mkfastq.localCores: ''} \
-			${(params.sc.cellranger.mkfastq.containsKey('localMem')) ? '--localmem ' + params.sc.cellranger.mkfastq.localMem: ''}
+			${(toolParams.mkfastq.containsKey('runID')) ? '--id ' + toolParams.mkfastq.runID: ''} \
+			${(toolParams.mkfastq.containsKey('samplesheet')) ? '--samplesheet ' + toolParams.mkfastq.samplesheet: ''} \
+			${(toolParams.mkfastq.containsKey('ignoreDualIndex')) ? '--ignore-dual-index ' + toolParams.mkfastq.ignoreDualIndex: ''} \
+			${(toolParams.mkfastq.containsKey('qc')) ? '--qc ' + toolParams.mkfastq.qc: ''} \
+			${(toolParams.mkfastq.containsKey('lanes')) ? '--lanes ' + toolParams.mkfastq.lanes: ''} \
+			${(toolParams.mkfastq.containsKey('useBasesMask')) ? '--use-bases-mask ' + toolParams.mkfastq.useBasesMask: ''} \
+			${(toolParams.mkfastq.containsKey('deleteUndetermined')) ? '--delete-undetermined ' + toolParams.mkfastq.deleteUndetermined: ''} \
+			${(toolParams.mkfastq.containsKey('outputDir')) ? '--output-dir ' + toolParams.mkfastq.outputDir: ''} \
+			${(toolParams.mkfastq.containsKey('project')) ? '--project ' + toolParams.mkfastq.project: ''} \
+			${(toolParams.mkfastq.containsKey('jobMode')) ? '--jobmode ' + toolParams.mkfastq.jobMode: ''} \
+			${(toolParams.mkfastq.containsKey('localCores')) ? '--localcores ' + toolParams.mkfastq.localCores: ''} \
+			${(toolParams.mkfastq.containsKey('localMem')) ? '--localmem ' + toolParams.mkfastq.localMem: ''}
 		
 		for sample in \$(tail -n+2 ${csv} | cut -f2 -d','); do
 			ln -s ${(params.global.containsKey('outputDir')) ? params.global.outputDir + "*/\${sample}" : "*/outs/fastq_path/*/\${sample}"} \${sample}_fastqOut

diff --git a/src/cellranger/processes/utils.nf b/src/cellranger/processes/utils.nf
@@ -9,7 +9,7 @@ if(!params.containsKey("test")) {
 process SC__CELLRANGER__PREPARE_FOLDER {
 
     clusterOptions "-l nodes=1:ppn=2 -l pmem=30gb -l walltime=1:00:00 -A ${params.global.qsubaccount}"
-    // publishDir "${params.outdir}/data", mode: 'link', overwrite: true
+    publishDir "${params.outdir}/data/raw/cellranger_fastq_folders", mode: 'symlink', overwrite: true
 
     input:
         tuple val(sampleId), val(fastqs)

diff --git a/src/channels/conf/tenx.config b/src/channels/conf/tenx.config
@@ -0,0 +1,7 @@
+params { // settings pulled in by the `tenx` data profile of nextflow.config
+    data {
+        tenx {
+            cellranger_outs_dir_path = 'data/10x/1k_pbmc/1k_pbmc_*/outs/' // glob pattern; per the README this should point to the `outs/` folder(s) generated by CellRanger
+        }
+    }
+}
diff --git a/src/dropseqtools/processes/bam_tag_histogram.nf b/src/dropseqtools/processes/bam_tag_histogram.nf
@@ -13,7 +13,8 @@ process SC__DROP_SEQ_TOOLS__BAM_TAG_HISTOGRAM {
 	    tuple val(sample), path("*.cell_readcounts.txt.gz")
 
     script:
-        processParams = params.sc.dropseqtools.bam_tag_histogram
+        def sampleParams = params.parseConfig(sampleId, params.global, params.sc.dropseqtools.bam_tag_histogram)
+		processParams = sampleParams.local
         """
         BAMTagHistogram \
             I=${bam} \

diff --git a/src/dropseqtools/processes/detect_bead_synthesis_errors.nf b/src/dropseqtools/processes/detect_bead_synthesis_errors.nf
@@ -15,7 +15,8 @@ process SC__DROP_SEQ_TOOLS__DETECT_REPAIR_BARCODE_SYNTHESIS_ERRORS {
 		// tuple file("*.synthesis_stats.summary.txt"), emit: statsSummary
 
 	script:
-		processParams = params.sc.dropseqtools.detect_repair_barcode_synthesis_errors
+		def sampleParams = params.parseConfig(sampleId, params.global, params.sc.dropseqtools.detect_repair_barcode_synthesis_errors)
+		processParams = sampleParams.local
 		"""
 		DetectBeadSynthesisErrors \
 			I=${bam} \

diff --git a/src/dropseqtools/processes/filter_bam.nf b/src/dropseqtools/processes/filter_bam.nf
@@ -13,7 +13,8 @@ process SC__DROP_SEQ_TOOLS__FILTER_UNALIGNED_TAGGED_BAM {
         tuple val(sample), path('*.unaligned_tagged_filtered.bam'), emit: bam
 
     script:
-        processParams = params.sc.dropseqtools.filter_unaligned_tagged_bam
+        def sampleParams = params.parseConfig(sampleId, params.global, params.sc.dropseqtools.filter_unaligned_tagged_bam)
+		processParams = sampleParams.local
         """
         FilterBAM \
             TAG_REJECT=${processParams.tagReject} \

diff --git a/src/dropseqtools/processes/polya_trimmer.nf b/src/dropseqtools/processes/polya_trimmer.nf
@@ -14,7 +14,8 @@ process SC__DROP_SEQ_TOOLS__TRIM_POLYA_UNALIGNED_TAGGED_TRIMMED_SMART {
     tuple file('*.polyA_trimming_report.txt'), emit: report
 
     script:
-    processParams = params.sc.dropseqtools.trim_polya_unaligned_tagged_trimmed_smart
+    def sampleParams = params.parseConfig(sampleId, params.global, params.sc.dropseqtools.trim_polya_unaligned_tagged_trimmed_smart)
+		processParams = sampleParams.local
     """
     PolyATrimmer \
         INPUT=${bam} \

diff --git a/src/dropseqtools/processes/tag_bam_with_read_sequence_extended.nf b/src/dropseqtools/processes/tag_bam_with_read_sequence_extended.nf
@@ -13,7 +13,8 @@ process SC__DROP_SEQ_TOOLS__TAG_UNALIGNED_BAM_WITH_CELLBARCODE {
 		tuple file('*.unaligned_tagged_Cellular.bam_summary.txt'), emit: report
 
 	script:
-		processParams = params.sc.dropseqtools.tag_unaligned_bam_with_cellbarcode
+		def sampleParams = params.parseConfig(sampleId, params.global, params.sc.dropseqtools.tag_unaligned_bam_with_cellbarcode)
+		processParams = sampleParams.local
 		"""
 		TagBamWithReadSequenceExtended \
 			INPUT=${bam} \
@@ -42,7 +43,8 @@ process SC__DROP_SEQ_TOOLS__TAG_UNALIGNED_BAM_WITH_CELLMOLECULAR {
 		tuple file('*.unaligned_tagged_Molecular.bam_summary.txt'), emit: report
 
 	script:
-		processParams = params.sc.dropseqtools.tag_unaligned_bam_with_cellmolecular
+		def sampleParams = params.parseConfig(sampleId, params.global, params.sc.dropseqtools.tag_unaligned_bam_with_cellmolecular)
+		processParams = sampleParams.local
 		"""
 		source $DWMAX/documents/aertslab/scripts/src_dwmax/bash-utils/utils.sh
 		software load drop-seq_tools/1.12