Merge pull request #48 from bedapub/29-slurm-support
Added SLURM support
Amit-H authored Jan 12, 2024
2 parents d50eca9 + 57b8f50 commit 82df535
Showing 7 changed files with 167 additions and 34 deletions.
3 changes: 3 additions & 0 deletions splicekit/__init__.py
@@ -85,6 +85,9 @@ def junctions_master():
if splicekit.config.platform=="desktop":
splicekit.core.mprocess("jobs/count_junctions/process.sh")
splicekit.core.junctions.make_master()
if splicekit.config.platform=="SLURM":
os.system("jobs=( $(ls jobs/count_junctions/*.job) ); g=10; " "for((i=0; i < ${#jobs[@]}; i+=g)); do " "part=( \"${jobs[@]:i:g}\" ); " "for job_fname in ${part[*]}; do " "echo \"splicekit | features | junctions | submitted $job_fname\"; " "sbatch --mem=8G --parsable ${job_fname} & " "done; wait; " "echo \"splicekit | features | junctions | processing next 10\"; " "done; " "echo \"splicekit | features | junctions | processing complete\"")
os.system("sbatch --partition=short --mem=16G --output=/dev/null --error=/dev/null " "--wrap=\"python -c 'import splicekit; splicekit.core.junctions.make_master()'\"")

def junctions():
splicekit.core.annotation.make_comparisons()
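The shell one-liner above submits the junction-count jobs in batches; expanded for readability, it is roughly equivalent to the following script (a sketch; the group size of 10 and the sbatch flags mirror the one-liner). Note that sbatch returns as soon as a job is queued, so the wait synchronizes the submission calls, not the jobs themselves.

#!/bin/bash
# submit all junction-count job scripts in groups of 10
jobs=( $(ls jobs/count_junctions/*.job) )
g=10
for ((i = 0; i < ${#jobs[@]}; i += g)); do
    part=( "${jobs[@]:i:g}" )
    for job_fname in ${part[*]}; do
        echo "splicekit | features | junctions | submitted $job_fname"
        sbatch --mem=8G --parsable ${job_fname} &
    done
    wait
    echo "splicekit | features | junctions | processing next 10"
done
echo "splicekit | features | junctions | processing complete"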
30 changes: 28 additions & 2 deletions splicekit/core/anchors.py
@@ -70,7 +70,33 @@ def write_jobs_featureCounts(library_type='single-end', library_strand='NONE'):
jobs_dir = f'jobs/count_{anchor_type}_anchors'
logs_dir = f'logs/count_{anchor_type}_anchors'

if config.platform == 'SLURM':

job_anchors="""
#!/bin/bash
#SBATCH --job-name={anchor_type}_anchors_{sample_id} # Job name
#SBATCH --ntasks=12 # Number of tasks
#SBATCH --nodes=1 # All tasks on one node
#SBATCH --partition=short # Select queue
#SBATCH --output={logs_dir}/{anchor_type}_anchors_{sample_id}.out # Output file
#SBATCH --error={logs_dir}/{anchor_type}_anchors_{sample_id}.err # Error file
module load Subread/2.0.3-GCC-9.3.0
{container} featureCounts {library_type_insert}-s {library_strand_insert} -M -O -T 12 -F GTF -f -t anchor -g {anchor_type}_anchor_id -a {gtf_fname} -o {out_fname} {sam_fname}
# featureCounts writes its command line as the first line of the output file; drop that line and replace the header for further parsing
# next, we are only interested in the 1st and 7th columns (anchor_id and count)
cp {out_fname} {out_fname}_temp
# write the header line, overwriting out_fname as a new file
echo "{header_line}" >| {out_fname}
tail -n +3 {out_fname}_temp | cut -f1,7 >> {out_fname}
rm {out_fname}_temp
# move the summary from featureCounts to logs
mv {out_fname}.summary {logs_dir}/
gzip -f {out_fname}
"""
else:

job_anchors="""
#!/bin/bash
#BSUB -J {anchor_type}_anchors_{sample_id} # Job name
#BSUB -n 12 # number of tasks
@@ -92,7 +92,7 @@ def write_jobs_featureCounts(library_type='single-end', library_strand='NONE'):
# move the summary from featureCounts to logs
mv {out_fname}.summary {logs_dir}/
gzip -f {out_fname}
"""
"""

job_sh_anchors="""
{container} featureCounts {library_type_insert}-s {library_strand_insert} -M -O -T 12 -F GTF -f -t anchor -g {anchor_type}_anchor_id -a {gtf_fname} -o {out_fname} {sam_fname}
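Filled in, the SLURM job template above yields a script along these lines (a sketch; the sample id, anchor type, strand setting, and file paths are illustrative assumptions, and the container is left empty):

#!/bin/bash
#SBATCH --job-name=donor_anchors_sample1     # Job name (illustrative)
#SBATCH --ntasks=12                          # Number of tasks
#SBATCH --nodes=1                            # All tasks on one node
#SBATCH --partition=short                    # Select queue
#SBATCH --output=logs/count_donor_anchors/donor_anchors_sample1.out
#SBATCH --error=logs/count_donor_anchors/donor_anchors_sample1.err
module load Subread/2.0.3-GCC-9.3.0
featureCounts -s 0 -M -O -T 12 -F GTF -f -t anchor -g donor_anchor_id -a reference/donor_anchors.gtf -o data/counts/donor_anchors_sample1.tab bam/sample1.bam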
42 changes: 38 additions & 4 deletions splicekit/core/annotation.py
@@ -137,7 +137,23 @@ def make_comparisons():
annotation.samples = [str(el) for el in annotation.samples]

def write_comparisons():
if config.platform == 'SLURM':
job_rmats="""
#!/bin/bash
#SBATCH --job-name={job_name} # Job name
#SBATCH --ntasks=1 # Number of tasks
#SBATCH --mem=8G # Allocate memory
#SBATCH --nodes=1 # All tasks on one node
#SBATCH --partition=short # Select queue
#SBATCH --output=logs/rmats/{comparison_name}.out # Output file
#SBATCH --error=logs/rmats/{comparison_name}.err # Error file
{container} run_rmats --b1 results/rmats/{comparison_name}_test.tab --b2 results/rmats/{comparison_name}_control.tab --gtf {gtf_path} -t paired --readLength 150 --variable-read-length --allow-clipping --nthread 4 --od results/rmats/{comparison_name}_results --tmp results/rmats/{comparison_name}_temp
"""

else:

job_rmats="""
#!/bin/bash
#BSUB -J {job_name} # job name
#BSUB -n 1 # number of tasks
@@ -148,7 +164,7 @@ def write_comparisons():
#BSUB -e logs/rmats/{comparison_name}.err # error file
{container} run_rmats --b1 results/rmats/{comparison_name}_test.tab --b2 results/rmats/{comparison_name}_control.tab --gtf {gtf_path} -t paired --readLength 150 --variable-read-length --allow-clipping --nthread 4 --od results/rmats/{comparison_name}_results --tmp results/rmats/{comparison_name}_temp
"""
"""

comps_table = open("annotation/comparisons.tab", "wt")
header = ["comparison", "control_samples", "test_samples", "control_group_id", "test_group_id"]
@@ -186,7 +202,25 @@ def write_comparisons():
write_edgeR_jobs()

def write_edgeR_jobs():
if config.platform == 'SLURM':
job_edgeR="""
#!/bin/bash
#SBATCH --job-name={job_name} # Job name
#SBATCH --ntasks=1 # Number of tasks
#SBATCH --nodes=1 # All tasks on one node
#SBATCH --mem=8G # Allocate memory
#SBATCH --partition=short # Select queue
#SBATCH --output=logs/edgeR/{atype}/{comparison_name}.out # Output file
#SBATCH --error=logs/edgeR/{atype}/{comparison_name}.err # Error file
module load R
{container} R --no-save --args {input_folder} {atype} {control_name} {test_name} {comparison_name} {sample_membership} {filter_low} < {core_path}/comps_edgeR.R
"""


else:

job_edgeR="""
#!/bin/bash
#BSUB -J {job_name} # job name
#BSUB -n 1 # number of tasks
@@ -198,7 +232,7 @@ def write_edgeR_jobs():
ml R
{container} R --no-save --args {input_folder} {atype} {control_name} {test_name} {comparison_name} {sample_membership} {filter_low} < {core_path}/comps_edgeR.R
"""
"""

job_sh_edgeR="""{container} R --no-save --args {input_folder} {atype} {control_name} {test_name} {comparison_name} {sample_membership} {filter_low} < {core_path}/comps_edgeR.R"""
fsh_exons = open(f"jobs/edgeR/exons/process.sh", "wt")
28 changes: 27 additions & 1 deletion splicekit/core/exons.py
@@ -75,7 +75,33 @@ def write_jobs_featureCounts(library_type='single-end', library_strand='NONE'):
jobs_dir = f'jobs/count_exons'
logs_dir = f'logs/count_exons'

if config.platform == 'SLURM':
job_exons="""
#!/bin/bash
#SBATCH --job-name=count_exons_{sample_id} # Job name
#SBATCH --ntasks=12 # Number of tasks
#SBATCH --nodes=1 # All tasks on one node
#SBATCH --partition=short # Select queue
#SBATCH --output={logs_dir}/exons_{sample_id}.out # Output file
#SBATCH --error={logs_dir}/exons_{sample_id}.err # Error file
module load Subread/2.0.3-GCC-9.3.0
{container} featureCounts {library_type_insert}-s {library_strand_insert} -M -O -T 12 -F GTF -f -t exon -g exon_id -a {gtf_fname} -o {out_fname} {sam_fname}
# featureCounts writes its command line as the first line of the output file; drop that line and replace the header for further parsing
# next, we are only interested in the 1st and 7th columns (exon_id and count)
cp {out_fname} {out_fname}_temp
# write the header line, overwriting out_fname as a new file
echo "{header_line}" >| {out_fname}
tail -n +3 {out_fname}_temp | cut -f1,7 >> {out_fname}
rm {out_fname}_temp
# move the summary from featureCounts to logs
mv {out_fname}.summary {logs_dir}/
gzip -f {out_fname}
"""

else:
job_exons="""
#!/bin/bash
#BSUB -J count_exons_{sample_id} # Job name
#BSUB -n 12 # number of tasks
75 changes: 51 additions & 24 deletions splicekit/core/genes.py
@@ -75,30 +75,57 @@ def write_jobs_featureCounts(library_type='single-end', library_strand='NONE'):
jobs_dir = f"jobs/count_genes"
logs_dir = f"logs/count_genes"

if config.platform == 'SLURM':
job_genes="""
#!/bin/bash
#SBATCH --job-name=count_genes_{sample_id} # Job name
#SBATCH --ntasks=12 # Number of tasks
#SBATCH --nodes=1 # All tasks on one node
#SBATCH --partition=short # Select queue
#SBATCH --output={logs_dir}/genes_{sample_id}.out # Output file
#SBATCH --error={logs_dir}/genes_{sample_id}.err # Error file
module load Subread/2.0.3-GCC-9.3.0
{container} featureCounts {library_type_insert}-s {library_strand_insert} -M -O -T 12 -F GTF -f -t exon -g gene_id -a {gtf_fname} -o {out_fname} {sam_fname}
# featureCounts writes its command line as the first line of the output file; drop that line and replace the header for further parsing
# next, we are only interested in the 1st and 7th columns (gene_id and count)
cp {out_fname} {out_fname}_temp
# write the header line, overwriting out_fname as a new file
echo "{header_line}" >| {out_fname}
# sum exon counts into gene counts with awk
tail -n +3 {out_fname}_temp | cut -f1,7 | awk '{{sum[$1]+=$2}} END {{OFS="\\t"; for (i in sum) print i, sum[i]}}' | sort -n >> {out_fname}
rm {out_fname}_temp
# move the summary from featureCounts to logs
mv {out_fname}.summary {logs_dir}/
gzip -f {out_fname}
"""

else:
job_genes="""
#!/bin/bash
#BSUB -J count_genes_{sample_id} # Job name
#BSUB -n 12 # number of tasks
#BSUB -R "span[hosts=1]" # Allocate all tasks in 1 host
#BSUB -q short # Select queue
#BSUB -o {logs_dir}/genes_{sample_id}.out # Output file
#BSUB -e {logs_dir}/genes_{sample_id}.err # Error file
ml .testing
ml Subread/2.0.3-GCC-9.3.0
{container} featureCounts {library_type_insert}-s {library_strand_insert} -M -O -T 12 -F GTF -f -t exon -g gene_id -a {gtf_fname} -o {out_fname} {sam_fname}
# featureCounts writes its command line as the first line of the output file; drop that line and replace the header for further parsing
# next, we are only interested in the 1st and 7th columns (gene_id and count)
cp {out_fname} {out_fname}_temp
# write the header line, overwriting out_fname as a new file
echo "{header_line}" >| {out_fname}
# sum exon counts into gene counts with awk
tail -n +3 {out_fname}_temp | cut -f1,7 | awk '{{sum[$1]+=$2}} END {{OFS="\\t"; for (i in sum) print i, sum[i]}}' | sort -n >> {out_fname}
rm {out_fname}_temp
# move the summary from featureCounts to logs
mv {out_fname}.summary {logs_dir}/
gzip -f {out_fname}
"""

job_sh_genes="""
{container} featureCounts {library_type_insert}-s {library_strand_insert} -M -O -T 12 -F GTF -f -t exon -g gene_id -a {gtf_fname} -o {out_fname} {sam_fname}
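The awk call in the template above folds per-exon counts into per-gene totals; the braces are doubled there only to escape Python's str.format. A quick illustration with made-up IDs and counts:

printf 'g1\t10\ng1\t5\ng2\t7\n' \
  | awk '{sum[$1]+=$2} END {OFS="\t"; for (i in sum) print i, sum[i]}' \
  | sort -n
# output:
# g1	15
# g2	7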
21 changes: 19 additions & 2 deletions splicekit/core/jbrowse2.py
@@ -53,7 +53,24 @@ def write_sample_jobs(force_samples):
os.system("rm -r jobs/jobs_jbrowse/* >/dev/null 2>&1") # clean up previous jobs

# create bigwig and then cram files
if config.platform == 'SLURM':
job_bw="""
#!/bin/bash
#SBATCH --job-name={sample}_jbrowse # Job name
#SBATCH --ntasks=4 # Number of tasks
#SBATCH --nodes=1 # All tasks on one node
#SBATCH --partition=short # Select queue
#SBATCH --output=logs/logs_jbrowse/{sample}.out # Output file
#SBATCH --error=logs/logs_jbrowse/{sample}.err # Error file
{container} bamCoverage --ignoreDuplicates --binSize {bamCoverage_binSize} -b {bam_fname} -o {bigwig_fname} -of bigwig
{container} samtools view -C -T {genome_fa} {bam_fname} -O CRAM -o {cram_fname}
{container} samtools index {cram_fname}
"""

else:

job_bw="""
#!/bin/bash
#BSUB -J {sample}_jbrowse # job name
#BSUB -n 4 # number of tasks
@@ -65,7 +82,7 @@ def write_sample_jobs(force_samples):
{container} bamCoverage --ignoreDuplicates --binSize {bamCoverage_binSize} -b {bam_fname} -o {bigwig_fname} -of bigwig
{container} samtools view -C -T {genome_fa} {bam_fname} -O CRAM -o {cram_fname}
{container} samtools index {cram_fname}
"""
"""

job_sh_bw="""
{container} bamCoverage --ignoreDuplicates --binSize {bamCoverage_binSize} -b {bam_fname} -o {bigwig_fname} -of bigwig 2> logs/logs_jbrowse/{sample}.err
2 changes: 1 addition & 1 deletion splicekit/splicekit.config.template
@@ -28,7 +28,7 @@ protein = "K562.TARDBP.0"
protein_label = "tdp43" # used in file names and titles

# platform and container
platform = "desktop" # "desktop" or "cluster" or "SLURM" ("cluster" refers to HPC systems with the LSF queue manager; "SLURM" to HPC systems with the SLURM queue manager)

# leaving the container empty will use software dependencies installed on your system
container = ""
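To run splicekit on a SLURM cluster, the two settings above would look roughly like this (a minimal sketch; leaving container empty uses the software installed on the system):

platform = "SLURM"
container = ""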
