diff --git a/splicekit/__init__.py b/splicekit/__init__.py index af54ef2..ec90e2d 100644 --- a/splicekit/__init__.py +++ b/splicekit/__init__.py @@ -85,6 +85,9 @@ def junctions_master(): if splicekit.config.platform=="desktop": splicekit.core.mprocess("jobs/count_junctions/process.sh") splicekit.core.junctions.make_master() + if splicekit.config.platform=="SLURM": + os.system("jobs=( $(ls jobs/count_junctions/*.job) ); g=10; " "for((i=0; i < ${#jobs[@]}; i+=g)); do " "part=( \"${jobs[@]:i:g}\" ); " "for job_fname in ${part[*]}; do " "echo \"splicekit | features | junctions | submitted $job_fname\"; " "sbatch --mem=8G --parsable ${job_fname} & " "done; wait; " "echo \"splicekit | features | junctions | processing next 10\"; " "done; " "echo \"splicekit | features | junctions | processing complete\"") + os.system("sbatch --partition=short --mem=16G --output=/dev/null --error=/dev/null " "--wrap=\"python -c 'import splicekit; splicekit.core.junctions.make_master()'\"") def junctions(): splicekit.core.annotation.make_comparisons() diff --git a/splicekit/core/anchors.py b/splicekit/core/anchors.py index c284ac7..471c9f5 100644 --- a/splicekit/core/anchors.py +++ b/splicekit/core/anchors.py @@ -70,7 +70,33 @@ def write_jobs_featureCounts(library_type='single-end', library_strand='NONE'): jobs_dir = f'jobs/count_{anchor_type}_anchors' logs_dir = f'logs/count_{anchor_type}_anchors' - job_anchors=""" + if config.platform == 'SLURM': + + job_anchors=""" + #!/bin/bash +#SBATCH --job-name={anchor_type}_anchors_{sample_id} # Job name +#SBATCH --ntasks=12 # Number of tasks +#SBATCH --nodes=1 # All tasks on one node +#SBATCH --partition=short # Select queue +#SBATCH --output={logs_dir}/{anchor_type}_anchors_{sample_id}.out # Output file +#SBATCH --error={logs_dir}/{anchor_type}_anchors_{sample_id}.err # Error file + +module load Subread/2.0.3-GCC-9.3.0 +{container} featureCounts {library_type_insert}-s {library_strand_insert} -M -O -T 12 -F GTF -f -t anchor -g 
{anchor_type}_anchor_id -a {gtf_fname} -o {out_fname} {sam_fname} +# featureCount outputs command as first line of file, get rid of this first line and replace header for further parsing +# next, we are only interested in the 1st and 7th column (anchor_id and count) +cp {out_fname} {out_fname}_temp +# make header line of file and overwrite out_fname as new file +echo "{header_line}" >| {out_fname} +tail -n +3 {out_fname}_temp| cut -f1,7 >> {out_fname} +rm {out_fname}_temp +# move summary from featureCount to logs +mv {out_fname}.summary {logs_dir}/ +gzip -f {out_fname} + """ + else: + + job_anchors=""" #!/bin/bash #BSUB -J {anchor_type}_anchors_{sample_id} # Job name #BSUB -n 12 # number of tasks @@ -92,7 +118,7 @@ def write_jobs_featureCounts(library_type='single-end', library_strand='NONE'): # move summary from featureCount to logs mv {out_fname}.summary {logs_dir}/ gzip -f {out_fname} - """ + """ job_sh_anchors=""" {container} featureCounts {library_type_insert}-s {library_strand_insert} -M -O -T 12 -F GTF -f -t anchor -g {anchor_type}_anchor_id -a {gtf_fname} -o {out_fname} {sam_fname} diff --git a/splicekit/core/annotation.py b/splicekit/core/annotation.py index 1949d80..6db355c 100644 --- a/splicekit/core/annotation.py +++ b/splicekit/core/annotation.py @@ -137,7 +137,23 @@ def make_comparisons(): annotation.samples = [str(el) for el in annotation.samples] def write_comparisons(): - job_rmats=""" + if config.platform == 'SLURM': + job_rmats=""" +#!/bin/bash +#SBATCH --job-name={job_name} # Job name +#SBATCH --ntasks=1 # Number of tasks +#SBATCH --mem=8G # Allocate memory +#SBATCH --nodes=1 # All tasks on one node +#SBATCH --partition=short # Select queue +#SBATCH --output=logs/rmats/{comparison_name}.out # Output file +#SBATCH --error=logs/rmats/{comparison_name}.err # Error file + +{container} run_rmats --b1 results/rmats/{comparison_name}_test.tab --b2 results/rmats/{comparison_name}_control.tab --gtf {gtf_path} -t paired --readLength 150 
--variable-read-length --allow-clipping --nthread 4 --od results/rmats/{comparison_name}_results --tmp results/rmats/{comparison_name}_temp + """ + + else: + + job_rmats=""" #!/bin/bash #BSUB -J {job_name} # job name #BSUB -n 1 # number of tasks @@ -148,7 +164,7 @@ def write_comparisons(): #BSUB -e logs/rmats/{comparison_name}.err # error file {container} run_rmats --b1 results/rmats/{comparison_name}_test.tab --b2 results/rmats/{comparison_name}_control.tab --gtf {gtf_path} -t paired --readLength 150 --variable-read-length --allow-clipping --nthread 4 --od results/rmats/{comparison_name}_results --tmp results/rmats/{comparison_name}_temp -""" + """ comps_table = open("annotation/comparisons.tab", "wt") header = ["comparison", "control_samples", "test_samples", "control_group_id", "test_group_id"] @@ -186,7 +202,25 @@ def write_comparisons(): write_edgeR_jobs() def write_edgeR_jobs(): - job_edgeR=""" + if config.platform == 'SLURM': + job_edgeR=""" +#!/bin/bash +#SBATCH --job-name={job_name} # Job name +#SBATCH --ntasks=1 # Number of tasks +#SBATCH --nodes=1 # All tasks on one node +#SBATCH --mem=8G # Allocate memory +#SBATCH --partition=short # Select queue +#SBATCH --output=logs/edgeR/{atype}/{comparison_name}.out # Output file +#SBATCH --error=logs/edgeR/{atype}/{comparison_name}.err # Error file + +module load R +{container} R --no-save --args {input_folder} {atype} {control_name} {test_name} {comparison_name} {sample_membership} {filter_low} < {core_path}/comps_edgeR.R + """ + + + else: + + job_edgeR=""" #!/bin/bash #BSUB -J {job_name} # job name #BSUB -n 1 # number of tasks @@ -198,7 +232,7 @@ def write_edgeR_jobs(): ml R {container} R --no-save --args {input_folder} {atype} {control_name} {test_name} {comparison_name} {sample_membership} {filter_low} < {core_path}/comps_edgeR.R -""" + """ job_sh_edgeR="""{container} R --no-save --args {input_folder} {atype} {control_name} {test_name} {comparison_name} {sample_membership} {filter_low} < 
{core_path}/comps_edgeR.R""" fsh_exons = open(f"jobs/edgeR/exons/process.sh", "wt") diff --git a/splicekit/core/exons.py b/splicekit/core/exons.py index 269ccee..66db090 100644 --- a/splicekit/core/exons.py +++ b/splicekit/core/exons.py @@ -75,7 +75,32 @@ def write_jobs_featureCounts(library_type='single-end', library_strand='NONE'): jobs_dir = f'jobs/count_exons' logs_dir = f'logs/count_exons' - job_exons=""" + if config.platform == 'SLURM': + job_exons=""" +#!/bin/bash +#SBATCH --job-name=count_exons_{sample_id} # Job name +#SBATCH --ntasks=12 # Number of tasks +#SBATCH --nodes=1 # All tasks on one node +#SBATCH --partition=short # Select queue +#SBATCH --output={logs_dir}/exons_{sample_id}.out # Output file +#SBATCH --error={logs_dir}/exons_{sample_id}.err # Error file + +module load Subread/2.0.3-GCC-9.3.0 +{container} featureCounts {library_type_insert}-s {library_strand_insert} -M -O -T 12 -F GTF -f -t exon -g exon_id -a {gtf_fname} -o {out_fname} {sam_fname} +# featureCount outputs command as first line of file, get rid of this first line and replace header for further parsing +# next, we are only interested in the 1st and 7th column (exon_id and count) +cp {out_fname} {out_fname}_temp +# make header line of file and overwrite out_fname as new file +echo "{header_line}" >| {out_fname} +tail -n +3 {out_fname}_temp| cut -f1,7 >> {out_fname} +rm {out_fname}_temp +# move summary from featureCount to logs +mv {out_fname}.summary {logs_dir}/ +gzip -f {out_fname} + """ + + else: + job_exons=""" #!/bin/bash #BSUB -J count_exons_{sample_id} # Job name #BSUB -n 12 # number of tasks diff --git a/splicekit/core/genes.py b/splicekit/core/genes.py index 143ad2a..de648a3 100644 --- a/splicekit/core/genes.py +++ b/splicekit/core/genes.py @@ -75,30 +75,57 @@ def write_jobs_featureCounts(library_type='single-end', library_strand='NONE'): jobs_dir = f"jobs/count_genes" logs_dir = f"logs/count_genes" - job_genes=""" - #!/bin/bash - #BSUB -J 
count_genes_{sample_id} # Job name - #BSUB -n 12 # number of tasks - #BSUB -R "span[hosts=1]" # Allocate all tasks in 1 host - #BSUB -q short # Select queue - #BSUB -o {logs_dir}/genes_{sample_id}.out # Output file - #BSUB -e {logs_dir}/genes_{sample_id}.err # Error file - - ml .testing - ml Subread/2.0.3-GCC-9.3.0 - {container} featureCounts {library_type_insert}-s {library_strand_insert} -M -O -T 12 -F GTF -f -t exon -g gene_id -a {gtf_fname} -o {out_fname} {sam_fname} - # featureCount outputs command as first line of file, get rid of this first line and replace header for further parsing - # next, we are only interested in the 1st and 7th column (gene_id and count) - cp {out_fname} {out_fname}_temp - # make header line of file and overwrite out_fname as new file - echo "{header_line}" >| {out_fname} - # sum exon counts to gene counts with awk - tail -n +3 {out_fname}_temp| cut -f1,7 | awk '{{sum[$1]+=$2}} END {{OFS="\\t"; for (i in sum) print i, sum[i]}}' | sort -n >> {out_fname} - rm {out_fname}_temp - # move summary from featureCount to logs - mv {out_fname}.summary {logs_dir}/ - gzip -f {out_fname} - """ + if config.platform == 'SLURM': + job_genes=""" +#!/bin/bash +#SBATCH --job-name=count_genes_{sample_id} # Job name +#SBATCH --ntasks=12 # Number of tasks +#SBATCH --nodes=1 # All tasks on one node +#SBATCH --partition=short # Select queue +#SBATCH --output={logs_dir}/genes_{sample_id}.out # Output file +#SBATCH --error={logs_dir}/genes_{sample_id}.err # Error file + + +module load Subread/2.0.3-GCC-9.3.0 +{container} featureCounts {library_type_insert}-s {library_strand_insert} -M -O -T 12 -F GTF -f -t exon -g gene_id -a {gtf_fname} -o {out_fname} {sam_fname} +# featureCount outputs command as first line of file, get rid of this first line and replace header for further parsing +# next, we are only interested in the 1st and 7th column (gene_id and count) +cp {out_fname} {out_fname}_temp +# make header line of file and overwrite out_fname as new file +echo 
"{header_line}" >| {out_fname} +# sum exon counts to gene counts with awk +tail -n +3 {out_fname}_temp| cut -f1,7 | awk '{{sum[$1]+=$2}} END {{OFS="\\t"; for (i in sum) print i, sum[i]}}' | sort -n >> {out_fname} +rm {out_fname}_temp +# move summary from featureCount to logs +mv {out_fname}.summary {logs_dir}/ +gzip -f {out_fname} + """ + + else: + job_genes=""" +#!/bin/bash +#BSUB -J count_genes_{sample_id} # Job name +#BSUB -n 12 # number of tasks +#BSUB -R "span[hosts=1]" # Allocate all tasks in 1 host +#BSUB -q short # Select queue +#BSUB -o {logs_dir}/genes_{sample_id}.out # Output file +#BSUB -e {logs_dir}/genes_{sample_id}.err # Error file + +ml .testing +ml Subread/2.0.3-GCC-9.3.0 +{container} featureCounts {library_type_insert}-s {library_strand_insert} -M -O -T 12 -F GTF -f -t exon -g gene_id -a {gtf_fname} -o {out_fname} {sam_fname} +# featureCount outputs command as first line of file, get rid of this first line and replace header for further parsing +# next, we are only interested in the 1st and 7th column (gene_id and count) +cp {out_fname} {out_fname}_temp +# make header line of file and overwrite out_fname as new file +echo "{header_line}" >| {out_fname} +# sum exon counts to gene counts with awk +tail -n +3 {out_fname}_temp| cut -f1,7 | awk '{{sum[$1]+=$2}} END {{OFS="\\t"; for (i in sum) print i, sum[i]}}' | sort -n >> {out_fname} +rm {out_fname}_temp +# move summary from featureCount to logs +mv {out_fname}.summary {logs_dir}/ +gzip -f {out_fname} + """ job_sh_genes=""" {container} featureCounts {library_type_insert}-s {library_strand_insert} -M -O -T 12 -F GTF -f -t exon -g gene_id -a {gtf_fname} -o {out_fname} {sam_fname} diff --git a/splicekit/core/jbrowse2.py b/splicekit/core/jbrowse2.py index 7aff1a8..5bdc183 100644 --- a/splicekit/core/jbrowse2.py +++ b/splicekit/core/jbrowse2.py @@ -53,7 +53,24 @@ def write_sample_jobs(force_samples): os.system("rm -r jobs/jobs_jbrowse/* >/dev/null 2>&1") # clean up previous jobs # create bigwig and then 
cram files - job_bw=""" + if config.platform == 'SLURM': + job_bw=""" +#!/bin/bash +#SBATCH --job-name={sample}_jbrowse # Job name +#SBATCH --ntasks=4 # Number of tasks +#SBATCH --nodes=1 # All tasks on one node +#SBATCH --partition=short # Select queue +#SBATCH --output=logs/logs_jbrowse/{sample}.out # Output file +#SBATCH --error=logs/logs_jbrowse/{sample}.err # Error file + +{container} bamCoverage --ignoreDuplicates --binSize {bamCoverage_binSize} -b {bam_fname} -o {bigwig_fname} -of bigwig +{container} samtools view -C -T {genome_fa} {bam_fname} -O CRAM -o {cram_fname} +{container} samtools index {cram_fname} + """ + + else: + + job_bw=""" #!/bin/bash #BSUB -J {sample}_jbrowse # job name #BSUB -n 4 # number of tasks @@ -65,7 +82,7 @@ def write_sample_jobs(force_samples): {container} bamCoverage --ignoreDuplicates --binSize {bamCoverage_binSize} -b {bam_fname} -o {bigwig_fname} -of bigwig {container} samtools view -C -T {genome_fa} {bam_fname} -O CRAM -o {cram_fname} {container} samtools index {cram_fname} - """ + """ job_sh_bw=""" {container} bamCoverage --ignoreDuplicates --binSize {bamCoverage_binSize} -b {bam_fname} -o {bigwig_fname} -of bigwig 2> logs/logs_jbrowse/{sample}.err diff --git a/splicekit/splicekit.config.template b/splicekit/splicekit.config.template index 85ea1c2..0bec406 100644 --- a/splicekit/splicekit.config.template +++ b/splicekit/splicekit.config.template @@ -28,7 +28,7 @@ protein = "K562.TARDBP.0" protein_label = "tdp43" # used in file names and titles # platform and container -platform = "desktop" # "desktop" or "cluster" (HPC with SLURM) +platform = "desktop" # "desktop" or "cluster" or "SLURM" (Cluster refers to HPC systems with LSF queue manager, SLURM refers to HPC systems with SLURM queue manager) # leaving the container empty will use software dependencies installed on your system container = ""