Merge pull request #48 from bedapub/29-slurm-support
Added SLURM support
Amit-H authored Jan 12, 2024
2 parents d50eca9 + 57b8f50 commit 82df535
Showing 7 changed files with 167 additions and 34 deletions.
3 changes: 3 additions & 0 deletions splicekit/__init__.py
@@ -85,6 +85,9 @@ def junctions_master():
if splicekit.config.platform=="desktop":
splicekit.core.mprocess("jobs/count_junctions/process.sh")
splicekit.core.junctions.make_master()
if splicekit.config.platform=="SLURM":
os.system("jobs=( $(ls jobs/count_junctions/*.job) ); g=10; " "for((i=0; i < ${#jobs[@]}; i+=g)); do " "part=( \"${jobs[@]:i:g}\" ); " "for job_fname in ${part[*]}; do " "echo \"splicekit | features | junctions | submitted $job_fname\"; " "sbatch --mem=8G --parsable ${job_fname} & " "done; wait; " "echo \"splicekit | features | junctions | processing next 10\"; " "done; " "echo \"splicekit | features | junctions | processing complete\"")
os.system("sbatch --partition=short --mem=16G --output=/dev/null --error=/dev/null " "--wrap=\"python -c 'import splicekit; splicekit.core.junctions.make_master()'\"")

def junctions():
splicekit.core.annotation.make_comparisons()
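The shell one-liner above submits the junction-count jobs in batches; expanded for readability, it is roughly equivalent to the following script (a sketch; the group size of 10 and the sbatch flags mirror the one-liner). Note that sbatch returns as soon as a job is queued, so the wait synchronizes the submission calls, not the jobs themselves.

#!/bin/bash
# submit all junction-count job scripts in groups of 10
jobs=( $(ls jobs/count_junctions/*.job) )
g=10
for ((i = 0; i < ${#jobs[@]}; i += g)); do
    part=( "${jobs[@]:i:g}" )
    for job_fname in ${part[*]}; do
        echo "splicekit | features | junctions | submitted $job_fname"
        sbatch --mem=8G --parsable ${job_fname} &
    done
    wait
    echo "splicekit | features | junctions | processing next 10"
done
echo "splicekit | features | junctions | processing complete"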
30 changes: 28 additions & 2 deletions splicekit/core/anchors.py
@@ -70,7 +70,33 @@ def write_jobs_featureCounts(library_type='single-end', library_strand='NONE'):
jobs_dir = f'jobs/count_{anchor_type}_anchors'
logs_dir = f'logs/count_{anchor_type}_anchors'

if config.platform == 'SLURM':

job_anchors="""
#!/bin/bash
#SBATCH --job-name={anchor_type}_anchors_{sample_id} # Job name
#SBATCH --ntasks=12 # Number of tasks
#SBATCH --nodes=1 # All tasks on one node
#SBATCH --partition=short # Select queue
#SBATCH --output={logs_dir}/{anchor_type}_anchors_{sample_id}.out # Output file
#SBATCH --error={logs_dir}/{anchor_type}_anchors_{sample_id}.err # Error file
module load Subread/2.0.3-GCC-9.3.0
{container} featureCounts {library_type_insert}-s {library_strand_insert} -M -O -T 12 -F GTF -f -t anchor -g {anchor_type}_anchor_id -a {gtf_fname} -o {out_fname} {sam_fname}
# featureCounts writes its command line as the first line of the output file; drop that line and replace the header for further parsing
# next, we are only interested in the 1st and 7th columns (anchor_id and count)
cp {out_fname} {out_fname}_temp
# write the header line, overwriting out_fname as a new file
echo "{header_line}" >| {out_fname}
tail -n +3 {out_fname}_temp | cut -f1,7 >> {out_fname}
rm {out_fname}_temp
# move the summary from featureCounts to logs
mv {out_fname}.summary {logs_dir}/
gzip -f {out_fname}
"""
else:

job_anchors="""
#!/bin/bash
#BSUB -J {anchor_type}_anchors_{sample_id} # Job name
#BSUB -n 12 # number of tasks
@@ -92,7 +92,7 @@ def write_jobs_featureCounts(library_type='single-end', library_strand='NONE'):
# move the summary from featureCounts to logs
mv {out_fname}.summary {logs_dir}/
gzip -f {out_fname}
"""
"""

job_sh_anchors="""
{container} featureCounts {library_type_insert}-s {library_strand_insert} -M -O -T 12 -F GTF -f -t anchor -g {anchor_type}_anchor_id -a {gtf_fname} -o {out_fname} {sam_fname}
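Filled in, the SLURM job template above yields a script along these lines (a sketch; the sample id, anchor type, strand setting, and file paths are illustrative assumptions, and the container is left empty):

#!/bin/bash
#SBATCH --job-name=donor_anchors_sample1     # Job name (illustrative)
#SBATCH --ntasks=12                          # Number of tasks
#SBATCH --nodes=1                            # All tasks on one node
#SBATCH --partition=short                    # Select queue
#SBATCH --output=logs/count_donor_anchors/donor_anchors_sample1.out
#SBATCH --error=logs/count_donor_anchors/donor_anchors_sample1.err
module load Subread/2.0.3-GCC-9.3.0
featureCounts -s 0 -M -O -T 12 -F GTF -f -t anchor -g donor_anchor_id -a reference/donor_anchors.gtf -o data/counts/donor_anchors_sample1.tab bam/sample1.bam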
42 changes: 38 additions & 4 deletions splicekit/core/annotation.py
@@ -137,7 +137,23 @@ def make_comparisons():
annotation.samples = [str(el) for el in annotation.samples]

def write_comparisons():
if config.platform == 'SLURM':
job_rmats="""
#!/bin/bash
#SBATCH --job-name={job_name} # Job name
#SBATCH --ntasks=1 # Number of tasks
#SBATCH --mem=8G # Allocate memory
#SBATCH --nodes=1 # All tasks on one node
#SBATCH --partition=short # Select queue
#SBATCH --output=logs/rmats/{comparison_name}.out # Output file
#SBATCH --error=logs/rmats/{comparison_name}.err # Error file
{container} run_rmats --b1 results/rmats/{comparison_name}_test.tab --b2 results/rmats/{comparison_name}_control.tab --gtf {gtf_path} -t paired --readLength 150 --variable-read-length --allow-clipping --nthread 4 --od results/rmats/{comparison_name}_results --tmp results/rmats/{comparison_name}_temp
"""

else:

job_rmats="""
#!/bin/bash
#BSUB -J {job_name} # job name
#BSUB -n 1 # number of tasks
@@ -148,7 +164,7 @@ def write_comparisons():
#BSUB -e logs/rmats/{comparison_name}.err # error file
{container} run_rmats --b1 results/rmats/{comparison_name}_test.tab --b2 results/rmats/{comparison_name}_control.tab --gtf {gtf_path} -t paired --readLength 150 --variable-read-length --allow-clipping --nthread 4 --od results/rmats/{comparison_name}_results --tmp results/rmats/{comparison_name}_temp
"""
"""

comps_table = open("annotation/comparisons.tab", "wt")
header = ["comparison", "control_samples", "test_samples", "control_group_id", "test_group_id"]
@@ -186,7 +202,25 @@ def write_comparisons():
write_edgeR_jobs()

def write_edgeR_jobs():
if config.platform == 'SLURM':
job_edgeR="""
#!/bin/bash
#SBATCH --job-name={job_name} # Job name
#SBATCH --ntasks=1 # Number of tasks
#SBATCH --nodes=1 # All tasks on one node
#SBATCH --mem=8G # Allocate memory
#SBATCH --partition=short # Select queue
#SBATCH --output=logs/edgeR/{atype}/{comparison_name}.out # Output file
#SBATCH --error=logs/edgeR/{atype}/{comparison_name}.err # Error file
module load R
{container} R --no-save --args {input_folder} {atype} {control_name} {test_name} {comparison_name} {sample_membership} {filter_low} < {core_path}/comps_edgeR.R
"""


else:

job_edgeR="""
#!/bin/bash
#BSUB -J {job_name} # job name
#BSUB -n 1 # number of tasks
@@ -198,7 +232,7 @@ def write_edgeR_jobs():
ml R
{container} R --no-save --args {input_folder} {atype} {control_name} {test_name} {comparison_name} {sample_membership} {filter_low} < {core_path}/comps_edgeR.R
"""
"""

job_sh_edgeR="""{container} R --no-save --args {input_folder} {atype} {control_name} {test_name} {comparison_name} {sample_membership} {filter_low} < {core_path}/comps_edgeR.R"""
fsh_exons = open(f"jobs/edgeR/exons/process.sh", "wt")
28 changes: 27 additions & 1 deletion splicekit/core/exons.py
@@ -75,7 +75,33 @@ def write_jobs_featureCounts(library_type='single-end', library_strand='NONE'):
jobs_dir = f'jobs/count_exons'
logs_dir = f'logs/count_exons'

if config.platform == 'SLURM':
job_exons="""
#!/bin/bash
#SBATCH --job-name=count_exons_{sample_id} # Job name
#SBATCH --ntasks=12 # Number of tasks
#SBATCH --nodes=1 # All tasks on one node
#SBATCH --partition=short # Select queue
#SBATCH --output={logs_dir}/exons_{sample_id}.out # Output file
#SBATCH --error={logs_dir}/exons_{sample_id}.err # Error file
module load Subread/2.0.3-GCC-9.3.0
{container} featureCounts {library_type_insert}-s {library_strand_insert} -M -O -T 12 -F GTF -f -t exon -g exon_id -a {gtf_fname} -o {out_fname} {sam_fname}
# featureCounts writes its command line as the first line of the output file; drop that line and replace the header for further parsing
# next, we are only interested in the 1st and 7th columns (exon_id and count)
cp {out_fname} {out_fname}_temp
# write the header line, overwriting out_fname as a new file
echo "{header_line}" >| {out_fname}
tail -n +3 {out_fname}_temp | cut -f1,7 >> {out_fname}
rm {out_fname}_temp
# move the summary from featureCounts to logs
mv {out_fname}.summary {logs_dir}/
gzip -f {out_fname}
"""

else:
job_exons="""
#!/bin/bash
#BSUB -J count_exons_{sample_id} # Job name
#BSUB -n 12 # number of tasks
75 changes: 51 additions & 24 deletions splicekit/core/genes.py
@@ -75,30 +75,57 @@ def write_jobs_featureCounts(library_type='single-end', library_strand='NONE'):
jobs_dir = f"jobs/count_genes"
logs_dir = f"logs/count_genes"

if config.platform == 'SLURM':
job_genes="""
#!/bin/bash
#SBATCH --job-name=count_genes_{sample_id} # Job name
#SBATCH --ntasks=12 # Number of tasks
#SBATCH --nodes=1 # All tasks on one node
#SBATCH --partition=short # Select queue
#SBATCH --output={logs_dir}/genes_{sample_id}.out # Output file
#SBATCH --error={logs_dir}/genes_{sample_id}.err # Error file
module load Subread/2.0.3-GCC-9.3.0
{container} featureCounts {library_type_insert}-s {library_strand_insert} -M -O -T 12 -F GTF -f -t exon -g gene_id -a {gtf_fname} -o {out_fname} {sam_fname}
# featureCounts writes its command line as the first line of the output file; drop that line and replace the header for further parsing
# next, we are only interested in the 1st and 7th columns (gene_id and count)
cp {out_fname} {out_fname}_temp
# write the header line, overwriting out_fname as a new file
echo "{header_line}" >| {out_fname}
# sum exon counts into gene counts with awk
tail -n +3 {out_fname}_temp | cut -f1,7 | awk '{{sum[$1]+=$2}} END {{OFS="\\t"; for (i in sum) print i, sum[i]}}' | sort -n >> {out_fname}
rm {out_fname}_temp
# move the summary from featureCounts to logs
mv {out_fname}.summary {logs_dir}/
gzip -f {out_fname}
"""

else:
job_genes="""
#!/bin/bash
#BSUB -J count_genes_{sample_id} # Job name
#BSUB -n 12 # number of tasks
#BSUB -R "span[hosts=1]" # Allocate all tasks in 1 host
#BSUB -q short # Select queue
#BSUB -o {logs_dir}/genes_{sample_id}.out # Output file
#BSUB -e {logs_dir}/genes_{sample_id}.err # Error file
ml .testing
ml Subread/2.0.3-GCC-9.3.0
{container} featureCounts {library_type_insert}-s {library_strand_insert} -M -O -T 12 -F GTF -f -t exon -g gene_id -a {gtf_fname} -o {out_fname} {sam_fname}
# featureCounts writes its command line as the first line of the output file; drop that line and replace the header for further parsing
# next, we are only interested in the 1st and 7th columns (gene_id and count)
cp {out_fname} {out_fname}_temp
# write the header line, overwriting out_fname as a new file
echo "{header_line}" >| {out_fname}
# sum exon counts into gene counts with awk
tail -n +3 {out_fname}_temp | cut -f1,7 | awk '{{sum[$1]+=$2}} END {{OFS="\\t"; for (i in sum) print i, sum[i]}}' | sort -n >> {out_fname}
rm {out_fname}_temp
# move the summary from featureCounts to logs
mv {out_fname}.summary {logs_dir}/
gzip -f {out_fname}
"""

job_sh_genes="""
{container} featureCounts {library_type_insert}-s {library_strand_insert} -M -O -T 12 -F GTF -f -t exon -g gene_id -a {gtf_fname} -o {out_fname} {sam_fname}
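The awk call in the template above folds per-exon counts into per-gene totals; the braces are doubled there only to escape Python's str.format. A quick illustration with made-up IDs and counts:

printf 'g1\t10\ng1\t5\ng2\t7\n' \
  | awk '{sum[$1]+=$2} END {OFS="\t"; for (i in sum) print i, sum[i]}' \
  | sort -n
# output:
# g1	15
# g2	7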
21 changes: 19 additions & 2 deletions splicekit/core/jbrowse2.py
@@ -53,7 +53,24 @@ def write_sample_jobs(force_samples):
os.system("rm -r jobs/jobs_jbrowse/* >/dev/null 2>&1") # clean up previous jobs

# create bigwig and then cram files
if config.platform == 'SLURM':
job_bw="""
#!/bin/bash
#SBATCH --job-name={sample}_jbrowse # Job name
#SBATCH --ntasks=4 # Number of tasks
#SBATCH --nodes=1 # All tasks on one node
#SBATCH --partition=short # Select queue
#SBATCH --output=logs/logs_jbrowse/{sample}.out # Output file
#SBATCH --error=logs/logs_jbrowse/{sample}.err # Error file
{container} bamCoverage --ignoreDuplicates --binSize {bamCoverage_binSize} -b {bam_fname} -o {bigwig_fname} -of bigwig
{container} samtools view -C -T {genome_fa} {bam_fname} -O CRAM -o {cram_fname}
{container} samtools index {cram_fname}
"""

else:

job_bw="""
#!/bin/bash
#BSUB -J {sample}_jbrowse # job name
#BSUB -n 4 # number of tasks
@@ -65,7 +82,7 @@ def write_sample_jobs(force_samples):
{container} bamCoverage --ignoreDuplicates --binSize {bamCoverage_binSize} -b {bam_fname} -o {bigwig_fname} -of bigwig
{container} samtools view -C -T {genome_fa} {bam_fname} -O CRAM -o {cram_fname}
{container} samtools index {cram_fname}
"""
"""

job_sh_bw="""
{container} bamCoverage --ignoreDuplicates --binSize {bamCoverage_binSize} -b {bam_fname} -o {bigwig_fname} -of bigwig 2> logs/logs_jbrowse/{sample}.err
2 changes: 1 addition & 1 deletion splicekit/splicekit.config.template
@@ -28,7 +28,7 @@ protein = "K562.TARDBP.0"
protein_label = "tdp43" # used in file names and titles

# platform and container
platform = "desktop" # "desktop" or "cluster" or "SLURM" ("cluster" refers to HPC systems with the LSF queue manager; "SLURM" to HPC systems with the SLURM queue manager)

# leaving the container empty will use software dependencies installed on your system
container = ""
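To run splicekit on a SLURM cluster, the two settings above would look roughly like this (a minimal sketch; leaving container empty uses the software installed on the system):

platform = "SLURM"
container = ""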
