Merge pull request #47 from bedapub/edgeR2

Edge r2
bedapub · Jan 12, 2024 · d50eca9 · d50eca9
2 parents d828c81 + 125f2fa
commit d50eca9
Show file tree

Hide file tree

Showing 7 changed files with 30 additions and 16 deletions.
diff --git a/splicekit/clusterlogfc/__init__.py b/splicekit/clusterlogfc/__init__.py
@@ -19,6 +19,7 @@
 from matplotlib import pyplot as plt
 import matplotlib.font_manager
 import seaborn as sns
+import gzip
 
 def compute_distance(compound_name1, compound_name2, fname):
     if not os.path.exists(fname):

diff --git a/splicekit/core/annotation.py b/splicekit/core/annotation.py
@@ -125,7 +125,7 @@ def make_comparisons():
                 if sep_temp!="":
                     test_group_id = f"{test_sample}_{sep_temp}"
                     control_group_id = f"{control_sample}_{dmso_hash[tuple(temp_control)]}_{sep_temp}"
-                    comparison_name = f"{test_sample}_{control_sample}_{dmso_hash[tuple(temp_control)]}_{sep_temp}"
+                    comparison_name = f"{test_sample}_{control_sample}{dmso_hash[tuple(temp_control)]}_{sep_temp}"
                 else:
                     test_group_id = f"{test_sample}"
                     control_group_id = f"{control_sample}{dmso_hash[tuple(temp_control)]}"
@@ -180,7 +180,7 @@ def write_comparisons():
         job_rmats_instance = job_rmats.format(container=splicekit.config.container, core_path=os.path.dirname(core.__file__), comparison_name=comparison_name, job_name="rmats_"+comparison_name, gtf_path=splicekit.config.gtf_path[:-3])
         f_rmats.write(job_rmats_instance)
         f_rmats.close()
-        row = [comparison_name, ",".join(str(el) for el in test_ids), ",".join(str(el) for el in control_ids), control_group_id, test_group_id]
+        row = [comparison_name, ",".join(str(el) for el in control_ids), ",".join(str(el) for el in test_ids), control_group_id, test_group_id]
         comps_table.write("\t".join(row) + "\n") 
     comps_table.close()
     write_edgeR_jobs()
@@ -209,10 +209,12 @@ def write_edgeR_jobs():
     sample_membership = {}
     for (comparison_name, control_set, test_set, control_group_id, test_group_id) in annotation.comparisons:
         for (sample_id, _, _, _) in control_set:
-            assert(sample_membership.get(sample_id, None) in [None, control_group_id])
+            # in some rare cases, the same sample can be part of diverse control groups
+            #assert(sample_membership.get(sample_id, None) in [None, control_group_id])
             sample_membership[sample_id] = control_group_id
         for (sample_id, _, _, _) in test_set:
-            assert(sample_membership.get(sample_id, None) in [None, test_group_id])
+            # in some rare cases, the same sample can be part of diverse test groups
+            #assert(sample_membership.get(sample_id, None) in [None, test_group_id])
             sample_membership[sample_id] = test_group_id
     sample_membership = [sample_membership[sample_id] for sample_id in annotation.samples]
     for (comparison_name, control_set, test_set, control_group_id, test_group_id) in annotation.comparisons:

diff --git a/splicekit/core/features.py b/splicekit/core/features.py
@@ -48,7 +48,7 @@ def load_genes():
         if gene["gene_name"]=="":
             gene["gene_name"] = atts.get("gene", "")
             if gene["gene_name"]=="":
-                gene["gene_name"] = gene["gene_id"]
+                gene["gene_name"] = gene_id
         # because we have a feature called "genes" (and exons, donor_anchors, acceptor_anchors and junctions)
         # this stores gene counts
         gene_info = gene.get("genes", {})

diff --git a/splicekit/core/motifs.py b/splicekit/core/motifs.py
@@ -605,7 +605,7 @@ def make_distance():
     print(f"{module_name} make_distance | start")
     len_background, background = get_background()
     for feature_type in motif_criteria.keys():
-        treatment_seq = treatment_seq_bytype[f"{feature_type}_all"] # use all to do the clustering and dendrogram
+        treatment_seq = treatment_seq_bytype.get(f"{feature_type}_all", {}) # use all to do the clustering and dendrogram
         f = open(f"results/motifs/{feature_type}_donor_pattern_dm.tab", "wt")
         f.write("\t".join(["treatment1", "treatment2", "donnor_pattern_distance"]) + "\n")
         m = len(treatment_seq.keys()) # 179 for ps180
@@ -639,7 +639,7 @@ def cluster(cutoff=9):
     from matplotlib import pyplot as plt
     for feature_type in motif_criteria.keys():
         treatments = set()
-        for treatment, seqs in treatment_seq_bytype[f"{feature_type}_all"].items():
+        for treatment, seqs in treatment_seq_bytype.get(f"{feature_type}_all", {}).items():
             treatments.add(treatment)
 
         treatments = list(treatments)
@@ -656,6 +656,9 @@ def cluster(cutoff=9):
             r = f.readline()
         f.close()
 
+        if len(cmd)==0:
+            continue
+
         Z = linkage(cdm, 'ward')
         cutree = cut_tree(Z, n_clusters=cutoff) # cut dendrogram at point where we have args.cutoff clusters
         clusters = {}
@@ -734,9 +737,9 @@ def cluster(cutoff=9):
         f.close()
 
 def process():
-    #if config.scanRBP:
-    #    make_scanRBP()
-    #    plot_scanRBP()
+    if config.scanRBP:
+        make_scanRBP()
+        plot_scanRBP()
     make_logos()
     dreme()
     make_distance()

diff --git a/splicekit/core/promisc.py b/splicekit/core/promisc.py
@@ -2,6 +2,7 @@
 import sys
 import splicekit.core.annotation as annotation
 import splicekit.config as config
+import gzip
 
 def toint(temp):
     try:
@@ -16,10 +17,10 @@ def same_junction(data1, data2):
     return False
 
 def process():
-    annotation.read_comparisons()
+    annotation.make_comparisons()
     results = {}
     # populate result list
-    f = open("results/results_edgeR_junctions.tab", "rt")
+    f = gzip.open("results/edgeR/junctions_results_fdr005.tab.gz", "rt")
     header = f.readline().replace("\r", "").replace("\n", "").split("\t")
     r = f.readline()
     while r:
@@ -40,7 +41,7 @@ def process():
                 results[id][rdata["comparison"]+"_junction"] = 1
 
     reported_junction = {}
-    fout_junction = open("results/results_edgeR_junctions_promisc.tab", "wt")
+    fout_junction = gzip.open("results/junctions_promisc.tab.gz", "wt")
     header = ["gene_name", "gene_id", "chr", "strand", "feature_start", "feature_stop", "row_sum"]
     for (comparison, _, _, _, _) in annotation.comparisons:
         header.append(comparison)

diff --git a/splicekit/core/report.py b/splicekit/core/report.py
@@ -96,8 +96,14 @@ def edgeR_feature(feature_name, version=""):
                 assembly = {"homo_sapiens":"hg38", "mus_musculus":"mm39"}[config.species]
                 tracks_fixed = ["gene-refseq", "transcript-refseq", "transcript-ensembl"]
             tracks_control, tracks_test = comparisons[comparison] 
-            tracks_control = [track+'_bw' for track in tracks_control] # New JBrowse2 --> bigwig files are called by id_bw
-            tracks_test = [track+'_bw' for track in tracks_test] # New JBrowse2 --> bigwig files are called by id_bw
+
+            if config.jbrowse2_url.startswith("https://genomebrowser"):
+                tracks_control = [track.lstrip("0")+'_bw' for track in tracks_control]
+                tracks_test = [track.lstrip("0")+'_bw' for track in tracks_test]
+            else:
+                tracks_control = [track+'_bw' for track in tracks_control] # New JBrowse2 --> bigwig files are called by id_bw
+                tracks_test = [track+'_bw' for track in tracks_test] # New JBrowse2 --> bigwig files are called by id_bw
+
             tracks = ",".join(tracks_test[:3]+tracks_control[:3]+tracks_fixed)
             f_from=int(data["feature_start"])
             f_to=int(data["feature_stop"])

diff --git a/splicekit/splicecompare b/splicekit/splicecompare
@@ -5,6 +5,7 @@ import sys
 import argparse
 import seaborn as sns
 import matplotlib.pyplot as plt
+import gzip
 
 parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
 parser.add_argument('splicekit_run1', help="Main folder with splicekit run 1")
@@ -36,7 +37,7 @@ for feature_type in args.features.split(","):
     fout = open(f"{args.fname_out}_{feature_type}.tab", "wt")
     results = {}
     for splicekit_run, comparison, comparison_fname in [(args.splicekit_run1, args.comparison1, args.comparison1_fname), (args.splicekit_run2, args.comparison2, args.comparison2_fname)]:
-        f = open(os.path.join(splicekit_run, "results", f"results_edgeR_{feature_type}", f"{comparison_fname}"), "rt")
+        f = gzip.open(os.path.join(splicekit_run, "results", "edgeR", feature_type, f"{comparison_fname}.gz"), "rt")
         header = f.readline().replace("\r", "").replace("\n", "").split("\t")
         r = f.readline()
         while r: