Skip to content

Commit

Permalink
Mostly code refactoring, some slight improvements, cache-prevention a…
Browse files Browse the repository at this point in the history
…nd other minor changes.
  • Loading branch information
grexor committed May 14, 2016
1 parent c8e8adb commit 396836c
Show file tree
Hide file tree
Showing 10 changed files with 213 additions and 71 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
old*
201506_splicing_tables_backup
*.pyc
comps
comps.paper
20 changes: 8 additions & 12 deletions README
Original file line number Diff line number Diff line change
@@ -1,21 +1,17 @@
1. Installation instructions for Ubuntu
RNAmotifs v2.0

* Highslide
**Dependencies**

http://highslide.com/download-confirm.php?file=download%2Fhighslide-4.1.13.zip
Highslide
http://highslide.com/download-confirm.php?file=download%2Fhighslide-4.1.13.zip

Download to "software/highslide" and make symbolic link to "comps" folder.

* twoBitToFa utility:

download from http://hgdownload.cse.ucsc.edu/admin/exe

* Python to R interface:
twoBitToFa
Download from http://hgdownload.cse.ucsc.edu/admin/exe

Python to R interface
apt-get install python-rpy2

* Python Fisher’s test software:

Python Fisher’s test
apt-get install python-setuptools
apt-get install python-dev
easy_install fisher
18 changes: 16 additions & 2 deletions __init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def worker():
q.join()

base_motif_fisher = assemble_results(comps, genome, region, cn)
if base_motif_fisher<=0.01:
if base_motif_fisher<=rnamotifs2.data.base_motif_thr:
continue_cluster(comps, genome, region, cn, pth, sf)
return base_motif_fisher

Expand Down Expand Up @@ -170,7 +170,21 @@ def assemble_results(comps, genome, region, cn):
f.write("\t".join(str(x) for x in row)+"\n")
f.close()

base_motif_fisher, _, _ = test_results[data[0][0]] # get fisher value of top (base) motif
# correct for fdr?
rnamotifs2.data.read_config(comps)
if rnamotifs2.data.use_FDR:
pybio.utils.FDR_tab(os.path.join(region_folder, "results%s.tab" % cn), "fisher")
# read the most significant motif fisher value back
data = []
f = open(os.path.join(region_folder, "results%s.tab" % cn))
header = f.readline()
r = f.readline().replace("\n", "").replace("\r", "").split("\t")
try: # the results file could be empty
base_motif_fisher = float(r[2]) # fisher column
except:
base_motif_fisher = 1 # there are no results
else:
base_motif_fisher, _, _ = test_results[data[0][0]] # get fisher value of top (base) motif
return base_motif_fisher

def fdr(pvalues, correction_type = "Benjamini-Hochberg"):
Expand Down
5 changes: 3 additions & 2 deletions bin/rnamotifs2.draw
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@ import random
import glob
import time
random.seed(42)

import argparse

nocache = random.randint(0, 1000000) # better would be a unique datetime number
parser = argparse.ArgumentParser()
parser.add_argument('-comps', action="store", dest="comps", default=None)
args = parser.parse_args()
Expand Down Expand Up @@ -147,7 +148,7 @@ for region in regions:
f.write("</td>")
for a in range(0, 4):
f.write("<td>")
image_filename = "%s_tree%s_area%s.png" % (region, index, a+1)
image_filename = "%s_tree%s_area%s.png?nocache=%s" % (region, index, a+1, nocache)
f.write("<a href=%s class='highslide' onclick='return hs.expand(this)'><img src=%s width=250></a>" % (image_filename, image_filename))
f.write("</td>")
f.write("</tr>")
Expand Down
9 changes: 4 additions & 5 deletions bin/rnamotifs2.draw_apa
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ import pickle
import random
import glob
random.seed(42)
import argparse

web_folder = None
web_folder = "/rnamotifs2"

import argparse
nocache = random.randint(0, 1000000)
parser = argparse.ArgumentParser()
parser.add_argument('-comps', action="store", dest="comps", default=None)
args = parser.parse_args()
Expand Down Expand Up @@ -108,7 +108,6 @@ for region, tree_list in configs.items():
area["e"] = [0] * 201
logs, loge = rnamotifs2.compute.es_apa(area["s"], area["e"], area["c"])
image_filename = "%s_tree%s_area%s" % (region, index, a+1)
print fisher
draw_sum = rnamotifs2.draw.area_apa("+".join(motif_list), logs[a], loge[a], os.path.join(comps_folder, "rnamap", image_filename), area=a, region=region, limy=max_es, fisher=fisher)

regions = configs.keys()
Expand Down Expand Up @@ -142,9 +141,9 @@ for region in regions:
for a in range(0, 1):
f.write("<td align=center>")
if web_folder==None:
image_filename = "%s_tree%s_area%s" % (region, index, a+1)
image_filename = "%s_tree%s_area%s.png?nocache=%s" % (region, index, a+1, nocache)
else:
image_filename_temp = "%s_tree%s_area%s" % (region, index, a+1)
image_filename_temp = "%s_tree%s_area%s.png?nocache=%s" % (region, index, a+1, nocache)
image_filename = "%s/%s/rnamap/%s" % (web_folder, args.comps, image_filename_temp)
if fisher!=None:
f.write("%s (<a href=%s target=_new>tree=%s</a>, h=%s)<br>" % ("+".join(motif_list), tree_file, index, h))
Expand Down
2 changes: 1 addition & 1 deletion bin/rnamotifs2.motif.cluster
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ else:

# get already filtered data
_, _, _, rfilter, _, _, _ = pickle.load(open(pickle_filename))

step = len(cmotif)
base_motif, base_h = read_base_motif(comps, region, cn)
print "base_motif=%s, motif_h=%s" % (base_motif, base_h)
Expand Down
51 changes: 28 additions & 23 deletions cluster/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pickle
from Queue import Queue
from threading import Thread
import pybio

max_steps = 4

Expand Down Expand Up @@ -68,20 +69,18 @@ def next_cluster(comps, genome, region, cn, pth=0.5, sf="r"):
motifs, cmotif, fisher, ig, raw_ig, p_emp, h = rnamotifs2.cluster.start_motifs(comps, region, cn)
else:
motifs, cmotif, fisher, ig, raw_ig, p_emp, h = rnamotifs2.cluster.next_motifs(comps, region, cn, step)
if fisher>0.001:
if fisher>rnamotifs2.data.cluster_stop_thr:
draw(comps, genome, region, cn, steps=step)
return

# stop tree construction

return # stop tree construction

num_worker_threads = 40
q = Queue()
q = Queue()
def worker():
while True:
task = q.get()
os.system(task)
q.task_done()
tasks = []
q.task_done()
tasks = []
for motif in motifs:
cluster = motif + cmotif
pickle_file = os.path.join(pickle_folder, "c%s.%s.filter.%s.pickle" % (cn, "_".join(sorted(motif)), "_".join(sorted(cmotif))))
Expand All @@ -92,16 +91,17 @@ def worker():
pickle_file = os.path.join(pickle_folder, "c%s.%s.pickle" % (cn, "_".join(sorted(cluster))))
if not os.path.exists(pickle_file):
command = "rnamotifs2.motif %s %s %s %s %s %s %s" % (comps, genome, region, "_".join(cluster), pth, 0, sf)
tasks.append(command)
tasks.append(command)
for i in range(num_worker_threads):
t = Thread(target=worker)
t.daemon = True
t.start()
t.start()
for task in tasks:
q.put(task)
q.put(task)
q.join()

assemble(comps, genome, region, cn, step)

draw(comps, genome, region, cn, steps=max_steps)

def assemble(comps, genome, region, cn, step):
Expand All @@ -113,7 +113,7 @@ def assemble(comps, genome, region, cn, step):
motifs, cmotif, fisher, ig, raw_ig, p_emp, h = rnamotifs2.cluster.start_motifs(comps, region, cn)
else:
motifs, cmotif, fisher, ig, raw_ig, p_emp, h = rnamotifs2.cluster.next_motifs(comps, region, cn, step)

area = {}
h = {}
test_results = {}
Expand Down Expand Up @@ -146,7 +146,7 @@ def assemble(comps, genome, region, cn, step):
row_id = ""
row = ["_".join(motif), "_".join(cmotif), h[k_string], fisher, ig, raw_ig, p_emp]
data.append(row)

data = sorted(data, key=operator.itemgetter(3))
f = open(os.path.join(region_folder, "c%s.temp%s.tab" % (cn, step)), "wt")
header = ["motif", "cmotif", "h", "fisher", "ig", "raw_ig", "p_emp"]
Expand All @@ -155,6 +155,11 @@ def assemble(comps, genome, region, cn, step):
f.write("\t".join(str(x) for x in row)+"\n")
f.close()

# correct for fdr?
rnamotifs2.data.read_config(comps)
if rnamotifs2.data.use_FDR:
pybio.utils.FDR_tab(os.path.join(region_folder, "c%s.temp%s.tab" % (cn, step)), "fisher")

def draw(comps, genome, region, cn, steps=4):
cluster_region = "t"
control_region = "c"
Expand Down Expand Up @@ -197,12 +202,12 @@ def draw(comps, genome, region, cn, steps=4):

removed_region = 0
removed_control = 0

if cmotif!=[]:
_, _, _, _, nums, rcounts, _ = pickle.load(open(os.path.join(pickle_folder, "c%s.%s.filter.%s.pickle" % (cn, "_".join(sorted(motif)), "_".join(sorted(cmotif))))))
else:
_, _, _, _, nums, rcounts, _ = pickle.load(open(os.path.join(pickle_folder, "c%s.%s.pickle" % (cn, "_".join(sorted(motif))))))

# find out where the previous tree was cut
# and get filtered exons
if cn>0:
Expand All @@ -223,29 +228,29 @@ def draw(comps, genome, region, cn, steps=4):
m1 = data["motif"].split("_")
m2 = data["cmotif"].split("_")
r = f.readline()
f.close()
f.close()

if m2!=['']:
pickle_filename = os.path.join(pickle_folder, "c%s.%s.filter.%s.pickle" % (cn-1, m1[0], "_".join(sorted(m2))))
else:
pickle_filename = os.path.join(pickle_folder, "c%s.%s.pickle" % (cn-1, m1[0]))
_, _, _, rfilter, _, _, _ = pickle.load(open(pickle_filename))
else:
rfilter = {}

if len(cmotif)>1:
_, _, _, rfilter, _, _, _ = pickle.load(open(os.path.join(pickle_folder, "c%s.%s.filter.%s.pickle" % (cn, cmotif[0], "_".join(sorted(cmotif[1:]))))))
elif len(cmotif)==1:
_, _, _, rfilter, _, _, _ = pickle.load(open(os.path.join(pickle_folder, "c%s.%s.pickle" % (cn, "_".join(cmotif)))))

for k in rfilter.keys():
if k.startswith("t"):
removed_region += 1
if k.startswith("c"):
removed_control += 1
row = [step, fisher, ig, raw_ig, h, removed_region, removed_control, rcounts[cluster_region], rcounts[control_region], nums[cluster_region]-rcounts[cluster_region], nums[control_region]-rcounts[control_region], "compute_specificity", "_".join(motif), "_".join(cmotif)]
data_all.append(row)

# add * to show where the list was cut
#if follow and fisher>0.01:
# if len(data_all)>1:
Expand All @@ -260,13 +265,13 @@ def draw(comps, genome, region, cn, steps=4):
row[11] = num_control*dif_reg / max(1, float(num_region*dif_con)) # specificity
fout.write("\t".join(str(x) for x in row)+"\n")
fout.close()

"""
motif = "_".join(selection[-1][-2].split("_") + selection[-1][-1].split("_"))
row = [cn, region, motif, selection[-1][0], selection[-1][4]]
f.write("\t".join(str(x) for x in row) + "\n")
f.close()
# write removed regions
motif = "_".join(sorted(motif.split("_")))
filename = os.path.join(pickle_folder, "c%s.removed.pickle" % cluster_number)
Expand Down
20 changes: 11 additions & 9 deletions compute/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def test(comps, genome, motif, rcounts, nums): # rcounts
num_control = nums.get("%s.%s" % (rt, "c"), 0)
val = pvalue(val_class, val_control, num_class-val_class, num_control-val_control).right_tail
results["%s.%s" % (rt, event_class)] = val

# information gain (g at the end)
i1 = (num_class/float(num_class+num_control))*math.log(num_class/float(num_class+num_control), 2)
i2 = (num_control/float(num_class+num_control))*math.log(num_control/float(num_class+num_control), 2)
Expand All @@ -62,11 +62,11 @@ def test(comps, genome, motif, rcounts, nums): # rcounts
else:
c2_num = 0
g = i - (c1_num + c2_num)

#print rt, event_class, val_class, num_class
#print rt, "c", val_control, num_control
#print

results["%s.%s.g" % (rt, event_class)] = g
for p in range(0, rnamotifs2.config.perms):
val_class = rcounts.get("%s.%s.p%s" % (rt, event_class, p), 0)
Expand All @@ -89,13 +89,15 @@ def rtest(comps, genome, motif, rcounts, nums): # rcounts
print "%s.%s.%s: fisher test on real and perm data" % (comps, genome, motif)
val_class = rcounts.get("t", 0)
val_control = rcounts.get("c", 0)
num_class = nums.get("t", 0)
num_control = nums.get("c", 0)
#num_class = nums.get("t.all", 0) # v_18
#num_control = nums.get("c.all", 0) # v_18
num_class = nums.get("t", 0) # v_17
num_control = nums.get("c", 0) # v_17
num_all = float(num_class+num_control)
val_all = float(val_class+val_control)

fisher = pvalue(val_class, val_control, num_class-val_class, num_control-val_control).right_tail

# information gain (g at the end)
i1 = (num_class/num_all)*math.log(num_class/num_all, 2)
i2 = (num_control/num_all)*math.log(num_control/num_all, 2)
Expand All @@ -113,7 +115,7 @@ def rtest(comps, genome, motif, rcounts, nums): # rcounts
else:
c2_num = 0
ig = i - (c1_num + c2_num)

p_emp = []
for p in range(0, rnamotifs2.config.perms):
val_class = rcounts.get("%s.p%s" % (event_class, p), 0)
Expand All @@ -123,4 +125,4 @@ def rtest(comps, genome, motif, rcounts, nums): # rcounts
val = pvalue(val_class, val_control, num_class-val_class, num_control-val_control).right_tail
p_emp.append(val)

return (fisher, p_emp, ig)
return (fisher, p_emp, ig)
10 changes: 10 additions & 0 deletions data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,16 @@ def read_config(comps):
continue
f.close()

# by default, do not correct for FDR
if getattr(m, "use_FDR", None)==None:
setattr(m, "use_FDR", False)

if getattr(m, "base_motif_thr", None)==None:
setattr(m, "base_motif_thr", 0.01)

if getattr(m, "cluster_stop_thr", None)==None:
setattr(m, "cluster_stop_thr", 0.001)

def read(comps):
read_config(comps)
if data_type=="apa":
Expand Down
Loading

0 comments on commit 396836c

Please sign in to comment.