Skip to content

Commit

Permalink
commit
Browse files Browse the repository at this point in the history
  • Loading branch information
albertwcheng committed Sep 6, 2011
1 parent 6430b5d commit 3ee1ba1
Show file tree
Hide file tree
Showing 4 changed files with 343 additions and 0 deletions.
153 changes: 153 additions & 0 deletions findUpstreamORFsFromBedSeqFile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
#!/usr/bin/env python

from bedFunc import *
from ORFFinder import findORFInThreeFrames
from optparse import OptionParser
from albertcommon import *
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

def findUORFsFromBedSeqFields(fields,options):

uORFS=[] #gStart0,gEnd1,tStart0,tEnd1,seq

try:
chrom,chromStartG0,chromEndG1,name,score,strand,thickStartG0,thickEndG1,itemRGB,blockCount,blockSizes,blockStarts0,transcriptSeq=fields
chromStartG0=int(chromStartG0)
chromEndG1=int(chromEndG1)
thickStartG0=int(thickStartG0)
thickEndG1=int(thickEndG1)
blockCount=int(blockCount)
blockSizeSplits=blockSizes.split(",")
blockSizes=[]
for s in blockSizeSplits:
s=s.strip()
if len(s)>0:
blockSizes.append(int(s))
blockStarts0Splits=blockStarts0.split(",")
blockStarts0=[]
for s in blockStarts0Splits:
s=s.strip()
if len(s)>0:
blockStarts0.append(int(s))

if len(blockStarts0)!=blockCount or len(blockSizes)!=blockCount:
print >> stderr,"block count not consistent with blockStarts or blockSizes data"
raise ValueError
except:
print >> stderr,"format error",fields
raise ValueError

#check if no main orf, then do nothing
if thickStartG0<0 or thickEndG1<0 or thickStartG0>=thickEndG1:
#no main orf
return name,chrom,chromStartG0,strand,uORFS


if strand=="+":
#forward strand
mainORFStartTCoord=genomicCoordToTranscriptCoordBedFormatted(chromStartG0,blockStarts0,blockSizes,thickStartG0)
elif strand=="-":
mainORFStartTCoord=genomicCoordToTranscriptCoordBedFormatted(chromStartG0,blockStarts0,blockSizes,thickEndG1-1,True)
else:
print >> stderr,"invalid strand",fields
raise ValueError



ranges=findORFInThreeFrames(transcriptSeq,False,options.minORFLength,options.maxORFLength) #findLongestORF=False #ranges=(start0,end1) in nt., not including Stop codon

if ranges==None:
#nothing found. return
return name,chrom,chromStartG0,strand,uORFS

for rang in ranges:
realrangeIncStop=(rang[0],rang[1]+3)
#now need the ORF before mainORFSTart
#this is sorted. so if >=mainORFSTart, break
if realrangeIncStop[0]>=mainORFStartTCoord:
break

distToCDS=mainORFStartTCoord-realrangeIncStop[0]

if realrangeIncStop[1]>mainORFStartTCoord: #stop after main orf start, check phase needs to be different (i.e., out of frame)
if distToCDS % 3 == 0: #in frame, skip
continue

#now the remaining are the OK Uorfs~
#HERE
uORFSeq=transcriptSeq[realrangeIncStop[0]:realrangeIncStop[1]]
if realrangeIncStop[0]-3>=0:
minus3Base=transcriptSeq[realrangeIncStop[0]-3]
else:
minus3Base="NA"

plus4Base=transcriptSeq[realrangeIncStop[0]+4-1]

if strand=="+":
uORFgStart0=transcriptCoordToGenomicCoordBedFormatted(chromStartG0,blockStarts0,blockSizes,realrangeIncStop[0])
uORFgEnd1=transcriptCoordToGenomicCoordBedFormatted(chromStartG0,blockStarts0,blockSizes,realrangeIncStop[1]-1)+1
else:
uORFgStart0=transcriptCoordToGenomicCoordBedFormatted(chromStartG0,blockStarts0,blockSizes,realrangeIncStop[1]-1,True)
uORFgEnd1=transcriptCoordToGenomicCoordBedFormatted(chromStartG0,blockStarts0,blockSizes,realrangeIncStop[0],True)+1



clipBlockStarts,clipBlockSizes=clipBlocksByGenomicBound(chromStartG0,blockStarts0,blockSizes,uORFgStart0,uORFgEnd1,True)
#transform clipblock starts by referencing first block from 0 Last option=True



uORFS.append((uORFgStart0,uORFgEnd1,realrangeIncStop[0],realrangeIncStop[1],clipBlockStarts,clipBlockSizes,uORFSeq,distToCDS,minus3Base,plus4Base))
#what should we do here?!?!?

return name,chrom,chromStartG0,strand,uORFS

def printUsageAndExit(parser):
parser.print_help()
exit(1)

if __name__=='__main__':

parser=OptionParser(usage="Usage: %prog inbed ogbed oinfo")
parser.add_option("--minORFLength",dest="minORFLength",default=-1,type="int",help="set min ORF length for uORFs")
parser.add_option("--maxORFLength",dest="maxORFLength",default=-1,type="int",help="set max ORF length for uORFs")
parser.add_option("--trackName",dest="trackName",default="uORFs",help="set track name for output bed file")
parser.add_option("--trackDescription",dest="trackDescription",default="upstream open reading frames",help="set track description for output bed file")
parser.add_option("--trackVisilibity",dest="trackVisibility",default="full",help="set track visibility")

(options,args)=parser.parse_args()

try:
filename,ogbedfn,oinfofn=args
except:
printUsageAndExit(parser)

ogbed=open(ogbedfn,"w")
oinfo=open(oinfofn,"w")

print >> ogbed,"track name='"+options.trackName+"' description='"+options.trackDescription+"' visibility='"+options.trackVisibility+"'"
print >> oinfo,"uORFName\tuORFSeq\tuORFTranslation\tuORFLengthNt\tDistanceToCDS\tminus3Base\tplus4Base"

fil=open(filename)
for lin in fil:
lin=lin.rstrip("\r\n")
if len(lin)<1 or lin[0]=="#":
continue

fields=lin.split("\t")
name,chrom,geneChromStart0,strand,uORFS=findUORFsFromBedSeqFields(fields,options)
for i in range(0,len(uORFS)):
uORFgStart0,uORFgEnd1,tStart0,tEnd1,clipBlockStarts,clipBlockSizes,uORFSeq,distToCDS,minus3Base,plus4Base=uORFS[i]
clipBlockStartStr=",".join(toStrList(clipBlockStarts))
clipBlockSizeStr=",".join(toStrList(clipBlockSizes))
outputv=toStrList([chrom,uORFgStart0,uORFgEnd1,name+".uORF"+str(i+1),0,strand,uORFgStart0,uORFgEnd1,"0,0,0",len(clipBlockStarts),clipBlockSizeStr,clipBlockStartStr])
print >> ogbed,"\t".join(outputv)
coding_dna=Seq(uORFSeq,generic_dna)
protein_seq=coding_dna.translate(to_stop=True)
print >> oinfo,"\t".join(toStrList([name+".uORF"+str(i+1),uORFSeq,protein_seq,len(uORFSeq),distToCDS,minus3Base,plus4Base]))


fil.close()
ogbed.close()
oinfo.close()
56 changes: 56 additions & 0 deletions joinMatrix.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/bash

if [ $# -lt 3 ]; then
echo "$0 inMatrix outMatrix rowJoiner [ colJoiner [ rowJoinOpts [ colJoinOpts ] ] ]"
echo "join opts: (from joinu.py)"
joinu.py #no args to trigger help message
exit
fi

rowJoinOpts=""
colJoinOpts=""

inMatrix=$1
rowJoiner=$3
outMatrix=$2

colJoiner=$rowJoiner

if [ $# -ge 4 ]; then
colJoiner=$4
if [ $# -ge 5 ]; then
rowJoinOpts=$5
if [ $# -ge 6 ]; then
colJoinOpts=$6
fi
fi

fi

#now the real stuff

tmp1=`tempfile` #get temp file tmp1
#first rowjoin
cmd="joinu.py $rowJoinOpts $rowJoiner $inMatrix > $tmp1"
eval $cmd

#cat $tmp1

tmp2=`tempfile`
matrixTranspose.py $tmp1 > $tmp2

#cat $tmp2

tmp3=`tempfile`
cmd="joinu.py $colJoinOpts $colJoiner $tmp2 > $tmp3"
eval $cmd

#cat $tmp3

#now transpose back
matrixTranspose.py $tmp3 > $outMatrix

#now remove temp file (is this neccessary?)
rmrie.sh $tmp1
rmrie.sh $tmp2
rmrie.sh $tmp3
14 changes: 14 additions & 0 deletions marryFeatureNumForAgilentArray.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash

if [ $# -lt 3 ]; then
echo $0 rawFileAsTemplate expMatrix outputFileWithFeatureNum
exit
fi

rawFileAsTemplate=$1
expMatrix=$2
outputFileWithFeatureNum=$3

awk '(FNR>=10){printf("%s\t%s\t%s\n",$2,$3,$4);}' $rawFileAsTemplate > marryFeatureNum.map.00
#now join
joinu.py -1.Row,.Col -2.Row,.Col marryFeatureNum.map.00 $expMatrix > $outputFileWithFeatureNum
120 changes: 120 additions & 0 deletions performOpsOnTwoMatrices.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#!/usr/bin/env python

from optparse import OptionParser
from sys import *
from math import *
from numpy import *

def printUsageAndExit(parser):
parser.print_help()
exit(1)


f=float
i=int

def f1(t=str):
return t(M1[ROWNUM0][COLNUM0])

def f2(t=str):
return t(M2[ROWNUM0][COLNUM0])

def f1f():
return float(f1())

def f2f():
return float(f2())

def f1i():
return int(f1())

def f2i():
return int(f2())

def loadMatrix(filename,options):
M=[]
colnames=[]
rownames=[]
topleftcorner=""
fil=open(filename)
lino=0
for lin in fil:
lino+=1
lin=lin.rstrip("\r\n")
fields=lin.split(options.IFS)
if lino==1:
topleftcorner=fields[0]
colnames=fields[1:]
else:
rownames.append(fields[0])
M.append(fields[1:])



fil.close()
return topleftcorner,colnames,rownames,M

def dim(M):
numRows=len(M)
if numRows>=1:
numCols=len(M[0])
else:
numCols=0

return (numRows,numCols)

if __name__=='__main__':
parser=OptionParser("usage: %prog [options] infile1 infile2 cmd > outfil\nf1(int) or f1(i) the content of file1 as integer\nf1(f) or f1(float) the content of file1 as float\nfor example: f1()+f2() concatenates the string from file1 and file2 at each cell\nf1(float)/f2(float) divides f1 as a float by f2\n'T' if f1(float)>f2(float) else 'F' writes T if f1 > f2 as floats else it writes F")
parser.add_option("--ifs",dest="IFS",default="\t",help="set input field separator [tab]")
parser.add_option("--ofs",dest="OFS",default="\t",help="set output field separator [tab]")
parser.add_option("--invalid-value",dest="invalidValue",default=None,help="set the invalid value to print [None: quit program]")

(options,args)=parser.parse_args()

try:
INFILE1,INFILE2,cmd=args
except:
printUsageAndExit(parser)

#now load file into mem


topleftcorner1,colnames1,rownames1,M1=loadMatrix(INFILE1,options)
topleftcorner2,colnames2,rownames2,M2=loadMatrix(INFILE2,options)

numRows1,numCols1=dim(M1)

if dim(M1)!=dim(M2):
print >> stderr,"file1:",INFILE1,"dim(r,c)=",dim(M1),"!=",INFILE2,"dim=",dim(M2)
exit(1)

if tuple(colnames1)!=tuple(colnames2):
print >> stderr,"warning: file1 and file2 column names do not equal file1=",colnames1,"file2=",colnames2

if tuple(rownames1)!=tuple(rownames2):
print >> stderr,"warning: file1 and file2 row names do not equal file1=",rownames1,"file2=",colnames2


print >> stdout,options.OFS.join([topleftcorner1]+colnames1)
#now contruct and print matrix
for ROWNUM0 in range(0,numRows1):
ROWNUM=ROWNUM0+1
ROWNAME=rownames1[ROWNUM0]
#print name
outputv=[ROWNAME]
for COLNUM0 in range(0,numCols1):
COLNUM=COLNUM0+1
COLNAME=colnames1[COLNUM0]
try:
valueToPrint=str(eval(cmd))
outputv.append(valueToPrint)
except:
if options.invalidValue==None:
print >> stderr,"error running",cmd,"at ROWNUM,ROWNAME=",ROWNUM,ROWNAME,"COLNUM,COLNAME=",COLNUM,COLNAME
exit(1)
else:
outputv.append(options.invalidValue)
print >> stdout,options.OFS.join(outputv)



0 comments on commit 3ee1ba1

Please sign in to comment.