forked from albertwcheng/albert-bioinformatics-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
6430b5d
commit 3ee1ba1
Showing
4 changed files
with
343 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
#!/usr/bin/env python | ||
|
||
from bedFunc import * | ||
from ORFFinder import findORFInThreeFrames | ||
from optparse import OptionParser | ||
from albertcommon import * | ||
from Bio.Seq import Seq | ||
from Bio.Alphabet import generic_dna | ||
|
||
def findUORFsFromBedSeqFields(fields,options): | ||
|
||
uORFS=[] #gStart0,gEnd1,tStart0,tEnd1,seq | ||
|
||
try: | ||
chrom,chromStartG0,chromEndG1,name,score,strand,thickStartG0,thickEndG1,itemRGB,blockCount,blockSizes,blockStarts0,transcriptSeq=fields | ||
chromStartG0=int(chromStartG0) | ||
chromEndG1=int(chromEndG1) | ||
thickStartG0=int(thickStartG0) | ||
thickEndG1=int(thickEndG1) | ||
blockCount=int(blockCount) | ||
blockSizeSplits=blockSizes.split(",") | ||
blockSizes=[] | ||
for s in blockSizeSplits: | ||
s=s.strip() | ||
if len(s)>0: | ||
blockSizes.append(int(s)) | ||
blockStarts0Splits=blockStarts0.split(",") | ||
blockStarts0=[] | ||
for s in blockStarts0Splits: | ||
s=s.strip() | ||
if len(s)>0: | ||
blockStarts0.append(int(s)) | ||
|
||
if len(blockStarts0)!=blockCount or len(blockSizes)!=blockCount: | ||
print >> stderr,"block count not consistent with blockStarts or blockSizes data" | ||
raise ValueError | ||
except: | ||
print >> stderr,"format error",fields | ||
raise ValueError | ||
|
||
#check if no main orf, then do nothing | ||
if thickStartG0<0 or thickEndG1<0 or thickStartG0>=thickEndG1: | ||
#no main orf | ||
return name,chrom,chromStartG0,strand,uORFS | ||
|
||
|
||
if strand=="+": | ||
#forward strand | ||
mainORFStartTCoord=genomicCoordToTranscriptCoordBedFormatted(chromStartG0,blockStarts0,blockSizes,thickStartG0) | ||
elif strand=="-": | ||
mainORFStartTCoord=genomicCoordToTranscriptCoordBedFormatted(chromStartG0,blockStarts0,blockSizes,thickEndG1-1,True) | ||
else: | ||
print >> stderr,"invalid strand",fields | ||
raise ValueError | ||
|
||
|
||
|
||
ranges=findORFInThreeFrames(transcriptSeq,False,options.minORFLength,options.maxORFLength) #findLongestORF=False #ranges=(start0,end1) in nt., not including Stop codon | ||
|
||
if ranges==None: | ||
#nothing found. return | ||
return name,chrom,chromStartG0,strand,uORFS | ||
|
||
for rang in ranges: | ||
realrangeIncStop=(rang[0],rang[1]+3) | ||
#now need the ORF before mainORFSTart | ||
#this is sorted. so if >=mainORFSTart, break | ||
if realrangeIncStop[0]>=mainORFStartTCoord: | ||
break | ||
|
||
distToCDS=mainORFStartTCoord-realrangeIncStop[0] | ||
|
||
if realrangeIncStop[1]>mainORFStartTCoord: #stop after main orf start, check phase needs to be different (i.e., out of frame) | ||
if distToCDS % 3 == 0: #in frame, skip | ||
continue | ||
|
||
#now the remaining are the OK Uorfs~ | ||
#HERE | ||
uORFSeq=transcriptSeq[realrangeIncStop[0]:realrangeIncStop[1]] | ||
if realrangeIncStop[0]-3>=0: | ||
minus3Base=transcriptSeq[realrangeIncStop[0]-3] | ||
else: | ||
minus3Base="NA" | ||
|
||
plus4Base=transcriptSeq[realrangeIncStop[0]+4-1] | ||
|
||
if strand=="+": | ||
uORFgStart0=transcriptCoordToGenomicCoordBedFormatted(chromStartG0,blockStarts0,blockSizes,realrangeIncStop[0]) | ||
uORFgEnd1=transcriptCoordToGenomicCoordBedFormatted(chromStartG0,blockStarts0,blockSizes,realrangeIncStop[1]-1)+1 | ||
else: | ||
uORFgStart0=transcriptCoordToGenomicCoordBedFormatted(chromStartG0,blockStarts0,blockSizes,realrangeIncStop[1]-1,True) | ||
uORFgEnd1=transcriptCoordToGenomicCoordBedFormatted(chromStartG0,blockStarts0,blockSizes,realrangeIncStop[0],True)+1 | ||
|
||
|
||
|
||
clipBlockStarts,clipBlockSizes=clipBlocksByGenomicBound(chromStartG0,blockStarts0,blockSizes,uORFgStart0,uORFgEnd1,True) | ||
#transform clipblock starts by referencing first block from 0 Last option=True | ||
|
||
|
||
|
||
uORFS.append((uORFgStart0,uORFgEnd1,realrangeIncStop[0],realrangeIncStop[1],clipBlockStarts,clipBlockSizes,uORFSeq,distToCDS,minus3Base,plus4Base)) | ||
#what should we do here?!?!? | ||
|
||
return name,chrom,chromStartG0,strand,uORFS | ||
|
||
def printUsageAndExit(parser): | ||
parser.print_help() | ||
exit(1) | ||
|
||
if __name__=='__main__': | ||
|
||
parser=OptionParser(usage="Usage: %prog inbed ogbed oinfo") | ||
parser.add_option("--minORFLength",dest="minORFLength",default=-1,type="int",help="set min ORF length for uORFs") | ||
parser.add_option("--maxORFLength",dest="maxORFLength",default=-1,type="int",help="set max ORF length for uORFs") | ||
parser.add_option("--trackName",dest="trackName",default="uORFs",help="set track name for output bed file") | ||
parser.add_option("--trackDescription",dest="trackDescription",default="upstream open reading frames",help="set track description for output bed file") | ||
parser.add_option("--trackVisilibity",dest="trackVisibility",default="full",help="set track visibility") | ||
|
||
(options,args)=parser.parse_args() | ||
|
||
try: | ||
filename,ogbedfn,oinfofn=args | ||
except: | ||
printUsageAndExit(parser) | ||
|
||
ogbed=open(ogbedfn,"w") | ||
oinfo=open(oinfofn,"w") | ||
|
||
print >> ogbed,"track name='"+options.trackName+"' description='"+options.trackDescription+"' visibility='"+options.trackVisibility+"'" | ||
print >> oinfo,"uORFName\tuORFSeq\tuORFTranslation\tuORFLengthNt\tDistanceToCDS\tminus3Base\tplus4Base" | ||
|
||
fil=open(filename) | ||
for lin in fil: | ||
lin=lin.rstrip("\r\n") | ||
if len(lin)<1 or lin[0]=="#": | ||
continue | ||
|
||
fields=lin.split("\t") | ||
name,chrom,geneChromStart0,strand,uORFS=findUORFsFromBedSeqFields(fields,options) | ||
for i in range(0,len(uORFS)): | ||
uORFgStart0,uORFgEnd1,tStart0,tEnd1,clipBlockStarts,clipBlockSizes,uORFSeq,distToCDS,minus3Base,plus4Base=uORFS[i] | ||
clipBlockStartStr=",".join(toStrList(clipBlockStarts)) | ||
clipBlockSizeStr=",".join(toStrList(clipBlockSizes)) | ||
outputv=toStrList([chrom,uORFgStart0,uORFgEnd1,name+".uORF"+str(i+1),0,strand,uORFgStart0,uORFgEnd1,"0,0,0",len(clipBlockStarts),clipBlockSizeStr,clipBlockStartStr]) | ||
print >> ogbed,"\t".join(outputv) | ||
coding_dna=Seq(uORFSeq,generic_dna) | ||
protein_seq=coding_dna.translate(to_stop=True) | ||
print >> oinfo,"\t".join(toStrList([name+".uORF"+str(i+1),uORFSeq,protein_seq,len(uORFSeq),distToCDS,minus3Base,plus4Base])) | ||
|
||
|
||
fil.close() | ||
ogbed.close() | ||
oinfo.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
#!/bin/bash | ||
|
||
if [ $# -lt 3 ]; then | ||
echo "$0 inMatrix outMatrix rowJoiner [ colJoiner [ rowJoinOpts [ colJoinOpts ] ] ]" | ||
echo "join opts: (from joinu.py)" | ||
joinu.py #no args to trigger help message | ||
exit | ||
fi | ||
|
||
rowJoinOpts="" | ||
colJoinOpts="" | ||
|
||
inMatrix=$1 | ||
rowJoiner=$3 | ||
outMatrix=$2 | ||
|
||
colJoiner=$rowJoiner | ||
|
||
if [ $# -ge 4 ]; then | ||
colJoiner=$4 | ||
if [ $# -ge 5 ]; then | ||
rowJoinOpts=$5 | ||
if [ $# -ge 6 ]; then | ||
colJoinOpts=$6 | ||
fi | ||
fi | ||
|
||
fi | ||
|
||
#now the real stuff | ||
|
||
tmp1=`tempfile` #get temp file tmp1 | ||
#first rowjoin | ||
cmd="joinu.py $rowJoinOpts $rowJoiner $inMatrix > $tmp1" | ||
eval $cmd | ||
|
||
#cat $tmp1 | ||
|
||
tmp2=`tempfile` | ||
matrixTranspose.py $tmp1 > $tmp2 | ||
|
||
#cat $tmp2 | ||
|
||
tmp3=`tempfile` | ||
cmd="joinu.py $colJoinOpts $colJoiner $tmp2 > $tmp3" | ||
eval $cmd | ||
|
||
#cat $tmp3 | ||
|
||
#now transpose back | ||
matrixTranspose.py $tmp3 > $outMatrix | ||
|
||
#now remove temp file (is this neccessary?) | ||
rmrie.sh $tmp1 | ||
rmrie.sh $tmp2 | ||
rmrie.sh $tmp3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#!/bin/bash | ||
|
||
if [ $# -lt 3 ]; then | ||
echo $0 rawFileAsTemplate expMatrix outputFileWithFeatureNum | ||
exit | ||
fi | ||
|
||
rawFileAsTemplate=$1 | ||
expMatrix=$2 | ||
outputFileWithFeatureNum=$3 | ||
|
||
awk '(FNR>=10){printf("%s\t%s\t%s\n",$2,$3,$4);}' $rawFileAsTemplate > marryFeatureNum.map.00 | ||
#now join | ||
joinu.py -1.Row,.Col -2.Row,.Col marryFeatureNum.map.00 $expMatrix > $outputFileWithFeatureNum |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
#!/usr/bin/env python | ||
|
||
from optparse import OptionParser | ||
from sys import * | ||
from math import * | ||
from numpy import * | ||
|
||
def printUsageAndExit(parser): | ||
parser.print_help() | ||
exit(1) | ||
|
||
|
||
f=float | ||
i=int | ||
|
||
def f1(t=str): | ||
return t(M1[ROWNUM0][COLNUM0]) | ||
|
||
def f2(t=str): | ||
return t(M2[ROWNUM0][COLNUM0]) | ||
|
||
def f1f(): | ||
return float(f1()) | ||
|
||
def f2f(): | ||
return float(f2()) | ||
|
||
def f1i(): | ||
return int(f1()) | ||
|
||
def f2i(): | ||
return int(f2()) | ||
|
||
def loadMatrix(filename,options): | ||
M=[] | ||
colnames=[] | ||
rownames=[] | ||
topleftcorner="" | ||
fil=open(filename) | ||
lino=0 | ||
for lin in fil: | ||
lino+=1 | ||
lin=lin.rstrip("\r\n") | ||
fields=lin.split(options.IFS) | ||
if lino==1: | ||
topleftcorner=fields[0] | ||
colnames=fields[1:] | ||
else: | ||
rownames.append(fields[0]) | ||
M.append(fields[1:]) | ||
|
||
|
||
|
||
fil.close() | ||
return topleftcorner,colnames,rownames,M | ||
|
||
def dim(M): | ||
numRows=len(M) | ||
if numRows>=1: | ||
numCols=len(M[0]) | ||
else: | ||
numCols=0 | ||
|
||
return (numRows,numCols) | ||
|
||
if __name__=='__main__': | ||
parser=OptionParser("usage: %prog [options] infile1 infile2 cmd > outfil\nf1(int) or f1(i) the content of file1 as integer\nf1(f) or f1(float) the content of file1 as float\nfor example: f1()+f2() concatenates the string from file1 and file2 at each cell\nf1(float)/f2(float) divides f1 as a float by f2\n'T' if f1(float)>f2(float) else 'F' writes T if f1 > f2 as floats else it writes F") | ||
parser.add_option("--ifs",dest="IFS",default="\t",help="set input field separator [tab]") | ||
parser.add_option("--ofs",dest="OFS",default="\t",help="set output field separator [tab]") | ||
parser.add_option("--invalid-value",dest="invalidValue",default=None,help="set the invalid value to print [None: quit program]") | ||
|
||
(options,args)=parser.parse_args() | ||
|
||
try: | ||
INFILE1,INFILE2,cmd=args | ||
except: | ||
printUsageAndExit(parser) | ||
|
||
#now load file into mem | ||
|
||
|
||
topleftcorner1,colnames1,rownames1,M1=loadMatrix(INFILE1,options) | ||
topleftcorner2,colnames2,rownames2,M2=loadMatrix(INFILE2,options) | ||
|
||
numRows1,numCols1=dim(M1) | ||
|
||
if dim(M1)!=dim(M2): | ||
print >> stderr,"file1:",INFILE1,"dim(r,c)=",dim(M1),"!=",INFILE2,"dim=",dim(M2) | ||
exit(1) | ||
|
||
if tuple(colnames1)!=tuple(colnames2): | ||
print >> stderr,"warning: file1 and file2 column names do not equal file1=",colnames1,"file2=",colnames2 | ||
|
||
if tuple(rownames1)!=tuple(rownames2): | ||
print >> stderr,"warning: file1 and file2 row names do not equal file1=",rownames1,"file2=",colnames2 | ||
|
||
|
||
print >> stdout,options.OFS.join([topleftcorner1]+colnames1) | ||
#now contruct and print matrix | ||
for ROWNUM0 in range(0,numRows1): | ||
ROWNUM=ROWNUM0+1 | ||
ROWNAME=rownames1[ROWNUM0] | ||
#print name | ||
outputv=[ROWNAME] | ||
for COLNUM0 in range(0,numCols1): | ||
COLNUM=COLNUM0+1 | ||
COLNAME=colnames1[COLNUM0] | ||
try: | ||
valueToPrint=str(eval(cmd)) | ||
outputv.append(valueToPrint) | ||
except: | ||
if options.invalidValue==None: | ||
print >> stderr,"error running",cmd,"at ROWNUM,ROWNAME=",ROWNUM,ROWNAME,"COLNUM,COLNAME=",COLNUM,COLNAME | ||
exit(1) | ||
else: | ||
outputv.append(options.invalidValue) | ||
print >> stdout,options.OFS.join(outputv) | ||
|
||
|
||
|