Skip to content

Commit

Permalink
Adding data
Browse files Browse the repository at this point in the history
  • Loading branch information
elj299 committed Oct 8, 2015
1 parent 36f6fb8 commit 551b2f7
Show file tree
Hide file tree
Showing 62 changed files with 279,064 additions and 0 deletions.
File renamed without changes.
10,583 changes: 10,583 additions & 0 deletions clean_structures/4LQW.pdb

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions clean_structures/4LQW_B.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>4LQW_B
MELSKETNPVVFFDVCADGEPLGRITMELFSNIVPRTAENFRALCTGEKGFGFKNSIFHRVIPDFVCQGGDITKHDGTGGQSIYGDKFEDENFDVKHTGPGLLSMANQGQNTNNSQFVITLKKAEHLDFKHVVFGFVKDGMDTVKKIESFGSPKGSVCRRITITECGQI
2,445 changes: 2,445 additions & 0 deletions clean_structures/4LQW_BC.pdb

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions clean_structures/4LQW_C.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>4LQW_C
PIVQNLQGQMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALSEGATPQDLNTMLNTVGGHQAAMQMLKETINEEAAEWDRLHPVHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTHNPPIPVGEIYKRWIILGLNKIVRMYS
612 changes: 612 additions & 0 deletions clean_structures/amino_acids.py

Large diffs are not rendered by default.

Binary file added clean_structures/amino_acids.pyc
Binary file not shown.
312 changes: 312 additions & 0 deletions clean_structures/clean_pdb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,312 @@
#!/usr/bin/python

"written by Phil Bradley, Rhiju Das, Michael Tyka, TJ Brunette, and James Thompson from the Baker Lab. Edits done by Steven Combs, Sam Deluca and Jordan Willis from the Meiler Lab."

import sys
import os
from sys import argv, stderr, stdout
from os import popen, system
from os.path import exists, basename
from amino_acids import longer_names
from amino_acids import modres
# Remote host for downloading pdbs: when non-empty, download_pdb() runs its
# wget command on this host through ssh instead of locally.
remote_host = ''

# Status flags raised while parsing; each one selects a column of the summary
# line printed at the end of the run.
shit_stat_insres = False  # a processed atom record had a non-blank insertion code (line[26])
shit_stat_altpos = False  # a processed atom record had a non-blank alternate-location flag (line[16])
shit_stat_modres = False  # a modified residue other than MSE was mapped to its canonical name
shit_stat_misdns = False # missing density: check_and_print_pdb() rejected a residue

# Per-chain one-letter sequence keyed by chain ID; filled by check_and_print_pdb().
fastaseq = {}
# Text of the cleaned PDB output; appended to by check_and_print_pdb().
pdbfile = ""


def download_pdb(pdb_id, dest_dir):
    """Fetch the gzipped PDB entry ``pdb_id`` into ``dest_dir`` using wget.

    The wget command line is echoed to stdout before it runs.  When the
    module-level ``remote_host`` is non-empty the command is executed on that
    host through ssh.

    Returns the destination path if that file exists afterwards; otherwise
    prints an error message and returns None.
    """
    source_url = 'http://www.rcsb.org/pdb/files/%s.pdb.gz' % (pdb_id.upper())
    target = '%s/%s.pdb.gz' % (os.path.abspath(dest_dir), pdb_id.lower())
    command = 'wget --quiet %s -O %s' % (source_url, target)
    print(command)
    if remote_host:
        command = 'ssh %s %s' % (remote_host, command)

    # Read the pipe to EOF so we do not look for the file before the
    # command's output has ended.
    popen(command).readlines()
    if not exists(target):
        print("Error: didn't download file!")
        return None
    return target


def check_and_print_pdb(count, residue_buffer, residue_letter):
    """Emit one buffered residue if its backbone is complete.

    A residue is accepted only when its N, CA and C atoms are all present
    with an occupancy greater than zero.  On acceptance every line of the
    residue gets ``count`` written into columns line[22:27] (which also
    blanks the insertion code), the lines are appended to the global
    ``pdbfile`` text, and ``residue_letter`` is appended to the global
    ``fastaseq`` entry of the residue's chain (line[21] of its last line).

    count          -- sequential residue number to write.
    residue_buffer -- list of PDB atom-record lines of a single residue.
    residue_letter -- one-letter code to add to the FASTA sequence.

    Returns True if the residue was written, False otherwise.  ``count`` is
    not advanced here; that is the caller's job.

    Raises ValueError if an occupancy field cannot be parsed as a float.
    """
    global pdbfile

    # The atom name is the 4-character field line[12:16], so the names it is
    # compared with must be padded to 4 characters too; a 3-character literal
    # can never match and would make every residue look incomplete.
    backbone = (" N  ", " CA ", " C  ")
    seen = set()
    for line in residue_buffer:
        atomname = line[12:16]
        # Only count backbone atoms that have occupancy.
        occupancy = float(line[55:60])
        if occupancy > 0.0 and atomname in backbone:
            seen.add(atomname)

    if len(seen) != len(backbone):
        return False

    # Linear residue number plus a blank insertion-code column.
    newnum = '%4d ' % count
    pdbfile += "".join(line[0:22] + newnum + line[27:]
                       for line in residue_buffer)

    # Finally add the residue letter to the FASTA sequence of its chain.
    chain = residue_buffer[-1][21]
    fastaseq[chain] = fastaseq.get(chain, '') + residue_letter
    return True


def print_help():
    """Write the usage text to stdout, then end the script via sys.exit()."""
    usage = [
        "clean_pdb.py <pdb> <chain id>",
        "pdb = file name of the file. Can be with or without the .pdb file handle",
        "chain id = The chain id you are interested in. If more than one chain, ",
        "you can pass the chain id without spaces. For example \"AB\" gets you",
        "chain A and B. \"A\" gets you chain A.",
        "",
        "chain id = nochain. Removes chain identity from output",
        "chain id = ignorechain. Gets all the chains for pdb",
        "",
        "written by Phil Bradley, Rhiju Das, Michael Tyka, TJ Brunette, and James Thompson from the Baker Lab. Edits done by Steven Combs, Sam Deluca and Jordan Willis from the Meiler Lab.",
    ]
    # One write for the whole text; each entry ends up on its own line.
    sys.stdout.write("\n".join(usage) + "\n")
    sys.exit()


# ---------------------------------------------------------------------------
# Command line handling and input loading.
#   argv[1] -- PDB identifier or file name, with or without extension
#   argv[2] -- chain id(s), or the keywords "nochain" / "ignorechain"
# Also recognised anywhere on the command line: -h, nopdbout.
# ---------------------------------------------------------------------------
if argv.count('-h'):
    print_help()

# Files downloaded by this run; they are removed again at the end.
files_to_unlink = []

# Explicit check instead of an assert, which "python -O" would strip.
if len(argv) <= 2:
    print_help()

# The identifier is upper-cased, but a known extension is matched
# case-insensitively and kept in lower case.  Upper-casing the whole argument
# first would turn "x.pdb" into "X.PDB", fail both suffix tests and produce
# "X.PDB.pdb".
pdbname = argv[1]
for known_suffix in ('.pdb1.gz', '.pdb'):
    if pdbname.lower().endswith(known_suffix):
        pdbname = pdbname[:-len(known_suffix)].upper() + known_suffix
        break
else:
    pdbname = pdbname.upper() + '.pdb'

# Chain ids are upper-cased; the two keywords are passed through untouched.
if argv[2].strip() not in ("ignorechain", "nochain"):
    chainid = argv[2].upper()
else:
    chainid = argv[2]

outfile = pdbname[0:-4] + "_" + chainid + ".pdb"

nopdbout = 1 if argv.count('nopdbout') else 0
removechain = 1 if argv.count('nochain') else 0
ignorechain = 1 if argv.count('ignorechain') else 0

fixed_pdb = pdbname
print("Looking for:  %s" % fixed_pdb)
if os.path.isfile(fixed_pdb):
    print("Found preoptimised or otherwise fixed PDB file. ")
    netpdbname = fixed_pdb
else:
    print("File %s doesn't exist, downloading from internet." % (pdbname))
    netpdbname = download_pdb(pdbname[0:4], '.')
    if netpdbname is None:
        # download_pdb() returns None when no file was produced; stop here
        # with a clear message instead of failing on None[-3:] below.
        sys.exit("Error: could not obtain %s" % pdbname[0:4])
    files_to_unlink.append(netpdbname)

# Read all records; gzipped input is decompressed through zcat.
if netpdbname[-3:] == '.gz':
    with popen('zcat ' + netpdbname, 'r') as pdb_stream:
        lines = pdb_stream.readlines()
else:
    with open(netpdbname, 'r') as pdb_stream:
        lines = pdb_stream.readlines()

# ---------------------------------------------------------------------------
# Walk the input records, group them into residues and hand each finished
# residue to check_and_print_pdb().
# ---------------------------------------------------------------------------
oldresnum = ' '       # residue-number field of the previous atom record
count = 1             # sequential number given to the next accepted residue
modifiedres = ''      # names of modified residues that were converted

residue_buffer = []   # edited atom lines of the residue being collected
residue_letter = ''   # one-letter code of that residue
residue_invalid = False  # True when its residue name is not in longer_names

if chainid == '_':
    chainid = ' '
# Split the chain ids into single characters once, not once per input line.
chainid = list(chainid)

for line in lines:
    if line[:6] == 'ENDMDL':
        break  # Its an NMR model.

    # A record too short to hold a chain id (e.g. "END") cannot be an atom
    # record; skip it rather than fail on line[21].
    if len(line) < 22:
        continue
    if not (line[21] in chainid or ignorechain or removechain):
        continue
    if line[0:3] == 'TER':
        continue

    line_edit = line
    if line[0:6] == 'HETATM':
        resname = line[17:20]
        # Only modified residues with a known canonical equivalent are kept.
        if resname not in modres:
            continue
        # The record name field is 6 characters wide, so 'ATOM' must be padded
        # to 6 to keep every later fixed-column slice aligned.
        line_edit = 'ATOM  ' + line[6:17] + modres[resname] + line[20:]
        modifiedres = modifiedres + resname + ', '
        if resname != "MSE":
            # MSE is not counted as a modified residue (it is so common).
            shit_stat_modres = True
        else:
            # Selenomethionine: rewrite the selenium atom as sulphur, in the
            # atom name and, when present, in the element columns.
            if line_edit[12:14] == 'SE':
                line_edit = line_edit[0:12] + ' S' + line_edit[14:]
            if len(line_edit) > 75 and line_edit[76:78] == 'SE':
                line_edit = line_edit[0:76] + ' S' + line_edit[78:]

    if line_edit[0:4] != 'ATOM':
        continue

    resnum = line_edit[22:27]
    if line[26] != ' ':
        shit_stat_insres = True
    if line[16] != ' ':
        shit_stat_altpos = True

    if resnum != oldresnum:
        # A new residue starts: flush the one collected so far.
        if residue_buffer and not residue_invalid:
            if check_and_print_pdb(count, residue_buffer, residue_letter):
                count = count + 1
            else:
                shit_stat_misdns = True

        residue_buffer = []
        residue_invalid = False
        longname = line_edit[17:20]
        if longname in longer_names:
            residue_letter = longer_names[longname]
        else:
            residue_letter = 'X'
            residue_invalid = True

    oldresnum = resnum

    # Alternate locations: 'A' is kept (with the flag blanked), any other
    # non-blank alternate location is dropped.
    if line_edit[16:17] == 'A':
        line_edit = line_edit[:16] + ' ' + line_edit[17:]
    if line_edit[16:17] != ' ':
        continue

    if removechain:
        line_edit = line_edit[0:21] + ' ' + line_edit[22:]

    residue_buffer.append(line_edit)

# Flush the last residue under the same validity rule used inside the loop:
# a residue with an unknown name is never written.
if not residue_invalid:
    if check_and_print_pdb(count, residue_buffer, residue_letter):
        count = count + 1
    else:
        shit_stat_misdns = True


# Turn the status booleans into the fixed-width markers of the summary line.
flag_altpos = "ALT" if shit_stat_altpos else "---"
flag_insres = "INS" if shit_stat_insres else "---"
flag_modres = "MOD" if shit_stat_modres else "---"
flag_misdns = "DNS" if shit_stat_misdns else "---"

# Total number of residues written, summed over all chains.
nres = sum(len(chain_seq) for chain_seq in fastaseq.values())

flag_successful = "BAD" if nres <= 0 else "OK"

# One space-separated summary line per run.
summary_fields = [
    netpdbname,
    pdbname,
    "".join(chainid),
    "%5d" % nres,
    flag_altpos,
    flag_insres,
    flag_modres,
    flag_misdns,
    flag_successful,
]
print(" ".join(summary_fields))


# ---------------------------------------------------------------------------
# Write the cleaned PDB file and the FASTA output, then remove downloads.
# NOTE(review): the original indentation is not visible here; the FASTA
# section is assumed to sit inside "if nres > 0" -- confirm against the
# upstream script.
# ---------------------------------------------------------------------------
if nres > 0:
    if nopdbout == 0:
        outfile = outfile.replace('.pdb1.gz', '.pdb')
        with open(outfile, 'w') as outid:
            outid.write(pdbfile)
            outid.write("TER\n")

    fastaid = stdout
    entry = pdbname[0:4]
    if argv[2] != "ignorechain" and argv[2] != "nochain":
        # One FASTA record per chain, echoed to stdout and written to
        # <entry>_<chain>.fasta.  Chains are visited in sorted order so the
        # output does not depend on dict ordering.
        for chain in sorted(fastaseq):
            record = '>' + entry + "_" + chain + '\n' + fastaseq[chain] + '\n'
            fastaid.write(record)
            with open(entry + "_" + chain + ".fasta", 'w') as handle:
                handle.write(record)
    else:
        # nochain / ignorechain: a single record holding all chains joined,
        # again in sorted chain order.
        merged = "".join(fastaseq[chain] for chain in sorted(fastaseq))
        record = '>' + entry + "_" + argv[2] + '\n' + merged + '\n'
        fastaid.write(record)
        with open(entry + "_" + argv[2] + ".fasta", 'w') as handle:
            handle.write(record)

# Remove files that were downloaded only for this run.
for stale_path in files_to_unlink:
    os.unlink(stale_path)
Loading

0 comments on commit 551b2f7

Please sign in to comment.