Adding data

wilkelab · Oct 8, 2015 · 551b2f7 · 551b2f7
1 parent 36f6fb8
commit 551b2f7
Show file tree

Hide file tree

Showing 62 changed files with 279,064 additions and 0 deletions.
diff --git a/README.md → README.txt b/README.md → README.txt
diff --git a/clean_structures/4LQW.pdb b/clean_structures/4LQW.pdb
diff --git a/clean_structures/4LQW_B.fasta b/clean_structures/4LQW_B.fasta
@@ -0,0 +1,2 @@
+>4LQW_B
+MELSKETNPVVFFDVCADGEPLGRITMELFSNIVPRTAENFRALCTGEKGFGFKNSIFHRVIPDFVCQGGDITKHDGTGGQSIYGDKFEDENFDVKHTGPGLLSMANQGQNTNNSQFVITLKKAEHLDFKHVVFGFVKDGMDTVKKIESFGSPKGSVCRRITITECGQI
diff --git a/clean_structures/4LQW_BC.pdb b/clean_structures/4LQW_BC.pdb
diff --git a/clean_structures/4LQW_C.fasta b/clean_structures/4LQW_C.fasta
@@ -0,0 +1,2 @@
+>4LQW_C
+PIVQNLQGQMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALSEGATPQDLNTMLNTVGGHQAAMQMLKETINEEAAEWDRLHPVHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTHNPPIPVGEIYKRWIILGLNKIVRMYS
diff --git a/clean_structures/amino_acids.py b/clean_structures/amino_acids.py
diff --git a/clean_structures/amino_acids.pyc b/clean_structures/amino_acids.pyc
diff --git a/clean_structures/clean_pdb.py b/clean_structures/clean_pdb.py
@@ -0,0 +1,312 @@
+#!/usr/bin/python
+
+"written by Phil Bradley, Rhiju Das, Michael Tyka, TJ Brunette, and James Thompson from the Baker Lab. Edits done by Steven Combs, Sam Deluca and Jordan Willis  from the Meiler Lab."
+
+import sys
+import os
+from sys import argv, stderr, stdout
+from os import popen, system
+from os.path import exists, basename
+from amino_acids import longer_names
+from amino_acids import modres
+# Optional ssh host on which wget is run when downloading PDB files; empty means wget runs locally.
+remote_host = ''
+
+shit_stat_insres = False  # set when an atom line carries an insertion code
+shit_stat_altpos = False  # set when an atom line carries an alternate-location indicator
+shit_stat_modres = False  # set when a modified residue other than MSE is converted
+shit_stat_misdns = False  # missing density! set when a residue lacks an occupied N, CA or C
+
+fastaseq = {}  # chain id -> one-letter sequence accumulated so far
+pdbfile = ""  # text of the cleaned PDB file, appended to residue by residue
+
+
+def download_pdb(pdb_id, dest_dir):
+    # print "downloading %s" % ( pdb_id )
+    url = 'http://www.rcsb.org/pdb/files/%s.pdb.gz' % (pdb_id.upper())
+    dest = '%s/%s.pdb.gz' % (os.path.abspath(dest_dir), pdb_id.lower())
+    wget_cmd = 'wget --quiet %s -O %s' % (url, dest)
+    print wget_cmd
+    if remote_host:
+        wget_cmd = 'ssh %s %s' % (remote_host, wget_cmd)
+
+    lines = popen(wget_cmd).readlines()
+    if (exists(dest)):
+        return dest
+    else:
+        print "Error: didn't download file!"
+
+
+def check_and_print_pdb(count, residue_buffer, residue_letter):
+    global pdbfile
+  # Check that CA, N and C are present!def check_and_print_pdb( outid, residue_buffer )
+    hasCA = False
+    hasN = False
+    hasC = False
+    for line in residue_buffer:
+        atomname = line[12:16]
+        # Only add bb atoms if they have occupancy!
+        occupancy = float(line[55:60])
+        if atomname == " CA " and occupancy > 0.0:
+            hasCA = True
+        if atomname == " N  " and occupancy > 0.0:
+            hasN = True
+        if atomname == " C  " and occupancy > 0.0:
+            hasC = True
+
+  # if all three backbone atoms are present withoccupancy proceed to print the residue
+    if hasCA and hasN and hasC:
+        for line in residue_buffer:
+            # add linear residue count
+            newnum = '%4d ' % count
+            line_edit = line[0:22] + newnum + line[27:]
+            # write the residue line
+            pdbfile = pdbfile + line_edit
+
+    # finally print residue letter into fasta strea
+        chain = line[21]
+        try:
+            fastaseq[chain] += residue_letter
+        except KeyError:
+            fastaseq[chain] = residue_letter
+    # count up residue number
+        count = count + 1
+        return True
+    return False
+
+
+def print_help():
+    print "clean_pdb.py <pdb> <chain id>"
+
+    print "pdb = file name of the file. Can be with or without the .pdb file handle"
+    print "chain id = The chain id you are interested in. If more than one chain, "
+    print "you can pass the chain id without spaces. For example \"AB\" gets you"
+    print "chain A and B. \"A\" gets you chain A."
+    print "\n",
+    print "chain id = nochain. Removes chain identity from output"
+    print "chain id = ignorechain. Gets all the chains for pdb"
+    print "\n",
+    print "written by Phil Bradley, Rhiju Das, Michael Tyka, TJ Brunette, and James Thompson from the Baker Lab. Edits done by Steven Combs, Sam Deluca and Jordan Willis from the Meiler Lab."
+    sys.exit()
+
+
+if argv.count('-h'):
+    print_help()
+files_to_unlink = []
+try:
+    assert(len(argv) > 2)
+except AssertionError:
+    print_help()
+
+pdbname = argv[1].upper()
+
+if argv[2].strip() != "ignorechain" and argv[2].strip() != "nochain":
+    chainid = argv[2].upper()
+else:
+    chainid = argv[2]
+
+if (pdbname[-4:] != '.pdb' and pdbname[-8:] != '.pdb1.gz'):
+    pdbname += '.pdb'
+
+# outfile = string.lower(pdbname[0:4]) + chainid + pdbname[4:]
+outfile = pdbname[0:-4] + "_" + chainid + ".pdb"
+
+nopdbout = 0
+if argv.count('nopdbout'):
+    nopdbout = 1
+
+removechain = 0
+if argv.count('nochain'):
+    removechain = 1
+
+ignorechain = 0
+if argv.count('ignorechain'):
+    ignorechain = 1
+
+netpdbname = pdbname
+if not exists(netpdbname):
+    netpdbname = pdbname
+
+fixed_pdb = pdbname
+print "Looking for: ", fixed_pdb
+if os.path.isfile(fixed_pdb):
+    print "Found preoptimised or otherwise fixed PDB file. "
+    netpdbname = fixed_pdb
+else:
+    print "File %s doesn't exist, downloading from internet." % (netpdbname)
+    netpdbname = download_pdb(pdbname[0:4], '.')
+    files_to_unlink.append(netpdbname)
+
+if netpdbname[-3:] == '.gz':
+    lines = popen('zcat '+netpdbname, 'r').readlines()
+else:
+    lines = open(netpdbname, 'r').readlines()
+
+
+oldresnum = '   '
+count = 1
+modifiedres = ''
+
+
+residue_buffer = []
+residue_letter = ''
+residue_invalid = False
+
+if chainid == '_':
+    chainid = ' '
+
+for i in range(len(lines)):
+    line = lines[i]
+
+    if len(line) > 5 and line[:6] == 'ENDMDL': break  # Its an NMR model.
+    chainid = [i for i in chainid]
+    if (line[21] in chainid or ignorechain or removechain):
+        line_edit = line
+        if line[0:3] == 'TER':
+            continue
+        elif (line[0:6] == 'HETATM'):
+            ok = False
+
+            # Is it a modified residue ?
+            if modres.has_key(line[17:20]):
+              # if so replace it with its canonical equivalent !
+                line_edit = 'ATOM  '+line[6:17]+modres[line[17:20]] + line[20:]
+                modifiedres = modifiedres + line[17:20] + ',  '
+                # dont count MSEs as modiied residues (cos they're so common and get_pdb deal with them previosuly)
+                if line[17:20] != "MSE":
+                    shit_stat_modres = True
+                ok = True
+
+            # other substitution (of atoms mainly)
+            if (line[17:20] == 'MSE'):  # Selenomethionine
+                if (line_edit[12:14] == 'SE'):
+                    line_edit = line_edit[0:12]+' S'+line_edit[14:]
+                if len(line_edit) > 75:
+                    if (line_edit[76:78] == 'SE'):
+                        line_edit = line_edit[0:76]+' S'+line_edit[78:]
+
+            if not ok:
+                continue  # skip this atom if we havnt found a conversion
+
+        if line_edit[0:4] == 'ATOM':  # or line_edit[0:6] == 'HETATM':
+
+# if line_edit[13:14]=='P': #Nucleic acid? Skip.
+# resnum = line_edit[23:26]
+# oldresnum = resnum
+# while (resnum == oldresnum):
+# print "HERE"
+# i += 1
+# line = lines[i]
+# resnum = line_edit[23:26]
+
+            resnum = line_edit[22:27]
+
+            insres = line[26]
+            if insres != ' ':
+                shit_stat_insres = True
+
+            altpos = line[16]
+            if altpos != ' ':
+                shit_stat_altpos = True
+            # Is thresidue_letter
+            if not resnum == oldresnum:
+                if residue_buffer != []:  # is there a residue in the buffer ?
+                    if not residue_invalid:
+                        if not check_and_print_pdb(count, residue_buffer, residue_letter):
+                            # if unsuccessful
+                            shit_stat_misdns = True
+                        else:
+                            count = count + 1
+
+                residue_buffer = []
+                residue_letter = ""
+                residue_invalid = False
+
+                longname = line_edit[17:20]
+                if longer_names.has_key(longname):
+                    residue_letter = longer_names[longname]
+                else:
+                    residue_letter = 'X'
+                    residue_invalid = True
+
+            oldresnum = resnum
+
+            # What does this do ?
+            if line_edit[16:17] == 'A':
+                line_edit = line_edit[:16]+' '+line_edit[17:]
+
+            if line_edit[16:17] != ' ':
+                continue
+
+            if removechain:
+                line_edit = line_edit[0:21]+' '+line_edit[22:]
+
+            residue_buffer.append(line_edit)
+
+
+if not check_and_print_pdb(count, residue_buffer, residue_letter):
+    # if unsuccessful
+    shit_stat_misdns = True
+else:
+    count = count + 1
+
+
+flag_altpos = "---"
+if shit_stat_altpos:
+    flag_altpos = "ALT"
+flag_insres = "---"
+if shit_stat_insres:
+    flag_insres = "INS"
+flag_modres = "---"
+if shit_stat_modres:
+    flag_modres = "MOD"
+flag_misdns = "---"
+if shit_stat_misdns:
+    flag_misdns = "DNS"
+
+nres = len("".join(fastaseq.values()))
+
+flag_successful = "OK"
+if nres <= 0:
+    flag_successful = "BAD"
+
+
+print netpdbname, pdbname, "".join(chainid), "%5d" % nres, flag_altpos,  flag_insres,  flag_modres,  flag_misdns, flag_successful
+
+
+if chainid == ' ':
+    chainid = '_'
+if nres > 0:
+    if (nopdbout == 0):
+        # outfile = string.lower( basename(outfile) )
+        outfile = outfile.replace('.pdb1.gz', '.pdb')
+        outid = open(outfile, 'w')
+        outid.write(pdbfile)
+        outid.write("TER\n")
+        outid.close()
+
+    fastaid = stdout
+    if argv[2] != "ignorechain" and argv[2] != "nochain":
+        for chain in fastaseq:
+            fastaid.write('>'+pdbname[0:4]+"_"+chain+'\n')
+            fastaid.write(fastaseq[chain])
+            fastaid.write('\n')
+            handle = open(pdbname[0:4]+"_"+"".join(chain) + ".fasta", 'w')
+            handle.write('>'+pdbname[0:4]+"_"+"".join(chain)+'\n')
+            handle.write(fastaseq[chain])
+            handle.write('\n')
+            handle.close()
+    else:
+        fastaseq = ["".join(fastaseq.values())]
+        fastaid.write('>'+pdbname[0:4]+"_"+argv[2]+'\n')
+        fastaid.writelines(fastaseq)
+        fastaid.write('\n')
+        handle = open(pdbname[0:4]+"_"+argv[2] + ".fasta", 'w')
+        handle.write('>'+pdbname[0:4]+"_"+argv[2]+'\n')
+        handle.writelines(fastaseq)
+        handle.write('\n')
+        handle.close()
+
+if len(files_to_unlink) > 0:
+    for file in files_to_unlink:
+        os.unlink(file)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		>4LQW_B
		MELSKETNPVVFFDVCADGEPLGRITMELFSNIVPRTAENFRALCTGEKGFGFKNSIFHRVIPDFVCQGGDITKHDGTGGQSIYGDKFEDENFDVKHTGPGLLSMANQGQNTNNSQFVITLKKAEHLDFKHVVFGFVKDGMDTVKKIESFGSPKGSVCRRITITECGQI
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		>4LQW_C
		PIVQNLQGQMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALSEGATPQDLNTMLNTVGGHQAAMQMLKETINEEAAEWDRLHPVHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTHNPPIPVGEIYKRWIILGLNKIVRMYS