Skip to content

Commit

Permalink
Merge pull request #311 from KosinskiLab/update_anlaysis_pipeline
Browse files Browse the repository at this point in the history
Add interface PAE and plddt calculation and binding energy calculation to anlaysis pipeline
fixed bugs in V2 beta version 1
  • Loading branch information
dingquanyu authored Apr 19, 2024
2 parents 05ae2d9 + e69c864 commit d878726
Show file tree
Hide file tree
Showing 28 changed files with 1,049 additions and 1,064 deletions.
2 changes: 1 addition & 1 deletion alphapulldown/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "2.0.0.b1"
__version__ = "2.0.0.b2"
20 changes: 13 additions & 7 deletions alphapulldown/analysis_pipeline/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,19 @@ COPY software/muscle-5.1/src/Linux/muscle /software/
COPY software/pi_score/ /software/pi_score/
COPY software/uniprot_sprot.fasta /database/
COPY alpha-analysis/pi_score_env.yml /app/pi_score_env.yml
COPY alpha-analysis/*py /app/alpha-analysis/
COPY alpha-analysis/*.sh /app/
# COPY alpha-analysis/*py /app/alpha-analysis/
# COPY alpha-analysis/*.sh /app/
RUN apt-get update
RUN apt-get install -y build-essential
# RUN conda env create -f /app/programme_notebook.yml
RUN conda env create -f /app/pi_score_env.yml
RUN pip install absl-py
RUN pip install biopandas==0.4.1
RUN pip install pyrosetta-installer
RUN python -c 'import pyrosetta_installer; pyrosetta_installer.install_pyrosetta()'
RUN pip install biopython==1.81
# RUN pip install "jax[cuda]==0.3.25" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html

RUN pip install jax==0.4.16 jaxlib==0.4.16+cuda11.cudnn86 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
# RUN ["/bin/bash","-c","source activate programme_notebook && conda install -c conda-forge cctbx-base"]
RUN ["/bin/bash","-c","source activate pi_score && pip install numpy==1.16.6 scikit-learn==0.20.4"]
Expand All @@ -51,13 +57,13 @@ RUN ["/bin/bash","-c","test -d $CCP4_SCR || echo 'Unable to assign CCP4_SCR. Thi
ENV CLIBD="/software/lib64/data"
ENV PATH="/app:$PATH"
ENV PATH="/software:$PATH"
ENV PATH="/app/alpha-analysis:$PATH"
# ENV PATH="/app/alpha-analysis:$PATH"
ENV PYTHONPATH="/app:$PYTHONPATH"
ENV LD_LIBRARY_PATH="/software/lib64:$LD_LIBRARY_PATH"
# Below have to relink the libz:
RUN ["/bin/bash","-c","cd /software/lib64 && mv libz.so.1 libz.so.1.old && ln -s /lib/x86_64-linux-gnu/libz.so.1"]
RUN chmod +x /app/run_get_good_pae.sh
RUN chmod +x /app/run_execute_notebook.sh
RUN chmod +x /app/run_pi_score.sh
RUN chmod +x /app/alpha-analysis/get_good_inter_pae.py
# RUN chmod +x /app/run_get_good_pae.sh
# RUN chmod +x /app/run_execute_notebook.sh
# RUN chmod +x /app/run_pi_score.sh
# RUN chmod +x /app/alpha-analysis/get_good_inter_pae.py
ENTRYPOINT [ "bash" ]
235 changes: 112 additions & 123 deletions alphapulldown/analysis_pipeline/get_good_inter_pae.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,159 +1,148 @@
#!/usr/bin/env python3

from math import pi
from operator import index
import os
from calculate_mpdockq import *
from pdb_analyser import PDBAnalyser
from Bio.PDB import PDBParser
from Bio.PDB.Polypeptide import PPBuilder
import os
import pickle
from absl import flags,app,logging
from absl import flags, app, logging
import json
import numpy as np
import pandas as pd
import subprocess
from calculate_mpdockq import *
import sys
import gzip
from typing import Tuple

flags.DEFINE_string('output_dir',None,'directory where predicted models are stored')
flags.DEFINE_float('cutoff',5.0,'cutoff value of PAE. i.e. only pae<cutoff is counted good')
flags.DEFINE_integer('surface_thres',2,'surface threshold. must be integer')
FLAGS=flags.FLAGS
flags.DEFINE_string('output_dir', None,
'directory where predicted models are stored')
flags.DEFINE_float(
'cutoff', 5.0, 'cutoff value of PAE. i.e. only pae<cutoff is counted good')
flags.DEFINE_integer('surface_thres', 2, 'surface threshold. must be integer')
FLAGS = flags.FLAGS

def examine_inter_pae(pae_mtx,seqs,cutoff):

def examine_inter_pae(pae_mtx, seq_lengths, cutoff):
"""A function that checks inter-pae values in multimer prediction jobs"""
lens = [len(seq) for seq in seqs]
old_lenth=0
for length in lens:
old_lenth = 0
mtx = pae_mtx.copy()
for length in seq_lengths:
new_length = old_lenth + length
pae_mtx[old_lenth:new_length,old_lenth:new_length] = 50
mtx[old_lenth:new_length, old_lenth:new_length] = 50
old_lenth = new_length
check = np.where(pae_mtx<cutoff)[0].size !=0
check = np.where(mtx < cutoff)[0].size != 0

return check


def obtain_mpdockq(work_dir):
"""Returns mpDockQ if more than two chains otherwise return pDockQ"""
pdb_path = os.path.join(work_dir,'ranked_0.pdb')
pdb_path = os.path.join(work_dir, 'ranked_0.pdb')
pdb_chains, chain_coords, chain_CA_inds, chain_CB_inds = read_pdb(pdb_path)
best_plddt = get_best_plddt(work_dir)
plddt_per_chain = read_plddt(best_plddt,chain_CA_inds)
complex_score,num_chains = score_complex(chain_coords,chain_CB_inds,plddt_per_chain)
if complex_score is not None and num_chains>2:
plddt_per_chain = read_plddt(best_plddt, chain_CA_inds)
complex_score, num_chains = score_complex(
chain_coords, chain_CB_inds, plddt_per_chain)
if complex_score is not None and num_chains > 2:
mpDockq_or_pdockq = calculate_mpDockQ(complex_score)
elif complex_score is not None and num_chains==2:
chain_coords,plddt_per_chain = read_pdb_pdockq(pdb_path)
mpDockq_or_pdockq = calc_pdockq(chain_coords,plddt_per_chain,t=8)
elif complex_score is not None and num_chains == 2:
chain_coords, plddt_per_chain = read_pdb_pdockq(pdb_path)
mpDockq_or_pdockq = calc_pdockq(chain_coords, plddt_per_chain, t=8)
else:
mpDockq_or_pdockq = "None"
return mpDockq_or_pdockq

def run_and_summarise_pi_score(workd_dir,jobs,surface_thres):
return mpDockq_or_pdockq, plddt_per_chain

"""A function to calculate all predicted models' pi_scores and make a pandas df of the results"""
def obtain_pae_and_iptm(result_subdir: str, best_model: str) -> Tuple[np.array, float]:
"""A function that obtains PAE matrix and iptm score from either results pickle, zipped pickle, or pae_json"""
try:
os.rmdir(f"{workd_dir}/pi_score_outputs")
except:
pass
subprocess.run(f"mkdir {workd_dir}/pi_score_outputs",shell=True,executable='/bin/bash')
pi_score_outputs = os.path.join(workd_dir,"pi_score_outputs")
for job in jobs:
subdir = os.path.join(workd_dir,job)
if not os.path.isfile(os.path.join(subdir,"ranked_0.pdb")):
print(f"{job} failed. Cannot find ranked_0.pdb in {subdir}")
sys.exit()
else:
pdb_path = os.path.join(subdir,"ranked_0.pdb")
output_dir = os.path.join(pi_score_outputs,f"{job}")
logging.info(f"pi_score output for {job} will be stored at {output_dir}")
subprocess.run(f"source activate pi_score && export PYTHONPATH=/software:$PYTHONPATH && python /software/pi_score/run_piscore_wc.py -p {pdb_path} -o {output_dir} -s {surface_thres} -ps 10",shell=True,executable='/bin/bash')

ranking_results = json.load(
open(os.path.join(result_subdir, "ranking_debug.json")))
except FileNotFoundError as e:
logging.warning(f"ranking_debug.json is not found at {result_subdir}")
iptm_score = "None"
if "iptm" in ranking_results:
iptm_score = ranking_results['iptm'].get(best_model, None)

if os.path.exists(f"{result_subdir}/pae_{best_model}.json"):
pae_results = json.load(
open(f"{result_subdir}/pae_{best_model}.json"))[0]['predicted_aligned_error']
pae_mtx = np.array(pae_results)

if iptm_score == "None":
try:
check_dict = pickle.load(
open(os.path.join(result_subdir, f"result_{best_model}.pkl"), 'rb'))
except FileNotFoundError:
logging.info(os.path.join(
result_subdir, f"result_{best_model}.pkl")+" does not exist. Will search for pkl.gz")
check_dict = pickle.load(gzip.open(os.path.join(
result_subdir, f"result_{best_model}.pkl.gz"), 'rb'))
finally:
logging.info(f"finished reading results for the best model.")
pae_mtx = check_dict['predicted_aligned_error']
iptm_score = check_dict['iptm']
return pae_mtx, iptm_score


def obtain_seq_lengths(result_subdir: str) -> list:
"""A function that obtain length of each chain in ranked_0.pdb"""
best_result_pdb = os.path.join(result_subdir, "ranked_0.pdb")
seq_length = []
pdb_parser = PDBParser()
sequence_builder = PPBuilder()
if not os.path.exists(best_result_pdb):
raise FileNotFoundError(
f"ranked_0.pdb is not found at {result_subdir}")
else:
structure = pdb_parser.get_structure("ranked_0", best_result_pdb)
seqs = sequence_builder.build_peptides(structure)
for seq in seqs:
seq_length.append(len(seq.get_sequence()))
return seq_length

output_df = pd.DataFrame()
for job in jobs:
subdir = os.path.join(pi_score_outputs,job)
csv_files = [f for f in os.listdir(subdir) if 'filter_intf_features' in f]
pi_score_files = [f for f in os.listdir(subdir) if 'pi_score_' in f]
filtered_df = pd.read_csv(os.path.join(subdir,csv_files[0]))

if filtered_df.shape[0]==0:
for column in filtered_df.columns:
filtered_df[column] = ["None"]
filtered_df['jobs'] = str(job)
filtered_df['pi_score'] = "No interface detected"
else:
with open(os.path.join(subdir,pi_score_files[0]),'r') as f:
lines = [l for l in f.readlines() if "#" not in l]
if len(lines)>0:
pi_score = pd.read_csv(os.path.join(subdir,pi_score_files[0]))
pi_score['jobs']=str(job)
else:
pi_score = pd.DataFrame.from_dict({"pi_score":['SC: mds: too many atoms']})
f.close()
filtered_df['jobs'] = str(job)
pi_score['interface'] = pi_score['chains']
filtered_df=pd.merge(filtered_df,pi_score,on=['jobs','interface'])
try:
filtered_df=filtered_df.drop(columns=["#PDB","pdb"," pvalue","chains","predicted_class"])
except:
pass

output_df = pd.concat([output_df,filtered_df])
return output_df



def main(argv):
jobs = os.listdir(FLAGS.output_dir)
good_jobs = []
iptm_ptm = list()
iptm = list()
mpDockq_scores = list()
count = 0
good_jobs = []
output_df = pd.DataFrame()
for job in jobs:
logging.info(f"now processing {job}")
if os.path.isfile(os.path.join(FLAGS.output_dir,job,'ranking_debug.json')):
count=count +1
result_subdir = os.path.join(FLAGS.output_dir,job)
best_model = json.load(open(os.path.join(result_subdir,"ranking_debug.json"),'rb'))['order'][0]
data = json.load(open(os.path.join(result_subdir,"ranking_debug.json"),'rb'))
if "iptm" in data.keys() or "iptm+ptm" in data.keys():
if os.path.isfile(os.path.join(FLAGS.output_dir, job, 'ranking_debug.json')):
pdb_analyser = PDBAnalyser(os.path.join(
FLAGS.output_dir, job, "ranked_0.pdb"))
count = count + 1
result_subdir = os.path.join(FLAGS.output_dir, job)
best_model = json.load(
open(os.path.join(result_subdir, "ranking_debug.json"), 'r'))['order'][0]
data = json.load(
open(os.path.join(result_subdir, "ranking_debug.json"), 'r'))
if "iptm" in data.keys() and "iptm+ptm" in data.keys():
iptm_ptm_score = data['iptm+ptm'][best_model]
try:
check_dict = pickle.load(open(os.path.join(result_subdir,f"result_{best_model}.pkl"),'rb'))
except FileNotFoundError:
logging.info(os.path.join(result_subdir,f"result_{best_model}.pkl")+" does not exist. Will search for pkl.gz")
check_dict = pickle.load(gzip.open(os.path.join(result_subdir,f"result_{best_model}.pkl.gz"),'rb'))
finally:
logging.info(f"finished reading result pickle for the best model.")
seqs = check_dict['seqs']
iptm_score = check_dict['iptm']
pae_mtx = check_dict['predicted_aligned_error']
check = examine_inter_pae(pae_mtx,seqs,cutoff=FLAGS.cutoff)
mpDockq_score = obtain_mpdockq(os.path.join(FLAGS.output_dir,job))
pae_mtx, iptm_score = obtain_pae_and_iptm(
result_subdir, best_model)
seq_lengths = obtain_seq_lengths(result_subdir)
check = examine_inter_pae(
pae_mtx, seq_lengths, cutoff=FLAGS.cutoff)
mpDockq_score, plddt_per_chain = obtain_mpdockq(
os.path.join(FLAGS.output_dir, job))
if check:
good_jobs.append(str(job))
iptm_ptm.append(iptm_ptm_score)
iptm.append(iptm_score)
mpDockq_scores.append(mpDockq_score)
logging.info(f"done for {job} {count} out of {len(jobs)} finished.")
if len(good_jobs) > 0:
other_measurements_df=pd.DataFrame.from_dict({
"jobs":good_jobs,
"iptm_ptm":iptm_ptm,
"iptm":iptm,
"mpDockQ/pDockQ":mpDockq_scores
})
pi_score_df = run_and_summarise_pi_score(FLAGS.output_dir,good_jobs,FLAGS.surface_thres)
pi_score_df=pd.merge(pi_score_df,other_measurements_df,on="jobs")
columns = list(pi_score_df.columns.values)
columns.pop(columns.index('jobs'))
pi_score_df = pi_score_df[['jobs'] + columns]
pi_score_df = pi_score_df.sort_values(by='iptm',ascending=False)

pi_score_df.to_csv(os.path.join(FLAGS.output_dir,"predictions_with_good_interpae.csv"),index=False)

good_jobs.append(job)
score_df = pdb_analyser(
pae_mtx, plddt_per_chain)
score_df['jobs']=job
score_df['iptm_ptm'] = iptm_ptm_score
score_df['iptm'] = iptm_score
score_df['pDockQ/mpDockQ'] = mpDockq_score
for i in ['pDockQ/mpDockQ', 'iptm', 'iptm_ptm','jobs']:
score_df.insert(0, i, score_df.pop(i))
output_df = pd.concat([score_df,output_df])
logging.info(
f"done for {job} {count} out of {len(jobs)} finished.")
if len(good_jobs) == 0:
logging.info(
f"Unfortunately, none of your protein models had at least one PAE on the interface below your cutoff value : {FLAGS.cutoff}.\n Please consider using a larger cutoff.")
else:
logging.info(f"Unfortunately, none of your protein models had at least one PAE on the interface below your cutoff value : {FLAGS.cutoff}.\n Please consider using a larger cutoff.")
output_df.to_csv(os.path.join(FLAGS.output_dir,"predictions_with_good_interpae.csv"),index=False)

if __name__ =='__main__':
app.run(main)
if __name__ == '__main__':
app.run(main)
Loading

0 comments on commit d878726

Please sign in to comment.