Merge pull request #311 from KosinskiLab/update_anlaysis_pipeline

Add interface PAE and plddt calculation and binding energy calculation to anlaysis pipeline fixed bugs in V2 beta version 1
KosinskiLab · Apr 19, 2024 · d878726 · d878726
2 parents 05ae2d9 + e69c864
commit d878726
Show file tree

Hide file tree

Showing 28 changed files with 1,049 additions and 1,064 deletions.
diff --git a/alphapulldown/__init__.py b/alphapulldown/__init__.py
@@ -1 +1 @@
-__version__ = "2.0.0.b1"
+__version__ = "2.0.0.b2"
diff --git a/alphapulldown/analysis_pipeline/Dockerfile b/alphapulldown/analysis_pipeline/Dockerfile
@@ -19,13 +19,19 @@ COPY software/muscle-5.1/src/Linux/muscle /software/
 COPY software/pi_score/ /software/pi_score/
 COPY software/uniprot_sprot.fasta /database/
 COPY alpha-analysis/pi_score_env.yml /app/pi_score_env.yml
-COPY alpha-analysis/*py /app/alpha-analysis/
-COPY alpha-analysis/*.sh /app/
+# COPY alpha-analysis/*py /app/alpha-analysis/
+# COPY alpha-analysis/*.sh /app/
 RUN apt-get update
 RUN apt-get install -y build-essential
 # RUN conda env create -f /app/programme_notebook.yml
 RUN conda env create -f /app/pi_score_env.yml
 RUN pip install absl-py
+RUN pip install biopandas==0.4.1
+RUN pip install pyrosetta-installer 
+RUN python -c 'import pyrosetta_installer; pyrosetta_installer.install_pyrosetta()'
+RUN pip install biopython==1.81
+# RUN pip install "jax[cuda]==0.3.25" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
+
 RUN pip install jax==0.4.16 jaxlib==0.4.16+cuda11.cudnn86 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
 # RUN ["/bin/bash","-c","source activate programme_notebook && conda install -c conda-forge cctbx-base"]
 RUN ["/bin/bash","-c","source activate pi_score && pip install numpy==1.16.6 scikit-learn==0.20.4"]
@@ -51,13 +57,13 @@ RUN ["/bin/bash","-c","test -d $CCP4_SCR || echo 'Unable to assign CCP4_SCR. Thi
 ENV CLIBD="/software/lib64/data"
 ENV PATH="/app:$PATH"
 ENV PATH="/software:$PATH"
-ENV PATH="/app/alpha-analysis:$PATH"
+# ENV PATH="/app/alpha-analysis:$PATH"
 ENV PYTHONPATH="/app:$PYTHONPATH"
 ENV LD_LIBRARY_PATH="/software/lib64:$LD_LIBRARY_PATH"
 # Below have to relink the libz:
 RUN ["/bin/bash","-c","cd /software/lib64 && mv libz.so.1 libz.so.1.old && ln -s /lib/x86_64-linux-gnu/libz.so.1"]
-RUN chmod +x /app/run_get_good_pae.sh
-RUN chmod +x /app/run_execute_notebook.sh
-RUN chmod +x /app/run_pi_score.sh
-RUN chmod +x /app/alpha-analysis/get_good_inter_pae.py
+# RUN chmod +x /app/run_get_good_pae.sh
+# RUN chmod +x /app/run_execute_notebook.sh
+# RUN chmod +x /app/run_pi_score.sh
+# RUN chmod +x /app/alpha-analysis/get_good_inter_pae.py
 ENTRYPOINT [ "bash" ]
diff --git a/alphapulldown/analysis_pipeline/get_good_inter_pae.py b/alphapulldown/analysis_pipeline/get_good_inter_pae.py
@@ -1,159 +1,148 @@
 #!/usr/bin/env python3
 
-from math import pi
-from operator import index
-import os 
+from calculate_mpdockq import *
+from pdb_analyser import PDBAnalyser
+from Bio.PDB import PDBParser
+from Bio.PDB.Polypeptide import PPBuilder
+import os
 import pickle
-from absl import flags,app,logging
+from absl import flags, app, logging
 import json
 import numpy as np
 import pandas as pd
-import subprocess
-from calculate_mpdockq import *
-import sys 
 import gzip
+from typing import Tuple
 
-flags.DEFINE_string('output_dir',None,'directory where predicted models are stored')
-flags.DEFINE_float('cutoff',5.0,'cutoff value of PAE. i.e. only pae<cutoff is counted good')
-flags.DEFINE_integer('surface_thres',2,'surface threshold. must be integer')
-FLAGS=flags.FLAGS
+flags.DEFINE_string('output_dir', None,
+                    'directory where predicted models are stored')
+flags.DEFINE_float(
+    'cutoff', 5.0, 'cutoff value of PAE. i.e. only pae<cutoff is counted good')
+flags.DEFINE_integer('surface_thres', 2, 'surface threshold. must be integer')
+FLAGS = flags.FLAGS
 
-def examine_inter_pae(pae_mtx,seqs,cutoff):
+
+def examine_inter_pae(pae_mtx, seq_lengths, cutoff):
     """A function that checks inter-pae values in multimer prediction jobs"""
-    lens = [len(seq) for seq in seqs]
-    old_lenth=0
-    for length in lens:
+    old_lenth = 0
+    mtx = pae_mtx.copy()
+    for length in seq_lengths:
         new_length = old_lenth + length
-        pae_mtx[old_lenth:new_length,old_lenth:new_length] = 50
+        mtx[old_lenth:new_length, old_lenth:new_length] = 50
         old_lenth = new_length
-    check = np.where(pae_mtx<cutoff)[0].size !=0
+    check = np.where(mtx < cutoff)[0].size != 0
 
     return check
 
 
 def obtain_mpdockq(work_dir):
     """Returns mpDockQ if more than two chains otherwise return pDockQ"""
-    pdb_path = os.path.join(work_dir,'ranked_0.pdb')
+    pdb_path = os.path.join(work_dir, 'ranked_0.pdb')
     pdb_chains, chain_coords, chain_CA_inds, chain_CB_inds = read_pdb(pdb_path)
     best_plddt = get_best_plddt(work_dir)
-    plddt_per_chain = read_plddt(best_plddt,chain_CA_inds)
-    complex_score,num_chains = score_complex(chain_coords,chain_CB_inds,plddt_per_chain)
-    if complex_score is not None and num_chains>2:
+    plddt_per_chain = read_plddt(best_plddt, chain_CA_inds)
+    complex_score, num_chains = score_complex(
+        chain_coords, chain_CB_inds, plddt_per_chain)
+    if complex_score is not None and num_chains > 2:
         mpDockq_or_pdockq = calculate_mpDockQ(complex_score)
-    elif complex_score is not None and num_chains==2:
-        chain_coords,plddt_per_chain = read_pdb_pdockq(pdb_path)
-        mpDockq_or_pdockq = calc_pdockq(chain_coords,plddt_per_chain,t=8)
+    elif complex_score is not None and num_chains == 2:
+        chain_coords, plddt_per_chain = read_pdb_pdockq(pdb_path)
+        mpDockq_or_pdockq = calc_pdockq(chain_coords, plddt_per_chain, t=8)
     else:
         mpDockq_or_pdockq = "None"
-    return mpDockq_or_pdockq
-
-def run_and_summarise_pi_score(workd_dir,jobs,surface_thres):
+    return mpDockq_or_pdockq, plddt_per_chain
 
-    """A function to calculate all predicted models' pi_scores and make a pandas df of the results"""
+def obtain_pae_and_iptm(result_subdir: str, best_model: str) -> Tuple[np.array, float]:
+    """A function that obtains PAE matrix and iptm score from either results pickle, zipped pickle, or pae_json"""
     try:
-        os.rmdir(f"{workd_dir}/pi_score_outputs")
-    except:
-        pass
-    subprocess.run(f"mkdir {workd_dir}/pi_score_outputs",shell=True,executable='/bin/bash')
-    pi_score_outputs = os.path.join(workd_dir,"pi_score_outputs")
-    for job in jobs:
-        subdir = os.path.join(workd_dir,job)
-        if not os.path.isfile(os.path.join(subdir,"ranked_0.pdb")):
-            print(f"{job} failed. Cannot find ranked_0.pdb in {subdir}")
-            sys.exit()
-        else:
-            pdb_path = os.path.join(subdir,"ranked_0.pdb")
-            output_dir = os.path.join(pi_score_outputs,f"{job}")
-            logging.info(f"pi_score output for {job} will be stored at {output_dir}")
-            subprocess.run(f"source activate pi_score && export PYTHONPATH=/software:$PYTHONPATH && python /software/pi_score/run_piscore_wc.py -p {pdb_path} -o {output_dir} -s {surface_thres} -ps 10",shell=True,executable='/bin/bash')
-
+        ranking_results = json.load(
+            open(os.path.join(result_subdir, "ranking_debug.json")))
+    except FileNotFoundError as e:
+        logging.warning(f"ranking_debug.json is not found at {result_subdir}")
+    iptm_score = "None"
+    if "iptm" in ranking_results:
+        iptm_score = ranking_results['iptm'].get(best_model, None)
+
+    if os.path.exists(f"{result_subdir}/pae_{best_model}.json"):
+        pae_results = json.load(
+            open(f"{result_subdir}/pae_{best_model}.json"))[0]['predicted_aligned_error']
+        pae_mtx = np.array(pae_results)
+
+    if iptm_score == "None":
+        try:
+            check_dict = pickle.load(
+                open(os.path.join(result_subdir, f"result_{best_model}.pkl"), 'rb'))
+        except FileNotFoundError:
+            logging.info(os.path.join(
+                result_subdir, f"result_{best_model}.pkl")+" does not exist. Will search for pkl.gz")
+            check_dict = pickle.load(gzip.open(os.path.join(
+                result_subdir, f"result_{best_model}.pkl.gz"), 'rb'))
+        finally:
+            logging.info(f"finished reading results for the best model.")
+            pae_mtx = check_dict['predicted_aligned_error']
+            iptm_score = check_dict['iptm']
+    return pae_mtx, iptm_score
+
+
+def obtain_seq_lengths(result_subdir: str) -> list:
+    """A function that obtain length of each chain in ranked_0.pdb"""
+    best_result_pdb = os.path.join(result_subdir, "ranked_0.pdb")
+    seq_length = []
+    pdb_parser = PDBParser()
+    sequence_builder = PPBuilder()
+    if not os.path.exists(best_result_pdb):
+        raise FileNotFoundError(
+            f"ranked_0.pdb is not found at {result_subdir}")
+    else:
+        structure = pdb_parser.get_structure("ranked_0", best_result_pdb)
+        seqs = sequence_builder.build_peptides(structure)
+        for seq in seqs:
+            seq_length.append(len(seq.get_sequence()))
+    return seq_length
 
-    output_df = pd.DataFrame()
-    for job in jobs:
-        subdir = os.path.join(pi_score_outputs,job)
-        csv_files = [f for f in os.listdir(subdir) if 'filter_intf_features' in f]
-        pi_score_files = [f for f in os.listdir(subdir) if 'pi_score_' in f]
-        filtered_df = pd.read_csv(os.path.join(subdir,csv_files[0]))
-
-        if filtered_df.shape[0]==0:
-            for column in filtered_df.columns:
-                filtered_df[column] = ["None"]
-            filtered_df['jobs'] = str(job)
-            filtered_df['pi_score'] = "No interface detected"
-        else:
-            with open(os.path.join(subdir,pi_score_files[0]),'r') as f:
-                lines = [l for l in f.readlines() if "#" not in l]
-                if len(lines)>0:
-                    pi_score = pd.read_csv(os.path.join(subdir,pi_score_files[0]))
-                    pi_score['jobs']=str(job)
-                else:
-                    pi_score = pd.DataFrame.from_dict({"pi_score":['SC:  mds: too many atoms']})
-                f.close()
-            filtered_df['jobs'] = str(job)
-            pi_score['interface'] = pi_score['chains']
-            filtered_df=pd.merge(filtered_df,pi_score,on=['jobs','interface'])
-            try:
-                filtered_df=filtered_df.drop(columns=["#PDB","pdb"," pvalue","chains","predicted_class"])
-            except:
-                pass
-
-        output_df = pd.concat([output_df,filtered_df])
-    return output_df
-
-
 
 def main(argv):
     jobs = os.listdir(FLAGS.output_dir)
-    good_jobs = []
-    iptm_ptm = list()
-    iptm = list()
-    mpDockq_scores = list()
     count = 0
+    good_jobs = []
+    output_df = pd.DataFrame()
     for job in jobs:
         logging.info(f"now processing {job}")
-        if os.path.isfile(os.path.join(FLAGS.output_dir,job,'ranking_debug.json')):
-            count=count +1
-            result_subdir = os.path.join(FLAGS.output_dir,job)
-            best_model = json.load(open(os.path.join(result_subdir,"ranking_debug.json"),'rb'))['order'][0]
-            data = json.load(open(os.path.join(result_subdir,"ranking_debug.json"),'rb'))
-            if "iptm" in data.keys() or "iptm+ptm" in data.keys():
+        if os.path.isfile(os.path.join(FLAGS.output_dir, job, 'ranking_debug.json')):
+            pdb_analyser = PDBAnalyser(os.path.join(
+                FLAGS.output_dir, job, "ranked_0.pdb"))
+            count = count + 1
+            result_subdir = os.path.join(FLAGS.output_dir, job)
+            best_model = json.load(
+                open(os.path.join(result_subdir, "ranking_debug.json"), 'r'))['order'][0]
+            data = json.load(
+                open(os.path.join(result_subdir, "ranking_debug.json"), 'r'))
+            if "iptm" in data.keys() and "iptm+ptm" in data.keys():
                 iptm_ptm_score = data['iptm+ptm'][best_model]
-                try:
-                    check_dict = pickle.load(open(os.path.join(result_subdir,f"result_{best_model}.pkl"),'rb'))
-                except FileNotFoundError:
-                    logging.info(os.path.join(result_subdir,f"result_{best_model}.pkl")+" does not exist. Will search for pkl.gz")
-                    check_dict = pickle.load(gzip.open(os.path.join(result_subdir,f"result_{best_model}.pkl.gz"),'rb'))
-                finally:
-                    logging.info(f"finished reading result pickle for the best model.")
-                seqs = check_dict['seqs']
-                iptm_score = check_dict['iptm']
-                pae_mtx = check_dict['predicted_aligned_error']
-                check = examine_inter_pae(pae_mtx,seqs,cutoff=FLAGS.cutoff)
-                mpDockq_score = obtain_mpdockq(os.path.join(FLAGS.output_dir,job))
+                pae_mtx, iptm_score = obtain_pae_and_iptm(
+                    result_subdir, best_model)
+                seq_lengths = obtain_seq_lengths(result_subdir)
+                check = examine_inter_pae(
+                    pae_mtx, seq_lengths, cutoff=FLAGS.cutoff)
+                mpDockq_score, plddt_per_chain = obtain_mpdockq(
+                    os.path.join(FLAGS.output_dir, job))
                 if check:
-                    good_jobs.append(str(job))
-                    iptm_ptm.append(iptm_ptm_score)
-                    iptm.append(iptm_score)
-                    mpDockq_scores.append(mpDockq_score)
-            logging.info(f"done for {job} {count} out of {len(jobs)} finished.")
-    if len(good_jobs) > 0:
-        other_measurements_df=pd.DataFrame.from_dict({
-            "jobs":good_jobs,
-            "iptm_ptm":iptm_ptm,
-            "iptm":iptm,
-            "mpDockQ/pDockQ":mpDockq_scores
-        }) 
-        pi_score_df = run_and_summarise_pi_score(FLAGS.output_dir,good_jobs,FLAGS.surface_thres)
-        pi_score_df=pd.merge(pi_score_df,other_measurements_df,on="jobs")
-        columns = list(pi_score_df.columns.values)
-        columns.pop(columns.index('jobs'))
-        pi_score_df = pi_score_df[['jobs'] + columns]
-        pi_score_df = pi_score_df.sort_values(by='iptm',ascending=False)
-
-        pi_score_df.to_csv(os.path.join(FLAGS.output_dir,"predictions_with_good_interpae.csv"),index=False)
-
+                    good_jobs.append(job)
+                    score_df = pdb_analyser(
+                        pae_mtx, plddt_per_chain)
+                    score_df['jobs']=job
+                    score_df['iptm_ptm'] = iptm_ptm_score
+                    score_df['iptm'] = iptm_score
+                    score_df['pDockQ/mpDockQ'] = mpDockq_score
+                    for i in ['pDockQ/mpDockQ', 'iptm', 'iptm_ptm','jobs']:
+                        score_df.insert(0, i, score_df.pop(i))
+                    output_df = pd.concat([score_df,output_df])
+            logging.info(
+                f"done for {job} {count} out of {len(jobs)} finished.")
+    if len(good_jobs) == 0:
+        logging.info(
+            f"Unfortunately, none of your protein models had at least one PAE on the interface below your cutoff value : {FLAGS.cutoff}.\n Please consider using a larger cutoff.")
     else:
-        logging.info(f"Unfortunately, none of your protein models had at least one PAE on the interface below your cutoff value : {FLAGS.cutoff}.\n Please consider using a larger cutoff.")
+        output_df.to_csv(os.path.join(FLAGS.output_dir,"predictions_with_good_interpae.csv"),index=False)
 
-if __name__ =='__main__':
-    app.run(main)
+if __name__ == '__main__':
+    app.run(main)