From f84f389768cb65b7a5bb7fc29926b1d125cb422d Mon Sep 17 00:00:00 2001
From: Tim Dudgeon <tdudgeon@informaticsmatters.com>
Date: Thu, 16 Nov 2023 17:19:57 +0000
Subject: [PATCH] general fixes: logging, excluded datasets, path checks, sequence extraction

---
 src/xchemalign/aligner.py  |  6 +++--
 src/xchemalign/collator.py | 18 ++++++++------
 src/xchemalign/copier.py   | 15 +++++++++++
 src/xchemalign/pdb_xtal.py | 51 +++++++++++++++++++-------------------
 4 files changed, 56 insertions(+), 34 deletions(-)

diff --git a/src/xchemalign/aligner.py b/src/xchemalign/aligner.py
index 28f0468..6e0f2ef 100644
--- a/src/xchemalign/aligner.py
+++ b/src/xchemalign/aligner.py
@@ -12,6 +12,7 @@
 
 import argparse
 import os
+import logging
 import shutil
 from pathlib import Path
 
@@ -217,7 +218,7 @@ def run(self):
     def _write_output(self, collator_dict, aligner_dict):
         # keep a copy of the xtaforms and assemblies configs
         self._copy_file_to_version_dir(self.xtalforms_file)
-        # self._copy_file_to_version_dir(self.assemblies_file)
+        self._copy_file_to_version_dir(self.assemblies_file)
 
         collator_dict[Constants.META_XTALFORMS] = aligner_dict[Constants.META_XTALFORMS]
         collator_dict[Constants.META_CONFORMER_SITES] = aligner_dict[Constants.META_CONFORMER_SITES]
@@ -544,8 +545,10 @@ def _perform_alignments(self, meta):
 
         new_meta[Constants.META_XTALS] = {}
         for dtag, crystal in crystals.items():
+            self.logger.info('looking at', dtag)
             # Skip if no output for this dataset
             if dtag not in fs_model.alignments:
+                self.logger.warn('skipping {} as not in alignments'.format(dtag))
                 continue
 
             new_meta[Constants.META_XTALS][dtag] = {}
@@ -574,7 +577,6 @@ def _perform_alignments(self, meta):
                             Constants.META_AIGNED_EVENT_MAP: aligned_event_map_path,
                             Constants.META_AIGNED_X_MAP: aligned_xmap_path,
                             Constants.META_AIGNED_DIFF_MAP: aligned_diff_map_path,
-
                         }
 
         ## Add the reference alignments
diff --git a/src/xchemalign/collator.py b/src/xchemalign/collator.py
index 8ddc120..71f5dff 100644
--- a/src/xchemalign/collator.py
+++ b/src/xchemalign/collator.py
@@ -151,8 +151,14 @@ def __init__(self, config_file, logger=None):
         self.inputs = []
         inputs = utils.find_property(config, Constants.CONFIG_INPUTS)
         self.logger.info("found {} inputs".format(len(inputs)))
+
         if inputs:
             for input in inputs:
+                # Determine which datasets to exclude
+                excluded_datasets = utils.find_property(input, Constants.CONFIG_EXCLUDE)
+                if not excluded_datasets:
+                    excluded_datasets = []
+
                 input_path = utils.find_path(input, Constants.CONFIG_DIR)
                 type = utils.find_property(input, Constants.CONFIG_TYPE)
                 if type == Constants.CONFIG_TYPE_MODEL_BUILDING:
@@ -165,11 +171,6 @@ def __init__(self, config_file, logger=None):
                     else:
                         panddas_paths = []
 
-                    # Determine which datasets to exclude
-                    excluded_datasets = utils.find_property(input, Constants.CONFIG_EXCLUDE)
-                    if not excluded_datasets:
-                        excluded_datasets = []
-
                     self.logger.info("adding input", input_path)
                     self.inputs.append(
                         Input(
@@ -342,7 +343,10 @@ def _validate_soakdb_input(self, input, crystals):
                         else:
                             expanded_files.append(None)
                             missing_files += 1
-                            self._log_warning("PDB file for {} not found: {}".format(xtal_name, full_inputpath))
+                            self._log_warning(
+                                "PDB file for {} not found: {}. Skipping entry".format(xtal_name, full_inputpath)
+                            )
+                            continue
 
                         # if we have a PDB file then continue to look for the others
                         colname = Constants.SOAKDB_COL_MTZ
@@ -852,7 +856,7 @@ def _write_metadata(self, meta, all_xtals, new_xtals):
     def _copy_config(self):
         f = shutil.copy2(self.config_file, self.output_path / self.version_dir / 'config.yaml')
         if not f:
-            print("Failed to copy config file to {}".format((self.output_path / self.version_dir)))
+            self.logger.warn("Failed to copy config file to {}".format((self.output_path / self.version_dir)))
             return False
         return True
 
diff --git a/src/xchemalign/copier.py b/src/xchemalign/copier.py
index e107023..f83820b 100644
--- a/src/xchemalign/copier.py
+++ b/src/xchemalign/copier.py
@@ -138,6 +138,15 @@ def validate(self):
 
         return len(self.errors), len(self.warnings)
 
+    def check_path(self, path, expected_path):
+        """Return True if path is relative to expected_path; otherwise log a warning and return False."""
+        try:
+            path.relative_to(expected_path)
+        except ValueError:
+            self.logger.warn('unexpected path for file:', path)
+            return False
+        return True
+
     def copy_files(self):
         if self.base_path and self.input_path.is_absolute():
             self.logger.warn("INFO: making input_path relative as a base_path is specified")
@@ -160,12 +169,19 @@ def copy_files(self):
         for index, row in df.iterrows():
             count += 1
             xtal_name = row["CrystalName"]
+            status_str = str(row[Constants.SOAKDB_COL_REFINEMENT_OUTCOME])
+            if status_str.startswith("7"):
+                self.logger.info("ignoring {} as status is 7".format(xtal_name))
+                continue
+
             xtal_dir_path = collator.generate_xtal_dir(self.input_path, xtal_name)
             self.logger.info("processing {} {}".format(count, xtal_name))
+            expected_path = self.base_path / self.input_path / Constants.DEFAULT_MODEL_BUILDING_DIR
 
             file = row["RefinementPDB_latest"]
             if file:
                 path = Path(file)
+                self.check_path(path, expected_path)
                 ok = self.copy_file(path, xtal_dir_path)
                 if ok:
                     num_files += 1
diff --git a/src/xchemalign/pdb_xtal.py b/src/xchemalign/pdb_xtal.py
index cefe22d..2e7d0ef 100644
--- a/src/xchemalign/pdb_xtal.py
+++ b/src/xchemalign/pdb_xtal.py
@@ -226,31 +226,45 @@ def create_ligands(self, chain: str, res_id, cif_file: str):
 
         return mol
 
-    def extract_sequences(self):
-        if not self.apo_desolv_file:
-            self.create_apo_solv_desolv()
-        pdb_file = open(self.output_dir / (self.filebase + "_apo-desolv.pdb"), "rt")
-        lines = pdb_file.readlines()
-        curr_chain = None
-        curr_resno = 0
-        curr_seq = None
-        seqs = []
-        for line in lines:
-            if line.startswith('ATOM'):
-                alt = line[16].strip()
-                chain = line[21].strip()
-                code = line[17:20].strip()
-                resno = int(line[22:26].strip())
-                if chain != curr_chain:
-                    curr_chain = chain
-                    curr_seq = ProteinSeq(chain, [], start=int(resno))
-                    seqs.append(curr_seq)
-                if resno != curr_resno:
-                    for i in range(resno - curr_resno - 1):
-                        curr_seq.seq.append('UNK')
-                    curr_resno = resno
-                    curr_seq.seq.append(code)
-        return seqs
+    def extract_sequences(self, pdb_file=None):
+        """
+        Read the residue sequence of each chain from the ATOM records of a PDB file.
+
+        Residue numbers missing from a chain (counting from 1) are represented by 'UNK'.
+
+        :param pdb_file: PDB file to read; self.pdbfile is used when this is not specified
+        :return: list of ProteinSeq, a new one being started whenever the chain ID changes
+        """
+        if not pdb_file:
+            pdb_file = self.pdbfile
+
+        curr_chain = None
+        curr_resno = 0
+        curr_seq = None
+        seqs = []
+        with open(pdb_file, "rt") as pdb:
+            for line in pdb:
+                if not line.startswith('ATOM'):
+                    continue
+                chain = line[21].strip()
+                code = line[17:20].strip()
+                resno = int(line[22:26].strip())
+                new_chain = chain != curr_chain
+                if new_chain:
+                    curr_chain = chain
+                    # residue numbering restarts with each chain, so the position reached in the
+                    # previous chain must not be used to detect new residues or to size gaps
+                    curr_resno = 0
+                    # NOTE(review): seq is padded from residue 1 yet start is the first residue seen;
+                    # confirm that this is what ProteinSeq expects
+                    curr_seq = ProteinSeq(chain, [], start=resno)
+                    seqs.append(curr_seq)
+                if new_chain or resno != curr_resno:
+                    # fill any gap in the numbering (a negative count adds nothing)
+                    curr_seq.seq.extend(['UNK'] * (resno - curr_resno - 1))
+                    curr_resno = resno
+                    curr_seq.seq.append(code)
+        return seqs
 
 
 class ProteinSeq: