From f84f389768cb65b7a5bb7fc29926b1d125cb422d Mon Sep 17 00:00:00 2001 From: Tim Dudgeon Date: Thu, 16 Nov 2023 17:19:57 +0000 Subject: [PATCH] general fixes --- src/xchemalign/aligner.py | 6 +++-- src/xchemalign/collator.py | 18 ++++++++------ src/xchemalign/copier.py | 15 +++++++++++ src/xchemalign/pdb_xtal.py | 51 +++++++++++++++++++------------------- 4 files changed, 56 insertions(+), 34 deletions(-) diff --git a/src/xchemalign/aligner.py b/src/xchemalign/aligner.py index 28f0468..6e0f2ef 100644 --- a/src/xchemalign/aligner.py +++ b/src/xchemalign/aligner.py @@ -12,6 +12,7 @@ import argparse import os +import logging import shutil from pathlib import Path @@ -217,7 +218,7 @@ def run(self): def _write_output(self, collator_dict, aligner_dict): # keep a copy of the xtaforms and assemblies configs self._copy_file_to_version_dir(self.xtalforms_file) - # self._copy_file_to_version_dir(self.assemblies_file) + self._copy_file_to_version_dir(self.assemblies_file) collator_dict[Constants.META_XTALFORMS] = aligner_dict[Constants.META_XTALFORMS] collator_dict[Constants.META_CONFORMER_SITES] = aligner_dict[Constants.META_CONFORMER_SITES] @@ -544,8 +545,10 @@ def _perform_alignments(self, meta): new_meta[Constants.META_XTALS] = {} for dtag, crystal in crystals.items(): + self.logger.info('looking at', dtag) # Skip if no output for this dataset if dtag not in fs_model.alignments: + self.logger.warn('skipping {} as not in alignments'.format(dtag)) continue new_meta[Constants.META_XTALS][dtag] = {} @@ -574,7 +577,6 @@ def _perform_alignments(self, meta): Constants.META_AIGNED_EVENT_MAP: aligned_event_map_path, Constants.META_AIGNED_X_MAP: aligned_xmap_path, Constants.META_AIGNED_DIFF_MAP: aligned_diff_map_path, - } ## Add the reference alignments diff --git a/src/xchemalign/collator.py b/src/xchemalign/collator.py index 8ddc120..71f5dff 100644 --- a/src/xchemalign/collator.py +++ b/src/xchemalign/collator.py @@ -151,8 +151,14 @@ def __init__(self, config_file, logger=None): self.inputs = [] inputs = utils.find_property(config, Constants.CONFIG_INPUTS) self.logger.info("found {} inputs".format(len(inputs))) + if inputs: for input in inputs: + # Determine which datasets to exclude + excluded_datasets = utils.find_property(input, Constants.CONFIG_EXCLUDE) + if not excluded_datasets: + excluded_datasets = [] + input_path = utils.find_path(input, Constants.CONFIG_DIR) type = utils.find_property(input, Constants.CONFIG_TYPE) if type == Constants.CONFIG_TYPE_MODEL_BUILDING: @@ -165,11 +171,6 @@ def __init__(self, config_file, logger=None): else: panddas_paths = [] - # Determine which datasets to exclude - excluded_datasets = utils.find_property(input, Constants.CONFIG_EXCLUDE) - if not excluded_datasets: - excluded_datasets = [] - self.logger.info("adding input", input_path) self.inputs.append( Input( @@ -342,7 +343,10 @@ def _validate_soakdb_input(self, input, crystals): else: expanded_files.append(None) missing_files += 1 - self._log_warning("PDB file for {} not found: {}".format(xtal_name, full_inputpath)) + self._log_warning( + "PDB file for {} not found: {}. Skipping entry".format(xtal_name, full_inputpath) + ) + continue # if we have a PDB file then continue to look for the others colname = Constants.SOAKDB_COL_MTZ @@ -852,7 +856,7 @@ def _write_metadata(self, meta, all_xtals, new_xtals): def _copy_config(self): f = shutil.copy2(self.config_file, self.output_path / self.version_dir / 'config.yaml') if not f: - print("Failed to copy config file to {}".format((self.output_path / self.version_dir))) + self.logger.warn("Failed to copy config file to {}".format((self.output_path / self.version_dir))) return False return True diff --git a/src/xchemalign/copier.py b/src/xchemalign/copier.py index e107023..f83820b 100644 --- a/src/xchemalign/copier.py +++ b/src/xchemalign/copier.py @@ -138,6 +138,14 @@ def validate(self): return len(self.errors), len(self.warnings) + def check_path(self, path, expected_path): + try: + relp = path.relative_to(expected_path) + return True + except ValueError as ve: + self.logger.warn('unexpected path for file:', path) + return False + def copy_files(self): if self.base_path and self.input_path.is_absolute(): self.logger.warn("INFO: making input_path relative as a base_path is specified") @@ -160,12 +168,19 @@ def copy_files(self): for index, row in df.iterrows(): count += 1 xtal_name = row["CrystalName"] + status_str = str(row[Constants.SOAKDB_COL_REFINEMENT_OUTCOME]) + if status_str.startswith("7"): + self.logger.info("ignoring {} as status is 7".format(xtal_name)) + continue + xtal_dir_path = collator.generate_xtal_dir(self.input_path, xtal_name) self.logger.info("processing {} {}".format(count, xtal_name)) + expected_path = self.base_path / self.input_path / Constants.DEFAULT_MODEL_BUILDING_DIR file = row["RefinementPDB_latest"] if file: path = Path(file) + self.check_path(path, expected_path) ok = self.copy_file(path, xtal_dir_path) if ok: num_files += 1 diff --git a/src/xchemalign/pdb_xtal.py b/src/xchemalign/pdb_xtal.py index cefe22d..2e7d0ef 100644 --- a/src/xchemalign/pdb_xtal.py +++ b/src/xchemalign/pdb_xtal.py @@ -226,31 +226,32 @@ def create_ligands(self, chain: str, res_id, cif_file: str): return mol - def extract_sequences(self): - if not self.apo_desolv_file: - self.create_apo_solv_desolv() - pdb_file = open(self.output_dir / (self.filebase + "_apo-desolv.pdb"), "rt") - lines = pdb_file.readlines() - curr_chain = None - curr_resno = 0 - curr_seq = None - seqs = [] - for line in lines: - if line.startswith('ATOM'): - alt = line[16].strip() - chain = line[21].strip() - code = line[17:20].strip() - resno = int(line[22:26].strip()) - if chain != curr_chain: - curr_chain = chain - curr_seq = ProteinSeq(chain, [], start=int(resno)) - seqs.append(curr_seq) - if resno != curr_resno: - for i in range(resno - curr_resno - 1): - curr_seq.seq.append('UNK') - curr_resno = resno - curr_seq.seq.append(code) - return seqs + def extract_sequences(self, pdb_file=None): + if not pdb_file: + pdb_file = self.pdbfile + + with open(pdb_file, "rt") as pdb: + lines = pdb.readlines() + curr_chain = None + curr_resno = 0 + curr_seq = None + seqs = [] + for line in lines: + if line.startswith('ATOM'): + alt = line[16].strip() + chain = line[21].strip() + code = line[17:20].strip() + resno = int(line[22:26].strip()) + if chain != curr_chain: + curr_chain = chain + curr_seq = ProteinSeq(chain, [], start=int(resno)) + seqs.append(curr_seq) + if resno != curr_resno: + for i in range(resno - curr_resno - 1): + curr_seq.seq.append('UNK') + curr_resno = resno + curr_seq.seq.append(code) + return seqs class ProteinSeq: