renamed panddas_event_files to ligand_binding_events in output metadata

xchem · Apr 25, 2024 · 59c9eaf · 59c9eaf
1 parent b5001cd
commit 59c9eaf
Show file tree

Hide file tree

Showing 7 changed files with 62 additions and 42 deletions.
diff --git a/ALGORITHM-GUIDE.md b/ALGORITHM-GUIDE.md
@@ -31,9 +31,9 @@ In crystallography, the structural context is made up of two parts:
 
 Once these two have been deconvoluted, which is relatively straightforward if the biological assembly is known, the biological binding context of a compound can be identified.
 
-In order to generate the conformer site the next step is to identify which compounds have protein contexts which can be aligned to one another well. 
+In order to generate the conformer site the next step is to identify which compounds have protein contexts which can be aligned to one another well.
 
-The key idea of a conformer site is that this alignment does not have to be direct: two compounds do not need to share any atoms in their local protein context if there exist other compounds whose contexts allow them to be aligned to one another indirectly. 
+The key idea of a conformer site is that this alignment does not have to be direct: two compounds do not need to share any atoms in their local protein context if there exist other compounds whose contexts allow them to be aligned to one another indirectly.
 
 In this way two compounds which bind on the opposite sides of a pocket may still end up in the same conformer site if other compounds span the entire pocket, or at least the part of that pocket between these two compounds.
 
@@ -42,10 +42,9 @@ In this way two compounds which bind on the opposite sides of a pocket may still
 
 The key to identifying the biological context of fragments is linking their binding mode to the sequence that generates the residues.
 
-As however 
+As however
 
 
 
 
 ### Crystalform Sites: Partitioning Structure By Crystallographic Context
-
diff --git a/USER-GUIDE.md b/USER-GUIDE.md
@@ -121,21 +121,31 @@ Also, this status is considered so that crystals in previous upload versions can
 
 #### Non-Diamond datasets
 
-Additional PDB files can be specified as an input of type `manual`. See the end of the above example.
-The dir specified is relative to `base_dir`. In that directory you place the PDB file, and any corresponding MTZ file,
-with the same base name and the .pdb and .mtz extensions. The base name is used for the name of the crystal (and is the
-name that will be used in Fragalysis, so choose sensible names here).  
+Additional structures can be specified as an input of type `manual`. See the end of the above example.
+The dir specified is relative to `base_dir`. In that directory you place the PDB file, the ligand CIF file, and any
+corresponding MTZ file, with the same base name and the .pdb, .cif and .mtz extensions. The base name is used for the name of the crystal (and is the
+name that will be used in Fragalysis, so choose sensible names here).
+
+The PDB and ligand CIF files are mandatory, and can be downloaded from [RCSB](https://www.rcsb.org/). Ligand CIFs can
+be downloaded from [here](https://www.rcsb.org/downloads/ligands). Check that the ligand name is the same in the PDB
+and CIF files. XCA uses the ligand name from the CIF file to identify the ligand in the PDB.
 
 For instance, if your directory contains this:
 
 * 1ABC.pdb
 * 1ABC.mtz
+* 1ABC.cif
 * 5XYZ.pdb
+* 5XYZ.cif
 * random.txt
 
 then 2 crystals will be processed and given the names 1ABC and 5XYZ. The second will not have an MTZ file and the file
 `random.txt` and any subdirectories are ignored.
 
+**NOTE**: currently the ligand name in the PDB MUST be LIG, even if it is something different in the downloaded files.
+So, currently, the ligand in the PDB file must be renamed to LIG (do not rename it in the CIF file). We expect to remove
+this limitation soon.
+
 #### Extra files
 
 There is support for adding arbitrary extra files to the upload. These files are not used by Fragalysis but

diff --git a/src/xchemalign/aligner.py b/src/xchemalign/aligner.py
@@ -597,9 +597,7 @@ def _perform_alignments(self, meta):
             # This is a bit of a hack as the event map file location is generated by LNA even if there is no event map
             # so we need to know whether to actually include it in the metadata.
             # It would be better if LNA only included if it actually existed which would make the checking easier.
-            event_map_dict_list = crystal.get(
-                Constants.META_XTAL_FILES, {}).get(
-                Constants.META_BINDING_EVENT, {})
+            event_map_dict_list = crystal.get(Constants.META_XTAL_FILES, {}).get(Constants.META_BINDING_EVENT, {})
 
             crystal_output[Constants.META_ALIGNED_FILES] = {}
             aligned_output = crystal_output[Constants.META_ALIGNED_FILES]
@@ -630,7 +628,9 @@ def _perform_alignments(self, meta):
                             }
                             # if the event map is present then include it in the output
                             if event_map_present:
-                                aligned_version_output[site_id][Constants.META_AIGNED_EVENT_MAP] = aligned_event_map_path
+                                aligned_version_output[site_id][
+                                    Constants.META_AIGNED_EVENT_MAP
+                                ] = aligned_event_map_path
                             i += 1
 
         ## Add the reference alignments
@@ -754,8 +754,7 @@ def _extract_components(self, crystals, aligner_meta):
                 for k2, v2 in v1[Constants.META_ALIGNED_FILES].items():  # chain
                     for k3, v3 in v2.items():  # ligand
                         for k4, v4 in v3.items():  # version
-
-                            for k5, v5 in v4.items():# conf site
+                            for k5, v5 in v4.items():  # conf site
                                 pdb = v5[Constants.META_AIGNED_STRUCTURE]
                                 self.logger.info("extracting components", k1, k2, k3, k4, k5, pdb)
                                 # pth = self.version_dir / pdb

diff --git a/src/xchemalign/collator.py b/src/xchemalign/collator.py
@@ -16,6 +16,7 @@
 import shutil
 import datetime
 import re
+import traceback
 from distutils import dir_util
 import yaml
 
@@ -182,7 +183,7 @@ def __init__(self, config_file, log_file=None, log_level=0, include_git_info=Fal
                     soakdb_path = utils.find_path(
                         input, Constants.CONFIG_SOAKDB, default=Constants.DEFAULT_SOAKDB_PATH
                     )
-                    panddas_csvs = utils.find_property(input, Constants.META_BINDING_EVENT)
+                    panddas_csvs = utils.find_property(input, Constants.CONFIG_PANDDAS_EVENT_FILES)
                     if panddas_csvs:
                         panddas_paths = [Path(p) for p in panddas_csvs]
                     else:
@@ -419,10 +420,8 @@ def _validate_soakdb_input(self, input, crystals):
                         expanded_files.append(None)
                         self._log_warning("PDB entry {} for {} not defined in SoakDB".format(colname, xtal_name))
                     else:
-                        # print('handling', colname, file)
                         inputpath = utils.make_path_relative(Path(file))
                         full_inputpath = self.base_path / inputpath
-                        # print('generated', full_inputpath)
                         ok = full_inputpath.exists()
                         if ok:
                             num_pdb_files += 1
@@ -900,8 +899,10 @@ def _copy_files(self, meta):
                                 mol = utils.gen_mol_from_cif(str(self.output_path / fdata[1]))
                                 smi = Chem.MolToSmiles(mol)
                                 data_to_add[Constants.META_XTAL_CIF][Constants.META_SMILES] = smi
+                                data_to_add[Constants.META_XTAL_CIF][Constants.META_LIGAND_NAME] = mol.GetProp('_Name')
                             except:
-                                self.logger.warn('failed to generate SMILES for ligand {}'.format(xtal_name))
+                                self.logger.warn('failed to generate ligand data for {}'.format(xtal_name))
+                                traceback.print_exc()
 
                     # copy event maps that differ in SHA from previously known ones
                     unsucessfully_copied_event_maps = {}

diff --git a/src/xchemalign/copier.py b/src/xchemalign/copier.py
@@ -407,7 +407,7 @@ def main():
                     continue
                 input_dirs.append(input.get('dir'))
                 soakdbfiles.append(input.get('soakdb', 'processing/database/soakDBDataFile.sqlite'))
-                panddas_files.append(input.get('panddas_event_files', []))
+                panddas_files.append(input.get(utils.Constants.META_BINDING_EVENT, []))
 
         # check we have at least one input
         if len(input_dirs) == 0:

diff --git a/src/xchemalign/utils.py b/src/xchemalign/utils.py
@@ -60,6 +60,7 @@ class Constants:
     CONFIG_EXCLUDE = 'exclude'
     CONFIG_CODE_PREFIX = 'code_prefix'
     CONFIG_CODE_PREFIX_TOOLTIP = 'code_prefix_tooltip'
+    CONFIG_PANDDAS_EVENT_FILES = "panddas_event_files"
     META_RUN_ON = "run_on"
     META_INPUT_DIRS = "input_dirs"
     META_VERSION_NUM = "version_number"
@@ -84,7 +85,7 @@ class Constants:
     META_XTAL_MTZ = "xtal_mtz"
     META_XTAL_CIF = "ligand_cif"
     META_SMILES = "smiles"
-    META_BINDING_EVENT = "panddas_event_files"
+    META_BINDING_EVENT = "ligand_binding_events"
     META_PANDDAS_MISSING_OK = "panddas_missing_ok"
     META_PROT_MODEL = "model"
     META_PROT_CHAIN = "chain"
@@ -127,6 +128,7 @@ class Constants:
     META_PDB_APO_DESOLV = "pdb_apo_desolv"
     META_LIGAND_MOL = "ligand_mol"
     META_LIGAND_PDB = "ligand_pdb"
+    META_LIGAND_NAME = "ligand_name"
     META_LIGAND_SMILES_STRING = "ligand_smiles_string"
     META_LIGAND_SMILES = "ligand_smiles"
     META_TRANSFORMS = "transforms"
@@ -348,6 +350,7 @@ def gen_mol_from_cif(cif_file):
     if not block:
         print("sole block not found")
         return None
+    comp_ids = block.find_loop('_chem_comp_atom.comp_id')
     atom_ids = block.find_loop('_chem_comp_atom.atom_id')
     atom_symbols = block.find_loop('_chem_comp_atom.type_symbol')
     # coordinates are sometimes called "x" and sometimes "model_Cartn_x" etc.
@@ -367,8 +370,14 @@ def gen_mol_from_cif(cif_file):
         charges = list(block.find_loop('_chem_comp_atom.partial_charge'))
 
     atoms = {}
-    for s, id, px, py, pz, charge in zip(atom_symbols, atom_ids, x, y, z, charges):
+    ligand_name = None
+    for name, s, id, px, py, pz, charge in zip(comp_ids, atom_symbols, atom_ids, x, y, z, charges):
         # sometimes that atom ids are wrapped in double quotes
+        if ligand_name is None:
+            ligand_name = name
+        elif name != ligand_name:
+            print("WARNING: ligand name has changed from {} to {}. Old name will be used.".format(ligand_name, name))
+
         id = strip_quotes(id)
 
         if len(s) == 2:
@@ -400,6 +409,8 @@ def gen_mol_from_cif(cif_file):
     Chem.AssignStereochemistryFrom3D(mol)
     mol = Chem.RemoveAllHs(mol)
 
+    mol.SetProp('_Name', ligand_name)
+
     return mol
 
 

diff --git a/xtalforms_example.md b/xtalforms_example.md
@@ -9,9 +9,9 @@ The purpose of the assemblies yaml is to specify which biological assemblies are
 "0": # The name of the assembly: must be unique
     reference: 5rgs  # The dataset that is the template for the assembly
     biomol: A  # The names of the chains in the "abstract" biomolecule. This is needed because the reference dataset may
-                 # contain only one of the chains, and the others are then generated by some symmetry operation, as 
+                 # contain only one of the chains, and the others are then generated by some symmetry operation, as
                  # defined below
-    chains: A  # The chains and associated symmtery operations used to generate the biomolecule - in this case the 
+    chains: A  # The chains and associated symmetry operations used to generate the biomolecule - in this case the
                # symmetry operation (the identity (x,y,z)) can be omitted
   ```
 
@@ -22,9 +22,9 @@ We could also define a dimer:
 "1": # The name of the assembly: must be unique
     reference: Mpro-IBM0045  # The dataset that is the template for the assembly
     biomol: A,B  # The names of the chains in the "abstract" biomolecule. This is needed because the reference dataset may
-                 # contain only one of the chains, and the others are then generated by some symmetry operation, as 
+                 # contain only one of the chains, and the others are then generated by some symmetry operation, as
                  # defined below
-    chains: A,A(-x,y,-z)  # The chains and associated symmtery operations used to generate the biomolecule. Here the B 
+    chains: A,A(-x,y,-z)  # The chains and associated symmetry operations used to generate the biomolecule. Here the B
                           # chain of the biomolecule is generated by reflections in the b and z axis.
   ```
 
@@ -52,7 +52,7 @@ The simplest xtalform has a single chain in the biological assembly:
       # to the reference dataset
       "0": # The name of the assembly -within this xtalform-: must be unique within this xtalform
         "assembly": "0"  # The name of the assembly in the assemblies.yaml to match this assembly to
-        "chains": A(x,y,z)  # The name of the chain in datasets from this crystalform and the symmetry operation 
+        "chains": A(x,y,z)  # The name of the chain in datasets from this crystalform and the symmetry operation
                               # which generates the corresponding (in index) chain in the reference assembly
   ```
 
@@ -61,12 +61,12 @@ given in the pdb, and the other is generated by a crystallographic symmetry oper
 
 ```yaml
 # crystalforms.yaml
-"0": 
-    "reference": 
-    "assemblies": 
-      "0": 
+"0":
+    "reference":
+    "assemblies":
+      "0":
         "assembly": "1" # Now the assembly is the dimer, rather than the monomer!
-        "chains": A,A(-x,y,-z) # Now there is a second generator: this creates the second chain, B, 
+        "chains": A,A(-x,y,-z) # Now there is a second generator: this creates the second chain, B,
                                         # by applying a symmetry operation to chain A. Notice the identity operation
                                         # can be omitted!
 
@@ -77,16 +77,16 @@ this time the assembly is given in the file so there is no need for a non-identi
 
 ```yaml
 # crystalforms.yaml
-"0": 
+"0":
     "reference": 5rgs
-    "assemblies": 
-      "0": 
+    "assemblies":
+      "0":
         "assembly": "0"  
         "chains": A,A(-x,y,-z)
-"1": 
+"1":
     "reference": Mpro-J0162
-    "assemblies": 
-      "0": 
+    "assemblies":
+      "0":
         "assembly": "1"  
         "chains": A,B # Notice this time the B chain of the dimer is generated by the identity operation applied to the
                         # given B chain, rather than a symmetry operation duplicating the A chain
@@ -99,13 +99,13 @@ in which two dimers are present.
 # crystalforms.yaml
 ...
 
-"3": 
+"3":
     "reference": 8dz9
-    "assemblies": 
-      "0": 
+    "assemblies":
+      "0":
         "assembly": "1"  
-        "chains": "A,B" 
-      "1": 
+        "chains": "A,B"
+      "1":
         "assembly": "1"  
         "chains": C,D # Now chains C and D in datasets from this crystalform are matched to the Dimer's A and B chains
 ```