Merge pull request #694 from xchem/staging

Promote b/e changes to production
xchem · Nov 27, 2024 · 511734d · 511734d
2 parents 7522d4e + a7e70a3
commit 511734d
Show file tree

Hide file tree

Showing 26 changed files with 1,780 additions and 1,065 deletions.
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -106,7 +106,7 @@ services:
       OIDC_RENEW_ID_TOKEN_EXPIRY_MINUTES: '210'
       # Public target access strings?
       # A comma-separated list of Project titles.
-      PUBLIC_TAS: lb18145-1
+      PUBLIC_TAS: ${PUBLIC_TAS:-lb18145-1}
       # Squonk configuration
       SQUONK2_VERIFY_CERTIFICATES: 'No'
       SQUONK2_UNIT_BILLING_DAY: 3

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -49,6 +49,8 @@ urllib3 = "^2.0.4"
 validators = "^0.20.0"
 django-celery-beat = "^2.6.0"
 django-celery-results = "^2.5.1"
+numpy = "^1.23"
+python-dateutil = "^2"
 
 # Blocked packages...
 #

diff --git a/viewer/cset_upload.py b/viewer/cset_upload.py
@@ -10,6 +10,7 @@
 from typing import Any, Dict, List, Optional, Tuple
 
 import numpy as np
+from dateutil.parser import parse
 from openpyxl.utils import get_column_letter
 
 os.environ.setdefault("DJANGO_SETTINGS_MODULE", "fragalysis.settings")
@@ -18,9 +19,10 @@
 django.setup()
 
 from django.conf import settings
-from django.core.exceptions import MultipleObjectsReturned
+from django.core.exceptions import MultipleObjectsReturned, ValidationError
 from django.core.files.base import ContentFile
 from django.core.files.storage import default_storage
+from django.core.validators import validate_email
 from django.db.models import F
 from rdkit import Chem
 
@@ -44,6 +46,23 @@
 # maximum distance between corresponding atoms in poses
 _DIST_LIMIT = 0.5
 
+# Values treated as "empty" when set_descriptions validates the header
+# molecule's fields.
+# NOTE(review): the check is `value in EMPTY_VALUES`; tuple membership tests
+# identity first and equality second, and NaN never compares equal to itself.
+# The np.nan entry therefore only matches that exact object - a NaN float
+# produced elsewhere does not match any entry (only the literal string 'nan'
+# does). Confirm this is the intended coverage.
+EMPTY_VALUES = (
+    'nan',
+    '',
+    None,
+    np.nan,
+)
+
+
+# Property names that set_descriptions validates on the header molecule
+# (must not be empty; 'submitter_email' must also be a valid address).
+# For these names set_props stores the header molecule's value rather than
+# the per-molecule property.
+HEADER_MOL_FIELDS = (
+    'ref_url',
+    'method',
+    'submitter_name',
+    'submitter_institution',
+    'submitter_email',
+    'generation_date',
+)
+
 
 def dataType(a_str: str) -> str:
     lean_str = a_str.strip()
@@ -269,37 +288,38 @@ def get_site_observation(
 
         return site_obvs
 
-    def create_mol(self, inchi, target, name=None) -> Compound:
+    def create_mol(self, inchi, target, name=None) -> tuple[Compound, str]:
         # check for an existing compound, returning a Compound
 
         sanitized_mol = Chem.MolFromInchi(inchi, sanitize=True)
         Chem.RemoveStereochemistry(sanitized_mol)
         inchi = Chem.inchi.MolToInchi(sanitized_mol)
         inchi_key = Chem.InchiToInchiKey(inchi)
 
+        qs = Compound.objects.filter(
+            computedmolecule__computed_set__target=target,
+        )
+        cpd_number = '1'
         try:
             # NB! Max said there could be thousands of compounds per
             # target so this distinct() here may become a problem
+            cpd = qs.distinct().get(inchi_key=inchi_key)
 
-            # fmt: off
-            cpd = Compound.objects.filter(
-                computedmolecule__computed_set__target=target,
-            ).distinct().get(
-                inchi_key=inchi_key,
-            )
-            # fmt: on
+            # memo to self: I'm not setting cpd_number here, because
+            # it's read from computedmol name
         except Compound.DoesNotExist:
             cpd = Compound(
                 smiles=Chem.MolToSmiles(sanitized_mol),
                 inchi=inchi,
                 inchi_key=inchi_key,
-                current_identifier=name,
+                description=name,
             )
             # This is a new compound.
             cpd.save()
             # This is a new compound.
             # We must now set relationships to the Proposal that it applies to.
             cpd.project_id.add(target.project)
+            cpd_number = str(qs.count() + 1)
         except MultipleObjectsReturned as exc:
             # NB! when processing new uploads, Compound is always
             # fetched by inchi_key, so this shouldn't ever create
@@ -315,29 +335,30 @@ def create_mol(self, inchi, target, name=None) -> Compound:
             )
             raise MultipleObjectsReturned from exc
 
-        return cpd
+        return cpd, cpd_number
 
-    def set_props(self, cpd, props, compound_set) -> List[ScoreDescription]:
+    def set_props(self, cpd, props, score_descriptions) -> List[ScoreDescription]:
+        """Save one score value against ``cpd`` per score description.
+
+        ``props`` is the molecule's own property dict; ``score_descriptions``
+        maps each ScoreDescription to the value found on the header molecule.
+        For names in HEADER_MOL_FIELDS the header value is stored, otherwise
+        the molecule's own property is stored. A non-header property that the
+        molecule does not carry is skipped.
+
+        Returns the ``score_descriptions`` mapping that was passed in.
+        NOTE(review): that is a dict, although the annotation still says List.
+
+        Raises Exception when 'ref_pdb' is missing from ``props``.
+        """
+        # NOTE(review): because of operator precedence the condition below is
+        # `'ref_mols' and ('ref_pdb' not in ...)`, i.e. only 'ref_pdb' is
+        # actually tested; a missing 'ref_mols' is not detected. Left
+        # unchanged here because tightening it could reject uploads that are
+        # accepted today - confirm the intended rule before changing it.
         if 'ref_mols' and 'ref_pdb' not in list(props.keys()):
             raise Exception('ref_mols and ref_pdb not set!')
-        set_obj = ScoreDescription.objects.filter(computed_set=compound_set)
-        assert set_obj
-
-        set_props_list = [s.name for s in set_obj]
-        for key in list(props.keys()):
-            if key in set_props_list not in ['ref_mols', 'ref_pdb', 'original SMILES']:
-                if dataType(str(props[key])) == 'TEXT':
-                    score_value = TextScoreValues()
-                else:
-                    score_value = NumericalScoreValues()
-                score_value.score = ScoreDescription.objects.get(
-                    computed_set=compound_set, name=key
-                )
-                score_value.value = props[key]
-                score_value.compound = cpd
-                score_value.save()
 
-        return set_obj
+        for sd, val in score_descriptions.items():
+            logger.debug('sd: %s', sd)
+            logger.debug('sd.name, val: %s: %s', sd.name, val)
+            is_header_field = sd.name in HEADER_MOL_FIELDS
+            if not is_header_field and sd.name not in props:
+                # The molecule does not carry this property, so there is no
+                # value to record. Indexing props here would raise KeyError;
+                # the previous implementation only visited keys present in
+                # props, so skipping keeps that behaviour.
+                logger.debug('%s is not set on this molecule, skipping', sd.name)
+                continue
+
+            # The molecule's own property decides the value type when it is
+            # present (as before); a header field the molecule does not
+            # repeat falls back to the header value.
+            if dataType(str(props.get(sd.name, val))) == 'TEXT':
+                score_value = TextScoreValues()
+            else:
+                score_value = NumericalScoreValues()
+
+            if is_header_field:
+                score_value.value = val
+            else:
+                score_value.value = props[sd.name]
+
+            score_value.compound = cpd
+            score_value.score = sd
+            score_value.save()
+
+        return score_descriptions
 
     def set_mol(
         self, mol, target, compound_set, filename, zfile=None, zfile_hashvals=None
@@ -354,7 +375,7 @@ def set_mol(
         Chem.RemoveStereochemistry(mol)
         flat_inchi = Chem.inchi.MolToInchi(flattened_copy)
 
-        compound: Compound = self.create_mol(
+        compound, number = self.create_mol(
             inchi, compound_set.target, name=molecule_name
         )
 
@@ -428,7 +449,7 @@ def set_mol(
             suffix = next(alphanumerator(start_from=groups.groups()[2]))  # type: ignore [index]
         else:
             suffix = 'a'
-            number = 1
+            # number = 1
 
         name = f'v{number}{suffix}'
 
@@ -540,25 +561,38 @@ def set_mol(
         return computed_molecule
 
     def get_submission_info(self, description_mol) -> ComputedSetSubmitter:
-        y_m_d = description_mol.GetProp('generation_date').split('-')
+        """Return the ComputedSetSubmitter described by the header molecule.
+
+        Reads 'submitter_name', 'method', 'submitter_email',
+        'submitter_institution' and 'generation_date' from
+        ``description_mol`` and fetches the matching record, creating it when
+        it does not exist yet.
+
+        Raises ValueError when 'generation_date' cannot be parsed as a date.
+        """
+        # Function-scope import so this method does not depend on the
+        # module's import block.
+        import re
+
+        datestring = description_mol.GetProp('generation_date')
+        # Year-first strings (the YYYY-MM-DD form this method used to require)
+        # must not be parsed day-first: with dayfirst=True dateutil reads
+        # them as year-day-month, turning 2024-01-02 into 1 February.
+        year_first = re.match(r'\s*\d{4}\D', datestring) is not None
+        try:
+            date = parse(datestring, dayfirst=not year_first)
+        except (ValueError, OverflowError) as exc:
+            # dateutil raises OverflowError for out-of-range numbers; report
+            # it as the same ValueError so the failure carries a message.
+            msg = f'"{datestring}" is not a valid date'
+            logger.error(msg)
+            raise ValueError(msg) from exc
+
         return ComputedSetSubmitter.objects.get_or_create(
             name=description_mol.GetProp('submitter_name'),
             method=description_mol.GetProp('method'),
             email=description_mol.GetProp('submitter_email'),
             institution=description_mol.GetProp('submitter_institution'),
-            generation_date=datetime.date(int(y_m_d[0]), int(y_m_d[1]), int(y_m_d[2])),
+            generation_date=date,
         )[0]
 
     def process_mol(
-        self, mol, target, compound_set, filename, zfile=None, zfile_hashvals=None
+        self,
+        mol,
+        target,
+        compound_set,
+        filename,
+        score_descriptions,
+        zfile=None,
+        zfile_hashvals=None,
     ) -> List[ScoreDescription]:
+        """Save one molecule with set_mol, then record its scores with set_props.
+
+        ``score_descriptions`` is handed to set_props unchanged, together
+        with the molecule's own properties (``mol.GetPropsAsDict()``).
+        Returns whatever set_props returns.
+        """
         cpd = self.set_mol(mol, target, compound_set, filename, zfile, zfile_hashvals)
         other_props = mol.GetPropsAsDict()
-        return self.set_props(cpd, other_props, compound_set)
+        return self.set_props(cpd, other_props, score_descriptions)
 
     def set_descriptions(
         self, filename, computed_set: ComputedSet
-    ) -> List[Chem.rdchem.Mol]:
+    ) -> tuple[List[Chem.rdchem.Mol], dict[str, ScoreDescription]]:
         suppl = Chem.SDMolSupplier(str(filename))
         description_mol = suppl[0]
 
@@ -577,6 +611,11 @@ def set_descriptions(
         computed_set.save()
 
         description_dict = description_mol.GetPropsAsDict()
+        logger.debug('index mol original values: %s', description_dict)
+        # score descriptions for this upload, doesn't matter if
+        # created or existing
+        score_descriptions = {}
+        errors = []
         for key in description_dict.keys():
             if key in descriptions_needed and key not in [
                 'ref_mols',
@@ -585,13 +624,35 @@ def set_descriptions(
                 'Name',
                 'original SMILES',
             ]:
-                _ = ScoreDescription.objects.get_or_create(
+                description, _ = ScoreDescription.objects.get_or_create(
                     computed_set=computed_set,
                     name=key,
                     description=description_dict[key],
                 )
 
-        return mols
+                value = description_dict[key]
+
+                if key in HEADER_MOL_FIELDS:
+                    if value in EMPTY_VALUES:
+                        msg = f'Empty value for {key} in header molecule'
+                        errors.append(msg)
+                        logger.error(msg)
+                    if key == 'submitter_email':
+                        try:
+                            validate_email(value)
+                        except ValidationError as exc:
+                            msg = f'"{value}" is not a valid email'
+                            logger.error(msg)
+                            errors.append(msg)
+                            raise ValidationError(msg) from exc
+
+                score_descriptions[description] = value
+
+        logger.debug('index mol values: %s', score_descriptions.values())
+        if errors:
+            raise ValueError(errors)
+
+        return mols, score_descriptions
 
     def task(self) -> ComputedSet:
         # Truncate submitted method (lower-case)?
@@ -673,7 +734,7 @@ def task(self) -> ComputedSet:
         # This also sets the submitter and method URL properties of the computed set
         # while also saving it.
         sdf_filename = str(self.sdf_filename)
-        mols_to_process = self.set_descriptions(
+        mols_to_process, score_descriptions = self.set_descriptions(
             filename=sdf_filename, computed_set=computed_set
         )
 
@@ -688,14 +749,21 @@ def task(self) -> ComputedSet:
                 self.target_id,
                 computed_set,
                 sdf_filename,
+                score_descriptions,
                 self.zfile,
                 self.zfile_hashvals,
             )
 
         # move and save the compound set
-        new_filename = f'{settings.MEDIA_ROOT}{settings.COMPUTED_SET_MEDIA_DIRECTORY}/{computed_set.name}.sdf'
+        new_filename = (
+            Path(settings.MEDIA_ROOT)
+            .joinpath(settings.COMPUTED_SET_MEDIA_DIRECTORY)
+            .joinpath(
+                f'{computed_set.name}_upload_{computed_set.md_ordinal}_{Path(sdf_filename).name}'
+            )
+        )
         os.rename(sdf_filename, new_filename)
-        computed_set.submitted_sdf = sdf_filename
+        computed_set.submitted_sdf = Path(sdf_filename).name
         computed_set.written_sdf_filename = new_filename
         computed_set.save()
 

diff --git a/viewer/download_structures.py b/viewer/download_structures.py
@@ -44,7 +44,7 @@
     'apo_desolv_file': ('aligned'),  # SiteObservation: apo_desolv_file
     'bound_file': ('aligned'),  # SiteObservation: bound_file
     'sdf_info': ('aligned'),  # SiteObservation: ligand_mol_file (indirectly)
-    'ligand_mol': ('aligned'),  # SiteObservation: ligand_mol
+    'ligand_sdf': ('aligned'),  # SiteObservation: ligand_sdf
     'ligand_smiles': ('aligned'),  # SiteObservation: ligand_smiles
     'ligand_pdb': ('aligned'),  # SiteObservation: ligand_pdb
     'smiles_info': (''),  # SiteObservation: smiles_info (indirectly)
@@ -87,7 +87,7 @@ class ArchiveFile:
         'diff_file': {},
         'sigmaa_file': {},
         'ligand_pdb': {},
-        'ligand_mol': {},
+        'ligand_sdf': {},
         'ligand_smiles': {},
         # additional ccp4 files, issue 1448
         'event_file_crystallographic': {},
@@ -755,7 +755,7 @@ def _create_structures_dict(site_obvs, protein_params, other_params):
                     'artefacts_file',
                     'pdb_header_file',
                     'ligand_pdb',
-                    'ligand_mol',
+                    'ligand_sdf',
                     'ligand_smiles',
                     'diff_file',
                 ]:
@@ -825,12 +825,12 @@ def _create_structures_dict(site_obvs, protein_params, other_params):
         num_molecules_collected = 0
         num_missing_sd_files = 0
         for so in site_obvs:
-            if so.ligand_mol:
+            if so.ligand_sdf:
                 # There is an SD file (normal)
                 archive_path = str(
                     Path('aligned_files').joinpath(so.code).joinpath(f'{so.code}.sdf')
                 )
-                file_path = str(Path(settings.MEDIA_ROOT).joinpath(so.ligand_mol.name))
+                file_path = str(Path(settings.MEDIA_ROOT).joinpath(so.ligand_sdf.name))
                 # path is ignored when writing sdfs but mandatory field
                 zip_contents['molecules']['sdf_files'].update(
                     {
@@ -845,7 +845,7 @@ def _create_structures_dict(site_obvs, protein_params, other_params):
             else:
                 # No file value (odd).
                 logger.warning(
-                    "SiteObservation record's 'ligand_mol' isn't set (%s)", so
+                    "SiteObservation record's 'ligand_sdf' isn't set (%s)", so
                 )
                 num_missing_sd_files += 1
 
@@ -899,7 +899,7 @@ def get_download_params(request):
         'apo_solv_file': serializer.validated_data['all_aligned_structures'],
         'apo_desolv_file': serializer.validated_data['all_aligned_structures'],
         'ligand_pdb': serializer.validated_data['all_aligned_structures'],
-        'ligand_mol': serializer.validated_data['all_aligned_structures'],
+        'ligand_sdf': serializer.validated_data['all_aligned_structures'],
         'ligand_smiles': serializer.validated_data['all_aligned_structures'],
         'cif_info': serializer.validated_data['cif_info'],
         'mtz_info': serializer.validated_data['mtz_info'],

diff --git a/viewer/migrations/0065_compound_ligand_name.py b/viewer/migrations/0065_compound_ligand_name.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.2.25 on 2024-10-18 09:46
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Add the ``ligand_name`` text field to the ``compound`` model.
+
+    The field may be blank and defaults to 'LIG'.
+    """
+
+    # Must be applied after viewer migration 0064.
+    dependencies = [
+        ('viewer', '0064_auto_20240918_1256'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='compound',
+            name='ligand_name',
+            field=models.TextField(blank=True, default='LIG'),
+        ),
+    ]