Commit

Merge branch 'main' into patch-1
DimaMolod authored Jan 20, 2025
2 parents 769cff5 + 254794b commit bea2d79
Showing 5 changed files with 65 additions and 46 deletions.
8 changes: 4 additions & 4 deletions .gitmodules
@@ -1,20 +1,20 @@
[submodule "alphafold"]
path = alphafold
-url = git@github.com:KosinskiLab/alphafold.git
+url = https://github.com/KosinskiLab/alphafold.git
branch = main
[submodule "alphapulldown/analysis_pipeline/af2plots"]
path = alphapulldown/analysis_pipeline/af2plots
url = https://gitlab.com/gchojnowski/af2plots.git
branch = main
[submodule "AlphaLink2"]
path = AlphaLink2
-url = git@github.com:KosinskiLab/AlphaLink2.git
+url = https://github.com/KosinskiLab/AlphaLink2.git
branch = main
[submodule "ColabFold"]
path = ColabFold
-url = git@github.com:sokrypton/ColabFold.git
+url = https://github.com/sokrypton/ColabFold.git
branch = main
[submodule "alphafold3"]
path = alphafold3
-url = git@github.com:google-deepmind/alphafold3.git
+url = https://github.com/google-deepmind/alphafold3.git
branch = main
4 changes: 3 additions & 1 deletion README.md
@@ -98,7 +98,9 @@ AlphaPulldown is a customized implementation of [AlphaFold-Multimer](https://git

AlphaPulldown can be used in two ways: either by a two-step pipeline made of **python scripts**, or by a **Snakemake pipeline** as a whole. For details on using the Snakemake pipeline, please refer to the separate GitHub [**repository**](https://github.com/KosinskiLab/AlphaPulldownSnakemake).
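In practice, the two-step pipeline amounts to two script invocations: one to build the sequence features, one to run the predictions. A minimal sketch follows; the flag names are illustrative assumptions drawn from the AlphaPulldown documentation, not part of this commit.

```python
# Hedged sketch of the two-step pipeline; flag names are assumptions --
# check the AlphaPulldown docs for the authoritative interface.
import subprocess

# Step 1: compute MSA/template features for every input sequence.
subprocess.run(
    [
        "create_individual_features.py",
        "--fasta_paths=baits.fasta,candidates.fasta",
        "--data_dir=/path/to/alphafold_databases",
        "--output_dir=features",
        "--max_template_date=2025-01-20",
    ],
    check=True,
)

# Step 2: predict structures for every bait-candidate pair.
subprocess.run(
    [
        "run_multimer_jobs.py",
        "--mode=pulldown",
        "--monomer_objects_dir=features",
        "--protein_lists=baits.txt,candidates.txt",
        "--output_path=predictions",
    ],
    check=True,
)
```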

-To enable faster usage and avoid redundant feature recalculations, we have developed a public database containing precomputed features for all major model organisms, available for download. For more details, [click here](https://github.com/KosinskiLab/AlphaPulldown/blob/main/README.md#features-database).
+To enable faster usage and avoid redundant feature recalculations, we have developed a [public database](https://alphapulldown.s3.embl.de/index.html) containing precomputed features for all major model organisms, available for download. You can check the full list and download individual features at https://alphapulldown.s3.embl.de/index.html or https://s3.embl.de/alphapulldown/index.html.

+For more details, [click here](https://github.com/KosinskiLab/AlphaPulldown/blob/main/README.md#features-database).
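Because the database is exposed as a plain S3 index, a single feature file can also be fetched programmatically. A minimal sketch, assuming a hypothetical object key; browse the index pages above for the real paths:

```python
# Hedged sketch: download one precomputed feature pickle from the public
# database. The object key is hypothetical; look it up in the index first.
import urllib.request

BASE = "https://alphapulldown.s3.embl.de"
key = "input_features/Homo_sapiens/P04637.pkl"  # hypothetical example key
urllib.request.urlretrieve(f"{BASE}/{key}", "P04637.pkl")
```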
## Overview

<picture>
2 changes: 1 addition & 1 deletion alphafold3
Submodule alphafold3 updated 47 files
+2 −2 README.md
+6 −2 docker/Dockerfile
+231 −77 docs/input.md
+51 −15 docs/installation.md
+9 −30 docs/known_issues.md
+32 −2 docs/output.md
+25 −12 docs/performance.md
+97 −63 run_alphafold.py
+3 −3 run_alphafold_data_test.py
+21 −9 run_alphafold_test.py
+38 −0 src/alphafold3/.github/workflows/ci.yaml
+437 −152 src/alphafold3/common/folding_input.py
+16 −8 src/alphafold3/constants/chemical_components.py
+6 −1 src/alphafold3/data/featurisation.py
+1 −1 src/alphafold3/data/parsers.py
+24 −11 src/alphafold3/data/pipeline.py
+3 −1 src/alphafold3/data/templates.py
+22 −0 src/alphafold3/data/tools/rdkit_utils.py
+7 −6 src/alphafold3/jax/gated_linear_unit/matmul_ext.py
+10 −40 src/alphafold3/model/atom_layout/atom_layout.py
+0 −62 src/alphafold3/model/components/base_model.py
+4 −4 src/alphafold3/model/confidence_types.py
+12 −8 src/alphafold3/model/confidences.py
+2 −4 src/alphafold3/model/data3.py
+117 −106 src/alphafold3/model/features.py
+57 −344 src/alphafold3/model/model.py
+1 −1 src/alphafold3/model/model_config.py
+1 −1 src/alphafold3/model/network/atom_cross_attention.py
+2 −2 src/alphafold3/model/network/confidence_head.py
+3 −3 src/alphafold3/model/network/diffusion_head.py
+0 −0 src/alphafold3/model/network/diffusion_transformer.py
+0 −0 src/alphafold3/model/network/distogram_head.py
+347 −0 src/alphafold3/model/network/evoformer.py
+0 −0 src/alphafold3/model/network/featurization.py
+1 −1 src/alphafold3/model/network/modules.py
+1 −1 src/alphafold3/model/network/template_modules.py
+4 −4 src/alphafold3/model/pipeline/pipeline.py
+13 −3 src/alphafold3/model/post_processing.py
+ src/alphafold3/test_data/alphafold_run_outputs/run_alphafold_test_output_bucket_1024.pkl
+ src/alphafold3/test_data/alphafold_run_outputs/run_alphafold_test_output_bucket_default.pkl
+4 −4 src/alphafold3/test_data/featurised_example.json
+ src/alphafold3/test_data/featurised_example.pkl
+562 −0 src/alphafold3/test_data/miniature_databases/pdb_mmcif/5y2e.cif
+3,293 −0 src/alphafold3/test_data/miniature_databases/pdb_mmcif/6s61.cif
+2,467 −0 src/alphafold3/test_data/miniature_databases/pdb_mmcif/6ydw.cif
+893 −0 src/alphafold3/test_data/miniature_databases/pdb_mmcif/7rye.cif
+2 −1 src/alphafold3/test_data/model_config.json
3 changes: 2 additions & 1 deletion alphapulldown/scripts/run_multimer_jobs.py
@@ -93,7 +93,7 @@ def main(argv):
"--path_to_mmt": FLAGS.path_to_mmt,
"--compress_result_pickles": FLAGS.compress_result_pickles,
"--remove_result_pickles": FLAGS.remove_result_pickles,
"--remove_keys_from_pickles" : FLAGS.remove_keys_from_pickles,
"--remove_keys_from_pickles": FLAGS.remove_keys_from_pickles,
"--use_ap_style": True,
"--use_gpu_relax": FLAGS.use_gpu_relax,
"--protein_delimiter": FLAGS.protein_delimiter,
@@ -139,6 +139,7 @@ def main(argv):
command = base_command.copy()
for arg, value in command_args.items():
command.extend([str(arg), str(value)])
logging.info(f"command: {command}")
subprocess.run(" ".join(command), check=True, shell=True)


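A side note on the line this hunk adds: the command is flattened with `" ".join(command)` and executed with `shell=True`, so any argument containing a space would be split by the shell. Below is a hedged sketch of a safer variant using `shlex.join` — a suggestion under that assumption, not what the commit implements.

```python
# Sketch: log the assembled command, then run it through the shell.
# shlex.join quotes each argument, so a path with a space survives
# shell=True; plain " ".join would split it. "echo" stands in for the
# real prediction script here.
import logging
import shlex
import subprocess

command = ["echo", "predicting", "/tmp/my run"]
logging.info("command: %s", shlex.join(command))
subprocess.run(shlex.join(command), check=True, shell=True)
```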
94 changes: 55 additions & 39 deletions test/test_post_prediction.py
@@ -12,11 +12,8 @@
class TestPostPrediction(parameterized.TestCase):
def setUp(self) -> None:
super().setUp()
-# Get path of the alphapulldown module
parent_dir = join(dirname(dirname(abspath(__file__))))
-# Join the path with the script name
self.input_dir = join(parent_dir, "test/test_data/predictions")
-# Set logging level to INFO
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

@parameterized.parameters(
@@ -40,71 +37,90 @@ def setUp(self) -> None:
def test_files(self, prediction_dir, compress_pickles, remove_pickles, remove_keys):
temp_dir = tempfile.TemporaryDirectory()
try:
logging.info(f"Running test for prediction_dir='{prediction_dir}', compress_pickles={compress_pickles}, remove_pickles={remove_pickles}, remove_keys={remove_keys}")
logging.info(f"Running test for prediction_dir='{prediction_dir}', "
f"compress_pickles={compress_pickles}, remove_pickles={remove_pickles}, remove_keys={remove_keys}")
temp_dir_path = temp_dir.name
# Copy the files to the temporary directory
shutil.copytree(join(self.input_dir, prediction_dir), join(temp_dir_path, prediction_dir))

# Remove existing gz files
-gz_files = [f for f in os.listdir(join(temp_dir_path, prediction_dir)) if f.endswith('.gz')]
-for f in gz_files:
-    os.remove(join(temp_dir_path, prediction_dir, f))
-# Run the postprocessing function
-post_prediction_process(join(temp_dir_path, prediction_dir), compress_pickles, remove_pickles, remove_keys)
+gz_files_existing = [f for f in os.listdir(join(temp_dir_path, prediction_dir)) if f.endswith('.gz')]
+for f_ in gz_files_existing:
+    os.remove(join(temp_dir_path, prediction_dir, f_))

-# Get the best model from ranking_debug.json
+# Run the postprocessing
+post_prediction_process(join(temp_dir_path, prediction_dir),
+                        compress_pickles,
+                        remove_pickles,
+                        remove_keys)

+# Identify the best model
with open(join(temp_dir_path, prediction_dir, 'ranking_debug.json')) as f:
best_model = json.load(f)['order'][0]

# Define the expected best result pickle path
best_result_pickle = join(temp_dir_path, prediction_dir, f"result_{best_model}.pkl")

-# Check if files are removed and/or compressed based on the parameters
+# Gather .pkl and .gz files
pickle_files = [f for f in os.listdir(join(temp_dir_path, prediction_dir)) if f.endswith('.pkl')]
gz_files = [f for f in os.listdir(join(temp_dir_path, prediction_dir)) if f.endswith('.gz')]

+# Check if specified keys exist or were removed
if remove_keys:
-# Ensure specified keys are removed from the pickle files
-for pickle_file in pickle_files:
-    with open(join(temp_dir_path, prediction_dir, pickle_file), 'rb') as f:
+for pf in pickle_files:
+    with open(join(temp_dir_path, prediction_dir, pf), 'rb') as f:
data = pickle.load(f)
for key in ['aligned_confidence_probs', 'distogram', 'masked_msa']:
self.assertNotIn(key, data, f"Key {key} was not removed from {pickle_file}")
self.assertNotIn(key, data, f"Key '{key}' was not removed from {pf}")
else:
# If we're not removing keys, verify they still exist in the pickle
for pf in pickle_files:
with open(join(temp_dir_path, prediction_dir, pf), 'rb') as f:
data = pickle.load(f)
for key in ['aligned_confidence_probs', 'distogram', 'masked_msa']:
self.assertIn(key, data, f"Key '{key}' was unexpectedly removed from {pf}")

+# Now check file counts / compressions
if not compress_pickles and not remove_pickles:
-# All pickle files should be present, no gz files
logging.info("Checking condition: not compress_pickles and not remove_pickles")
self.assertEqual(len(pickle_files), 5, f"Expected 5 pickle files, found {len(pickle_files)}.")
self.assertEqual(len(gz_files), 0, f"Expected 0 gz files, found {len(gz_files)}.")
# Expect all .pkl files (5 in your scenario), no .gz
self.assertEqual(len(pickle_files), 5,
f"Expected 5 pickle files, found {len(pickle_files)}.")
self.assertEqual(len(gz_files), 0,
f"Expected 0 gz files, found {len(gz_files)}.")

if compress_pickles and not remove_pickles:
-# No pickle files should be present, each compressed separately
logging.info("Checking condition: compress_pickles and not remove_pickles")
self.assertEqual(len(pickle_files), 0, f"Expected 0 pickle files, found {len(pickle_files)}.")
self.assertEqual(len(gz_files), 5, f"Expected 5 gz files, found {len(gz_files)}.")
# Expect 0 .pkl files, all compressed (5)
self.assertEqual(len(pickle_files), 0,
f"Expected 0 pickle files, found {len(pickle_files)}.")
self.assertEqual(len(gz_files), 5,
f"Expected 5 gz files, found {len(gz_files)}.")
# Validate that gz files are readable
for gz_file in gz_files:
with gzip.open(join(temp_dir_path, prediction_dir, gz_file), 'rb') as f:
-f.read(1)  # Ensure it's a valid gzip file
+f.read(1)

if not compress_pickles and remove_pickles:
-# Only the best result pickle should be present
logging.info("Checking condition: not compress_pickles and remove_pickles")
self.assertEqual(len(pickle_files), 1, f"Expected 1 pickle file, found {len(pickle_files)}.")
self.assertEqual(len(gz_files), 0, f"Expected 0 gz files, found {len(gz_files)}.")
self.assertTrue(os.path.exists(best_result_pickle), f"Best result pickle file does not exist: {best_result_pickle}")
# Only the best pickle remains
self.assertEqual(len(pickle_files), 1,
f"Expected 1 pickle file, found {len(pickle_files)}.")
self.assertEqual(len(gz_files), 0,
f"Expected 0 gz files, found {len(gz_files)}.")
self.assertTrue(os.path.exists(best_result_pickle),
f"Best result pickle file does not exist: {best_result_pickle}")

if compress_pickles and remove_pickles:
-# Only the best result pickle should be compressed, no pickle files present
logging.info("Checking condition: compress_pickles and remove_pickles")
self.assertEqual(len(pickle_files), 0, f"Expected 0 pickle files, found {len(pickle_files)}.")
self.assertEqual(len(gz_files), 1, f"Expected 1 gz file, found {len(gz_files)}.")
self.assertTrue(os.path.exists(best_result_pickle + ".gz"), f"Best result pickle file not compressed: {best_result_pickle}.gz")
# Only the best pickle is compressed
self.assertEqual(len(pickle_files), 0,
f"Expected 0 pickle files, found {len(pickle_files)}.")
self.assertEqual(len(gz_files), 1,
f"Expected 1 gz file, found {len(gz_files)}.")
self.assertTrue(os.path.exists(best_result_pickle + ".gz"),
f"Best result pickle file not compressed: {best_result_pickle}.gz")
with gzip.open(join(temp_dir_path, prediction_dir, gz_files[0]), 'rb') as f:
-f.read(1)  # Ensure it's a valid gzip file
+f.read(1)  # Check it's valid gzip

except AssertionError as e:
logging.error(f"AssertionError: {e}")
all_files = os.listdir(join(temp_dir_path, prediction_dir))
relevant_files = [f for f in all_files if f.endswith('.gz') or f.endswith('.pkl')]
logging.error(f".gz and .pkl files in {join(temp_dir_path, prediction_dir)}: {relevant_files}")
-raise  # Re-raise the exception to ensure the test is marked as failed
+raise
finally:
temp_dir.cleanup()
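For readers skimming the test above, the behaviour it pins down can be summarised in a single call. A hedged sketch — the import path is an assumption about where `post_prediction_process` lives in the package, and the call is positional, mirroring the test:

```python
# Hedged sketch of the post-processing contract exercised by the test.
# The import path is an assumption; adjust to your AlphaPulldown version.
from alphapulldown.utils.post_modelling import post_prediction_process

# (compress_pickles, remove_pickles) -> files left next to ranking_debug.json:
#   False, False -> every result_*.pkl, no .gz
#   True,  False -> every pickle gzipped individually, no bare .pkl
#   False, True  -> only the best-ranked result_*.pkl
#   True,  True  -> only the best-ranked pickle, gzipped
post_prediction_process(
    "predictions/my_complex",  # prediction directory
    True,   # compress_pickles
    True,   # remove_pickles
    True,   # remove_keys: drop aligned_confidence_probs, distogram, masked_msa
)
```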
