From 806fd4f7ab25c8102c6c06c8daa581308d401cd5 Mon Sep 17 00:00:00 2001
From: aax270 <s.lo@qmul.ac.uk>
Date: Tue, 5 Nov 2024 12:06:19 +0000
Subject: [PATCH 1/6] Use https instead of ssh in `.gitmodules`

Semi-reverts a397177
---
 .gitmodules | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/.gitmodules b/.gitmodules
index 0b83c869..31171554 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "alphafold"]
 	path = alphafold
-	url = git@github.com:KosinskiLab/alphafold.git
+	url = https://github.com/KosinskiLab/alphafold.git
 	branch = main
 [submodule "alphapulldown/analysis_pipeline/af2plots"]
 	path = alphapulldown/analysis_pipeline/af2plots
@@ -8,9 +8,9 @@
 	branch = main
 [submodule "AlphaLink2"]
 	path = AlphaLink2
-	url = git@github.com:KosinskiLab/AlphaLink2.git
+	url = https://github.com/KosinskiLab/AlphaLink2.git
 	branch = main
 [submodule "ColabFold"]
 	path = ColabFold
-	url = git@github.com:sokrypton/ColabFold.git
+	url = https://github.com/sokrypton/ColabFold.git
 	branch = main

From 547f1bc6e1aaf72a5ae1be2eed6db8581160b28e Mon Sep 17 00:00:00 2001
From: Dima <33123184+DimaMolod@users.noreply.github.com>
Date: Tue, 14 Jan 2025 15:36:35 +0100
Subject: [PATCH 2/6] Added links to the features db

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d02f44ef..2ecca245 100644
--- a/README.md
+++ b/README.md
@@ -98,7 +98,9 @@ AlphaPulldown is a customized implementation of [AlphaFold-Multimer](https://git
 
 AlphaPulldown can be used in two ways: either by a two-step pipeline made of **python scripts**, or by a **Snakemake pipeline** as a whole. For details on using the Snakemake pipeline, please refer to the separate GitHub [**repository**](https://github.com/KosinskiLab/AlphaPulldownSnakemake).
 
-To enable faster usage and avoid redundant feature recalculations, we have developed a public database containing precomputed features for all major model organisms, available for download. For more details, [click here](https://github.com/KosinskiLab/AlphaPulldown/blob/main/README.md#features-database).
+To enable faster usage and avoid redundant feature recalculations, we have developed a [public database](https://alphapulldown.s3.embl.de/index.html) containing precomputed features for all major model organisms, available for download. You can check the full list and download individual features at https://alphapulldown.s3.embl.de/index.html or https://s3.embl.de/alphapulldown/index.html.
+
+For more details, [click here](https://github.com/KosinskiLab/AlphaPulldown/blob/main/README.md#features-database).
 ## Overview
 
 <picture>

From a4b4135287881b201887cf9a2bdd8a4c74fbd17c Mon Sep 17 00:00:00 2001
From: Dima Molodenskiy <dmolodenskiy@embl-hamburg.de>
Date: Thu, 16 Jan 2025 11:53:50 +0100
Subject: [PATCH 3/6] Use https:// for alphafold3 submodule

---
 .gitmodules | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index b6bbaa8f..954cdc1e 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -16,5 +16,5 @@
 	branch = main
 [submodule "alphafold3"]
 	path = alphafold3
-	url = git@github.com:google-deepmind/alphafold3.git
+	url = https://github.com/google-deepmind/alphafold3.git
 	branch = main

From d4fd6c68ab17d378ad9dd9996e51ea4d7b8fd078 Mon Sep 17 00:00:00 2001
From: Dima Molodenskiy <dmolodenskiy@embl-hamburg.de>
Date: Thu, 16 Jan 2025 11:54:35 +0100
Subject: [PATCH 4/6] Update alphafold3

---
 alphafold3 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/alphafold3 b/alphafold3
index aa724ca1..ea040346 160000
--- a/alphafold3
+++ b/alphafold3
@@ -1 +1 @@
-Subproject commit aa724ca1cbfc7084fa683d27418a0d86d6228cd4
+Subproject commit ea040346e10db1759170e723ef263316e64aa768

From d89b0f8b88e5c8adf98fecc1e79c3327c439ceeb Mon Sep 17 00:00:00 2001
From: Dima Molodenskiy <dmolodenskiy@embl-hamburg.de>
Date: Fri, 17 Jan 2025 10:29:53 +0100
Subject: [PATCH 5/6] Check keys to be removed are not removed if
 --remove_keys_from_pickles=false

---
 test/test_post_prediction.py | 94 +++++++++++++++++++++---------------
 1 file changed, 55 insertions(+), 39 deletions(-)

diff --git a/test/test_post_prediction.py b/test/test_post_prediction.py
index e805e5a1..c6aaa1d9 100644
--- a/test/test_post_prediction.py
+++ b/test/test_post_prediction.py
@@ -12,11 +12,8 @@
 class TestPostPrediction(parameterized.TestCase):
     def setUp(self) -> None:
         super().setUp()
-        # Get path of the alphapulldown module
         parent_dir = join(dirname(dirname(abspath(__file__))))
-        # Join the path with the script name
         self.input_dir = join(parent_dir, "test/test_data/predictions")
-        # Set logging level to INFO
         logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
     @parameterized.parameters(
@@ -40,71 +37,90 @@ def setUp(self) -> None:
     def test_files(self, prediction_dir, compress_pickles, remove_pickles, remove_keys):
         temp_dir = tempfile.TemporaryDirectory()
         try:
-            logging.info(f"Running test for prediction_dir='{prediction_dir}', compress_pickles={compress_pickles}, remove_pickles={remove_pickles}, remove_keys={remove_keys}")
+            logging.info(f"Running test for prediction_dir='{prediction_dir}', "
+                         f"compress_pickles={compress_pickles}, remove_pickles={remove_pickles}, remove_keys={remove_keys}")
             temp_dir_path = temp_dir.name
-            # Copy the files to the temporary directory
             shutil.copytree(join(self.input_dir, prediction_dir), join(temp_dir_path, prediction_dir))
+
             # Remove existing gz files
-            gz_files = [f for f in os.listdir(join(temp_dir_path, prediction_dir)) if f.endswith('.gz')]
-            for f in gz_files:
-                os.remove(join(temp_dir_path, prediction_dir, f))
-            # Run the postprocessing function
-            post_prediction_process(join(temp_dir_path, prediction_dir), compress_pickles, remove_pickles, remove_keys)
+            gz_files_existing = [f for f in os.listdir(join(temp_dir_path, prediction_dir)) if f.endswith('.gz')]
+            for f_ in gz_files_existing:
+                os.remove(join(temp_dir_path, prediction_dir, f_))
 
-            # Get the best model from ranking_debug.json
+            # Run the postprocessing
+            post_prediction_process(join(temp_dir_path, prediction_dir),
+                                    compress_pickles,
+                                    remove_pickles,
+                                    remove_keys)
+
+            # Identify the best model
             with open(join(temp_dir_path, prediction_dir, 'ranking_debug.json')) as f:
                 best_model = json.load(f)['order'][0]
-
-            # Define the expected best result pickle path
             best_result_pickle = join(temp_dir_path, prediction_dir, f"result_{best_model}.pkl")
 
-            # Check if files are removed and/or compressed based on the parameters
+            # Gather .pkl and .gz files
             pickle_files = [f for f in os.listdir(join(temp_dir_path, prediction_dir)) if f.endswith('.pkl')]
             gz_files = [f for f in os.listdir(join(temp_dir_path, prediction_dir)) if f.endswith('.gz')]
 
+            # Check if specified keys exist or were removed
             if remove_keys:
-                # Ensure specified keys are removed from the pickle files
-                for pickle_file in pickle_files:
-                    with open(join(temp_dir_path, prediction_dir, pickle_file), 'rb') as f:
+                for pf in pickle_files:
+                    with open(join(temp_dir_path, prediction_dir, pf), 'rb') as f:
                         data = pickle.load(f)
                     for key in ['aligned_confidence_probs', 'distogram', 'masked_msa']:
-                        self.assertNotIn(key, data, f"Key {key} was not removed from {pickle_file}")
+                        self.assertNotIn(key, data, f"Key '{key}' was not removed from {pf}")
+            else:
+                # If we're not removing keys, verify they still exist in the pickle
+                for pf in pickle_files:
+                    with open(join(temp_dir_path, prediction_dir, pf), 'rb') as f:
+                        data = pickle.load(f)
+                    for key in ['aligned_confidence_probs', 'distogram', 'masked_msa']:
+                        self.assertIn(key, data, f"Key '{key}' was unexpectedly removed from {pf}")
 
+            # Now check file counts / compressions
             if not compress_pickles and not remove_pickles:
-                # All pickle files should be present, no gz files
-                logging.info("Checking condition: not compress_pickles and not remove_pickles")
-                self.assertEqual(len(pickle_files), 5, f"Expected 5 pickle files, found {len(pickle_files)}.")
-                self.assertEqual(len(gz_files), 0, f"Expected 0 gz files, found {len(gz_files)}.")
+                # Expect all 5 .pkl files from the test data to remain, no .gz
+                self.assertEqual(len(pickle_files), 5,
+                                 f"Expected 5 pickle files, found {len(pickle_files)}.")
+                self.assertEqual(len(gz_files), 0,
+                                 f"Expected 0 gz files, found {len(gz_files)}.")
 
             if compress_pickles and not remove_pickles:
-                # No pickle files should be present, each compressed separately
-                logging.info("Checking condition: compress_pickles and not remove_pickles")
-                self.assertEqual(len(pickle_files), 0, f"Expected 0 pickle files, found {len(pickle_files)}.")
-                self.assertEqual(len(gz_files), 5, f"Expected 5 gz files, found {len(gz_files)}.")
+                # Expect 0 .pkl files, all compressed (5)
+                self.assertEqual(len(pickle_files), 0,
+                                 f"Expected 0 pickle files, found {len(pickle_files)}.")
+                self.assertEqual(len(gz_files), 5,
+                                 f"Expected 5 gz files, found {len(gz_files)}.")
+                # Validate that gz files are readable
                 for gz_file in gz_files:
                     with gzip.open(join(temp_dir_path, prediction_dir, gz_file), 'rb') as f:
-                        f.read(1)  # Ensure it's a valid gzip file
+                        f.read(1)
 
             if not compress_pickles and remove_pickles:
-                # Only the best result pickle should be present
-                logging.info("Checking condition: not compress_pickles and remove_pickles")
-                self.assertEqual(len(pickle_files), 1, f"Expected 1 pickle file, found {len(pickle_files)}.")
-                self.assertEqual(len(gz_files), 0, f"Expected 0 gz files, found {len(gz_files)}.")
-                self.assertTrue(os.path.exists(best_result_pickle), f"Best result pickle file does not exist: {best_result_pickle}")
+                # Only the best pickle remains
+                self.assertEqual(len(pickle_files), 1,
+                                 f"Expected 1 pickle file, found {len(pickle_files)}.")
+                self.assertEqual(len(gz_files), 0,
+                                 f"Expected 0 gz files, found {len(gz_files)}.")
+                self.assertTrue(os.path.exists(best_result_pickle),
+                                f"Best result pickle file does not exist: {best_result_pickle}")
 
             if compress_pickles and remove_pickles:
-                # Only the best result pickle should be compressed, no pickle files present
-                logging.info("Checking condition: compress_pickles and remove_pickles")
-                self.assertEqual(len(pickle_files), 0, f"Expected 0 pickle files, found {len(pickle_files)}.")
-                self.assertEqual(len(gz_files), 1, f"Expected 1 gz file, found {len(gz_files)}.")
-                self.assertTrue(os.path.exists(best_result_pickle + ".gz"), f"Best result pickle file not compressed: {best_result_pickle}.gz")
+                # Only the best pickle is compressed
+                self.assertEqual(len(pickle_files), 0,
+                                 f"Expected 0 pickle files, found {len(pickle_files)}.")
+                self.assertEqual(len(gz_files), 1,
+                                 f"Expected 1 gz file, found {len(gz_files)}.")
+                self.assertTrue(os.path.exists(best_result_pickle + ".gz"),
+                                f"Best result pickle file not compressed: {best_result_pickle}.gz")
                 with gzip.open(join(temp_dir_path, prediction_dir, gz_files[0]), 'rb') as f:
-                    f.read(1)  # Ensure it's a valid gzip file
+                    f.read(1)  # Check it's valid gzip
+
         except AssertionError as e:
             logging.error(f"AssertionError: {e}")
             all_files = os.listdir(join(temp_dir_path, prediction_dir))
             relevant_files = [f for f in all_files if f.endswith('.gz') or f.endswith('.pkl')]
             logging.error(f".gz and .pkl files in {join(temp_dir_path, prediction_dir)}: {relevant_files}")
-            raise  # Re-raise the exception to ensure the test is marked as failed
+            raise
         finally:
             temp_dir.cleanup()

From 254794bccbc696a48508a93dd8482a7d1f5bc0f1 Mon Sep 17 00:00:00 2001
From: Dima Molodenskiy <dmolodenskiy@embl-hamburg.de>
Date: Mon, 20 Jan 2025 14:43:21 +0100
Subject: [PATCH 6/6] Fix PR#475

---
 alphapulldown/scripts/run_multimer_jobs.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/alphapulldown/scripts/run_multimer_jobs.py b/alphapulldown/scripts/run_multimer_jobs.py
index a6544080..24371946 100644
--- a/alphapulldown/scripts/run_multimer_jobs.py
+++ b/alphapulldown/scripts/run_multimer_jobs.py
@@ -93,6 +93,7 @@ def main(argv):
         "--path_to_mmt": FLAGS.path_to_mmt,
         "--compress_result_pickles": FLAGS.compress_result_pickles,
         "--remove_result_pickles": FLAGS.remove_result_pickles,
+        "--remove_keys_from_pickles": FLAGS.remove_keys_from_pickles,
         "--use_ap_style": True,
         "--use_gpu_relax": FLAGS.use_gpu_relax,
         "--protein_delimiter": FLAGS.protein_delimiter,
@@ -138,6 +139,7 @@ def main(argv):
             command = base_command.copy()
             for arg, value in command_args.items():
                 command.extend([str(arg), str(value)])
+            logging.info(f"command: {command}")
             subprocess.run(" ".join(command), check=True, shell=True)