OATML-Markslab · pascalnotin · Mar 6, 2025 · Jul 17, 2024 · Jul 17, 2024 · Jul 17, 2024
diff --git a/proteingym/baselines/kermut/.gitignore b/proteingym/baselines/kermut/.gitignore
@@ -0,0 +1,31 @@
+.idea
+.vscode
+
+**/*.pyc
+**/__pycache__
+**/*.h5
+**/*.pdf
+**/*.csv
+**/*.pdb
+**/*.json
+**/*.npy
+**/*.npz
+**/*.pt
+**/*.zip
+data/conditional_probs/raw_ProteinMPNN_outputs/*
+
+# Ignore predictions but keep directories
+results/predictions/**/*.csv
+results/predictions_old_split/**/*.csv
+
+# Keep summarized results
+!results/summary/**/*.csv
+!results/summary_old_split/**/*.csv
+!results/ablation_summary/**/*.csv
+!results/*.csv
+!results/calibration_metrics/*.csv
+!data/constants.json
+
+# Housekeeping
+logs/
+multirun/
diff --git a/proteingym/baselines/kermut/LICENSE b/proteingym/baselines/kermut/LICENSE
@@ -0,0 +1,9 @@
+MIT License
+
+Copyright (c) 2024 Peter Mørch Groth and Mads Herbert Kerrn
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/proteingym/baselines/kermut/README.md b/proteingym/baselines/kermut/README.md
@@ -0,0 +1,164 @@
+# Kermut
+
+This is the official implementation of the supervised variant effect predictor, Kermut, from the paper _Kermut: Composite kernel regression for protein variant effects_ ([preprint](https://www.biorxiv.org/content/10.1101/2024.05.28.596219v1)).
+
+
+## Overview
+Kermut is a Gaussian process which obtains high performance for protein property prediction on ProteinGym's supervised substitution benchmark while providing meaningful overall calibration.
+
+### Results on ProteinGym 
+Below is a table showing the aggregated Spearman scores per cross validation scheme.
+
+
+| Model | Average  | Random | Modulo | Contiguous |
+|------------|------------------|--------------------------------|--------------------------------|-------------------------------------|
+| Kermut     | 0.655            | 0.744                          | 0.631                          | 0.591                              |
+
+
+Below is a table showing the aggregated Spearman scores per functional category.
+
+| Model | Activity | Binding | Expression | Organismal Fitness | Stability |
+|------------|-------------------|------------------|---------------------|---------------------------|-------------------|
+| Kermut     | 0.605             | 0.614            | 0.662               | 0.580                     | 0.817             |
+
+
+
+## Installation
+
+After cloning the repository, the environment can be installed via
+
+```bash
+conda env create -f environment.yml
+conda activate kermut_env
+pip install -e .
+```
+
+## Reproduce results [with precomputed embeddings]
+
+
+### Download pre-computed embeddings
+
+All outputs from the preprocessing procedure (i.e., precomputed ESM-2 embeddings, conditional amino acid distributions, processed coordinate files, and zero-shot scores from ESM-2) can be readily accessed via a zip-archive hosted on the Electronic Research Data Archive (ERDA) at the University of Copenhagen using the following [link](https://sid.erda.dk/sharelink/c2EWrbGSCV). The file takes up approximately 4 GB. To download and extract the data, run the following (from the Kermut base directory):
+
+```bash
+# Download zip archive
+curl -o kermut_data.zip https://sid.erda.dk/share_redirect/c2EWrbGSCV/kermut_data.zip
+# Unpack and remove zip archive
+unzip kermut_data.zip && rm kermut_data.zip
+```
+
+### Compute fitness 
+
+To compute the fitness for the 0th assay in the reference file, run the following:
+
+```bash
+python proteingym/baselines/kermut/kermut/proteingym_benchmark.py \
+    DMS_idx=0 \
+    split_method=fold_random_5
+```
+
+Per-mutant predictions will (by default) be placed in: 
+`model_scores/supervised_substitutions/fold_random_5/kermut/assay_name_for_idx_0.csv`
+
+#### Note on directories:
+In the `proteingym_gpr.yaml` configuration file, four paths are defined by default:
+```yaml
+DMS_data_folder: data/substitutions_singles
+DMS_reference_file_path: reference_files/DMS_substitutions.csv
+output_scores_folder: model_scores/supervised_substitutions
+auxiliary_data_folder: proteingym/baselines/kermut/data
+```
+These can easily be overridden on the command line, e.g., via:
+```bash
+python proteingym/baselines/kermut/kermut/proteingym_benchmark.py \
+    DMS_idx=0 \
+    split_method=fold_random_5 \
+    auxiliary_data_folder=/tmp/kermut/data \
+    output_scores_folder=/tmp/kermut/outputs
+```
+
+
+## Reproduce results [from scratch]
+To run Kermut from scratch without precomputed resources, e.g., for a new dataset, the ProteinMPNN repository must be installed. Additionally, the ESM-2 650M parameter model must be saved locally: 
+### ProteinMPNN
+Kermut leverages structure-conditioned amino acid distributions from [ProteinMPNN](https://www.science.org/doi/10.1126/science.add2187), which has to be installed from the [official repository](https://github.com/dauparas/ProteinMPNN). An environment variable pointing to the installation location can then be set for later use:
+
+```bash
+export PROTEINMPNN_DIR=<path-to-ProteinMPNN-installation>
+```
+
+### ESM-2 models 
+Kermut leverages protein sequence embeddings and zero-shot scores extracted from ESM-2 ([paper](https://www.science.org/doi/10.1126/science.ade2574), [repo](https://github.com/facebookresearch/esm)). Concretely, we use the 650M parameter model (`esm2_t33_650M_UR50D`). While the ESM repository is installed above (via the yml-file), the model weights should be downloaded separately and placed in the `models` directory:
+
+```bash
+curl -o models/esm2_t33_650M_UR50D.pt https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t33_650M_UR50D.pt
+
+curl -o models/esm2_t33_650M_UR50D-contact-regression.pt https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t33_650M_UR50D-contact-regression.pt
+```
+
+### Sequence embeddings
+ESM-2 embeddings can be generated by calling:
+```bash
+python kermut/data/extract_esm2_embeddings.py \
+    --DMS_idx=0 \
+    --which=singles
+```
+For each assay, an `h5` file is generated which contains all embeddings for all variants. Since the Kermut GP only uses the mean-pooled embeddings, only these are stored. To obtain per AA embeddings, the extraction script can be altered by removing the mean operator.
+The embeddings are located in `kermut/data/embeddings/substitutions_singles/ESM2` (for the single-mutant assays).
+
+
+### Structure-conditioned amino acid distributions
+
+The structure-conditioned amino acid distributions for all residues and assays can be computed with ProteinMPNN via
+
+```bash
+bash scripts/conditional_probabilities_all.sh
+```
+This generates per-assay directories in `data/conditional_probs/raw_ProteinMPNN_outputs`. After this, postprocessing for easier access is performed via
+```bash
+python kermut/data/extract_ProteinMPNN_probs.py
+```
+This generates per-assay `npy`-files in `data/conditional_probs/ProteinMPNN`.
+
+### 3D coordinates
+Lastly, the 3D coordinates can be extracted from each PDB file via
+```bash
+python kermut/data/extract_3d_coords.py
+```
+This saves `npy`-files for each assay in `data/structures/coords`. 
+
+### Optional: Zero-shot scores
+If not relying on pre-computed zero-shot scores from ProteinGym, they can be computed for ESM-2 via:
+```bash
+python kermut/data/extract_esm2_zero_shots.py --DMS_idx 0
+```
+See the script for usage details. For multi-mutant datasets, the log-likelihood ratios are summed for each mutant.
+
+### Compute fitness
+
+See above.
+
+
+
+
+
+## Pre-computed results:
+
+The per-variant predictions (with uncertainties) for all assays can be downloaded via:
+
+```bash
+# Download zip archive
+curl -o predictions.zip https://sid.erda.dk/share_redirect/c2EWrbGSCV/predictions.zip
+# Unpack and remove zip archive
+unzip predictions.zip && rm predictions.zip
+```
+
+The per-variant predictions for the old split and the ablation predictions can be downloaded via 
+
+```bash
+# Download zip archive
+curl -o predictions_old_split.zip https://sid.erda.dk/share_redirect/c2EWrbGSCV/predictions_old_split.zip
+# Unpack and remove zip archive
+unzip predictions_old_split.zip && rm predictions_old_split.zip
+```
+
diff --git a/proteingym/baselines/kermut/configs/gp/kermut.yaml b/proteingym/baselines/kermut/configs/gp/kermut.yaml
@@ -0,0 +1,31 @@
+name: kermut
+
+use_mutation_kernel: true
+use_global_kernel: true
+embedding_type: ESM2
+embedding_dim: 1280
+
+mutation_kernel:
+  model:
+    _target_: kermut.model.kernel.Kermut
+  use_distances: true
+  conditional_probs_method: ProteinMPNN
+  tokenizer:
+    _target_: kermut.data.data_utils.Tokenizer
+    flatten: true
+  kernel_params:
+    h_scale: 1.0
+    h_lengthscale: 0.1
+    d_lengthscale: 0.1
+    p_lengthscale: 0.1
+
+gp_model:
+  _target_: kermut.model.gp.ExactGPKermut
+
+use_zero_shot: true
+zero_shot_method: ESM2
+use_prior: true
+noise_prior_scale: 0.1
+optim:
+  n_steps: 150
+  lr: 0.1
diff --git a/proteingym/baselines/kermut/configs/proteingym_gpr.yaml b/proteingym/baselines/kermut/configs/proteingym_gpr.yaml
@@ -0,0 +1,28 @@
+defaults:
+  - gp: kermut
+  - _self_
+  - override hydra/job_logging: disabled  # Prevents creation of .log file from Hydra
+
+DMS_idx: ???
+split_method: ???
+DMS_data_folder: data/substitutions_singles
+DMS_reference_file_path: reference_files/DMS_substitutions.csv
+output_scores_folder: model_scores/supervised_substitutions
+auxiliary_data_folder: proteingym/baselines/kermut/data
+
+
+seed: 2024
+progress_bar: false
+overwrite: false
+standardize: true
+use_gpu: true
+
+# Disables Hydra logging functionality
+hydra:
+  output_subdir: null
+  job:
+    chdir: false
+  run:
+    dir: .
+
+
diff --git a/proteingym/baselines/kermut/data/conditional_probs/ProteinMPNN/.gitkeep b/proteingym/baselines/kermut/data/conditional_probs/ProteinMPNN/.gitkeep