Skip to content

Commit

Permalink
Merge pull request #705 from padix-key/structural-superimpose
Browse files Browse the repository at this point in the history
Implement structural superimposition and TM-score
  • Loading branch information
padix-key authored Jan 6, 2025
2 parents e76eb24 + 803f61b commit aecc27d
Show file tree
Hide file tree
Showing 23 changed files with 1,008 additions and 55 deletions.
28 changes: 28 additions & 0 deletions benchmarks/structure/benchmark_superimpose.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from pathlib import Path
import pytest
import biotite.structure as struc
import biotite.structure.io.pdbx as pdbx
from tests.util import data_dir


@pytest.fixture
def atoms():
pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / "1gya.bcif")
return pdbx.get_structure(pdbx_file)


@pytest.mark.benchmark
@pytest.mark.parametrize(
"method",
[
struc.superimpose,
struc.superimpose_without_outliers,
struc.superimpose_homologs,
struc.superimpose_structural_homologs,
],
)
def benchmark_superimpose(method, atoms):
"""
Compute superimposition of two structures with the same number of atoms.
"""
method(atoms[0], atoms[1])
6 changes: 4 additions & 2 deletions doc/apidoc.json
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,9 @@
],
"Superimpositions" : [
"superimpose",
"superimpose_homologs",
"superimpose_without_outliers",
"superimpose_homologs",
"superimpose_structural_homologs",
"AffineTransformation"
],
"Filters" : [
Expand Down Expand Up @@ -319,7 +320,8 @@
"average",
"rmsd",
"rmspd",
"rmsf"
"rmsf",
"tm_score"
],
"General analysis" : [
"sasa",
Expand Down
23 changes: 23 additions & 0 deletions src/biotite/sequence/align/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"score",
"find_terminal_gaps",
"remove_terminal_gaps",
"remove_gaps",
]


Expand Down Expand Up @@ -666,6 +667,28 @@ def remove_terminal_gaps(alignment):
return alignment[start:stop]


def remove_gaps(alignment):
"""
Remove all gap columns from an alignment.
Parameters
----------
alignment : Alignment
The alignment to be modified.
Returns
-------
truncated_alignment : Alignment
The alignment without gap columns.
See Also
--------
remove_terminal_gaps : Remove only terminal gap columns
"""
non_gap_mask = (alignment.trace != -1).all(axis=1)
return alignment[non_gap_mask]


def _is_single_letter(alphabet):
"""
More relaxed version of :func:`biotite.sequence.alphabet.is_letter_alphabet()`:
Expand Down
1 change: 1 addition & 0 deletions src/biotite/structure/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,5 +129,6 @@
from .sequence import *
from .sse import *
from .superimpose import *
from .tm import *
from .transform import *
# util and segments are used internally
115 changes: 65 additions & 50 deletions src/biotite/structure/superimpose.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,13 @@


import numpy as np
from biotite.sequence.align import SubstitutionMatrix, align_optimal, get_codes
from biotite.sequence.align.alignment import get_codes, remove_gaps
from biotite.sequence.align.matrix import SubstitutionMatrix
from biotite.sequence.align.pairwise import align_optimal
from biotite.sequence.alphabet import common_alphabet
from biotite.sequence.seqtypes import ProteinSequence
from biotite.structure.atoms import coord
from biotite.structure.chains import chain_iter
from biotite.structure.filter import filter_amino_acids, filter_nucleotides
from biotite.structure.geometry import centroid, distance
from biotite.structure.sequence import to_sequence
Expand Down Expand Up @@ -464,10 +467,16 @@ def superimpose_without_outliers(


def superimpose_homologs(
fixed, mobile, substitution_matrix=None, gap_penalty=-10, min_anchors=3, **kwargs
fixed,
mobile,
substitution_matrix=None,
gap_penalty=-10,
min_anchors=3,
terminal_penalty=False,
**kwargs,
):
r"""
Superimpose one protein or nucleotide chain onto another one,
Superimpose a protein or nucleotide structure onto another one,
considering sequence differences and conformational outliers.
The method finds corresponding residues by sequence alignment and
Expand All @@ -480,11 +489,11 @@ def superimpose_homologs(
----------
fixed : AtomArray, shape(n,) or AtomArrayStack, shape(m,n)
The fixed structure(s).
Must comprise a single chain.
mobile : AtomArray, shape(n,) or AtomArrayStack, shape(m,n)
The structure(s) which is/are superimposed on the `fixed`
structure.
Must comprise a single chain.
The structure(s) which is/are superimposed on the `fixed` structure.
Must contain the same number of chains as `fixed` with corresponding chains
being in the same order.
The specific chain IDs can be different.
substitution_matrix : str or SubstitutionMatrix, optional
The (name of the) substitution matrix used for sequence
alignment.
Expand All @@ -504,6 +513,9 @@ def superimpose_homologs(
`mobile` in this fallback case, an exception is raised.
Furthermore, the outlier removal is stopped, if less than
`min_anchors` anchors would be left.
terminal_penalty : bool, optional
If set to true, gap penalties are applied to terminal gaps in the sequence
alignment.
**kwargs
Additional parameters for
:func:`superimpose_without_outliers()`.
Expand All @@ -527,6 +539,7 @@ def superimpose_homologs(
--------
superimpose : Superimposition without outlier removal
superimpose_without_outliers : Internally used for outlier removal
superimpose_structural_homologs : Better suited for low sequence similarity
Notes
-----
Expand All @@ -540,26 +553,27 @@ def superimpose_homologs(
or len(mobile_anchor_indices) < min_anchors
):
raise ValueError(
"Structures have too few CA atoms for required number of anchors"
"Structures have too few backbone atoms for required number of anchors"
)

anchor_indices = _find_matching_anchors(
fixed[..., fixed_anchor_indices],
mobile[..., mobile_anchor_indices],
substitution_matrix,
gap_penalty,
terminal_penalty,
)
if len(anchor_indices) < min_anchors:
# Fallback: Match all backbone anchors
if len(fixed_anchor_indices) != len(mobile_anchor_indices):
raise ValueError(
"Tried fallback due to low anchor number, "
"but number of CA atoms does not match"
"but number of backbone atoms does not match"
)
fixed_anchor_indices = fixed_anchor_indices
mobile_anchor_indices = mobile_anchor_indices
else:
# The anchor indices point to the CA atoms
# The anchor indices point to the backbone atoms
# -> get the corresponding indices for the whole structure
fixed_anchor_indices = fixed_anchor_indices[anchor_indices[:, 0]]
mobile_anchor_indices = mobile_anchor_indices[anchor_indices[:, 1]]
Expand Down Expand Up @@ -639,51 +653,52 @@ def _find_matching_anchors(
mobile_anchors_atoms,
substitution_matrix,
gap_penalty,
terminal_penalty,
):
"""
Find corresponding residues using pairwise sequence alignment.
"""
fixed_seq = _to_sequence(fixed_anchor_atoms)
mobile_seq = _to_sequence(mobile_anchors_atoms)
common_alph = common_alphabet([fixed_seq.alphabet, mobile_seq.alphabet])
if common_alph is None:
raise ValueError("Cannot superimpose peptides with nucleic acids")

if substitution_matrix is None:
if isinstance(fixed_seq, ProteinSequence):
substitution_matrix = SubstitutionMatrix.std_protein_matrix()
else:
substitution_matrix = SubstitutionMatrix.std_nucleotide_matrix()
elif isinstance(substitution_matrix, str):
substitution_matrix = SubstitutionMatrix(
common_alph, common_alph, substitution_matrix
)
score_matrix = substitution_matrix.score_matrix()
anchor_list = []
fixed_seq_offset = 0
mobile_seq_offset = 0
for fixed_chain, mobile_chain in zip(
chain_iter(fixed_anchor_atoms), chain_iter(mobile_anchors_atoms), strict=True
):
# The input is a single chain -> expect a single sequence
fixed_seq = to_sequence(fixed_chain, allow_hetero=True)[0][0]
mobile_seq = to_sequence(mobile_chain, allow_hetero=True)[0][0]

common_alph = common_alphabet([fixed_seq.alphabet, mobile_seq.alphabet])
if common_alph is None:
raise ValueError("Cannot superimpose peptides with nucleic acids")
if substitution_matrix is None:
if isinstance(fixed_seq, ProteinSequence):
substitution_matrix = SubstitutionMatrix.std_protein_matrix()
else:
substitution_matrix = SubstitutionMatrix.std_nucleotide_matrix()
elif isinstance(substitution_matrix, str):
substitution_matrix = SubstitutionMatrix(
common_alph, common_alph, substitution_matrix
)

alignment = align_optimal(
fixed_seq,
mobile_seq,
substitution_matrix,
gap_penalty,
terminal_penalty=False,
max_number=1,
)[0]
alignment_codes = get_codes(alignment)
anchor_mask = (
# Anchors must be similar amino acids
(score_matrix[alignment_codes[0], alignment_codes[1]] > 0)
alignment = align_optimal(
fixed_seq,
mobile_seq,
substitution_matrix,
gap_penalty,
terminal_penalty=terminal_penalty,
max_number=1,
)[0]
# Cannot anchor gaps
& (alignment_codes[0] != -1)
& (alignment_codes[1] != -1)
)
anchors = alignment.trace[anchor_mask]
return anchors
alignment = remove_gaps(alignment)
ali_codes = get_codes(alignment)
score_matrix = substitution_matrix.score_matrix()
# Anchors must be similar amino acids
anchors = alignment.trace[score_matrix[ali_codes[0], ali_codes[1]] > 0]

anchors += fixed_seq_offset, mobile_seq_offset
fixed_seq_offset += len(fixed_seq)
mobile_seq_offset += len(mobile_seq)
anchor_list.append(anchors)

def _to_sequence(atoms):
sequences, _ = to_sequence(atoms, allow_hetero=True)
if len(sequences) == 0:
raise ValueError("Structure does not contain any amino acids or nucleotides")
if len(sequences) > 1:
raise ValueError("Structure contains multiple chains, but only one is allowed")
return sequences[0]
return np.concatenate(anchor_list, axis=0)
Loading

0 comments on commit aecc27d

Please sign in to comment.