Source code for opencadd.structure.superposition.engines.mda

"""
Aligner based on MDAnalysis' superposition algorithms.
"""

import logging

import numpy as np
from MDAnalysis.analysis import align as mda_align, rms
from MDAnalysis.lib.util import canonical_inverse_aa_codes, convert_aa_code
import biotite.sequence.align as align
from biotite.sequence.io.fasta import FastaFile

from .base import BaseAligner
from ..sequences import sequence_alignment, fasta2select
from ....utils import enter_temp_directory

_logger = logging.getLogger(__name__)


[docs]class MDAnalysisAligner(BaseAligner):

    """
    Factory to configure an aligner based on
    MDAnalysis' superposition algorithms.

    Roughly, the algorithm follows these steps:

    1. Sequence alignment -- using "biotite" and "MDAnalysis"
    2. Structural superposition -- using "MDAnalysis"

    Parameters
    ----------
    alignment_strategy : str, optional, default=global
        What type of algorithm will be used to calculate the
        sequence alignment. Choose between:
        - "global" (Needleman-Wunsch)
        - "local" (Smith-Waterman)
    alignment_matrix : str, optional, default=BLOSUM62
        The substitution matrix used for scoring
    alignment_gap : int or (tuple, dtype=int), optional, default=-10
        Int the value will be interpreted as general gap penalty.
        Tupel is provided, an affine gap penalty is used. The first integer in the tuple
        is the gap opening penalty, the second integer is the gap extension penalty.
        The values need to be negative.
    strict_superposition: bool, optional, default=False
        True: Will raise SelectionError if a single atom does not match between the two selections.
        False: Will try to prepare a matching selection by dropping residues with non-matching atoms.
    per_residue_selection: str or AtomGroup or None, optional, default=None
        None: Apply to mobile.universe.atoms (i.e., all atoms in the context of the selection from
        mobile such as the rest of a protein, ligands and the surrounding water)
        str: Apply to mobile.select_atoms(selection-string), e.g "protein and name CA"
        AtomGroup: Apply to the arbitrary group of atoms
    superposition_weights: {“mass”, None} or array_like, optional
        choose weights. With "mass" uses masses as weights;
        None: weigh each atom equally
        If a float array of the same length as mobile is provided, use each element of the
        array_like as a weight for the corresponding atom in mobile.
    superposition_delta_mass_tolerance: float, optional, default=0.1
        Reject match if the atomic masses for matched atoms differ by more than tol_mass

    References
    ----------
    * <https://www.mdanalysis.org/docs/documentation_pages/analysis/align.html#MDAnalysis.analysis.align.alignto>

    """

    def __init__(
        self,
        alignment_strategy: str = "global",
        alignment_matrix: str = "BLOSUM62",
        alignment_gap: int = -10,
        strict_superposition: bool = False,
        per_residue_selection="name CA and not altloc B and not altloc C",
        superposition_weights=None,
        superposition_delta_mass_tolerance=0.1,
    ):

        self.alignment_strategy = alignment_strategy.lower()
        if self.alignment_strategy == "global":
            self._align_local = False
        elif self.alignment_strategy == "local":
            self._align_local = True
        else:
            raise ValueError("`alignment_strategy` must be one of `global, local`.")

        if alignment_matrix not in align.SubstitutionMatrix.list_db():
            raise ValueError(f"Substitution Matrix '{alignment_matrix }' could not be found.")
        else:
            self.alignment_matrix = alignment_matrix

        self.alignment_gap = alignment_gap
        self.strict_superposition = strict_superposition
        self.per_residue_selection = per_residue_selection
        self.superposition_weights = superposition_weights
        self.superposition_delta_mass_tolerance = superposition_delta_mass_tolerance

    def _safety_checks(self):
        """
        Check for `mda` installation passes; added here only for consistency across the engines.
        """

        pass

    def _calculate(self, structures, *args, **kwargs):
        """

        Parameters
        ----------
        structures : list of opencadd.core.Structure
            First one will be the target (static structure). Following ones will be mobile.

        Returns
        -------
        dict
            superposed models
            rmsd
            metadata
        """

        ref_universe, mob_universe = structures

        # Get matching atoms
        selection, alignment = self.matching_selection(*structures)
        ref_atoms = ref_universe.select_atoms(selection["reference"])
        mobile_atoms = mob_universe.select_atoms(selection["mobile"])

        # Compute initial RMSD (no preprocessing)
        initial_rmsd = rms.rmsd(ref_atoms.positions, mobile_atoms.positions)

        # Compute centered RMSD (both structures share now the same center of mass)
        ref_weights = np.asarray([a.mass for a in ref_atoms])
        mobile_weights = np.asarray([a.mass for a in mobile_atoms])
        ref_com = ref_atoms.center(ref_weights)
        mobile_com = mobile_atoms.center(mobile_weights)
        ref_coordinates = ref_atoms.positions - ref_com
        mobile_coordinates = mobile_atoms.positions - mobile_com
        centered_rmsd = rms.rmsd(ref_coordinates, mobile_coordinates)

        # Calculate optimum rotation matrix
        rotation, rmsd = mda_align.rotation_matrix(mobile_coordinates, ref_coordinates)
        mob_universe.atoms.translate(-mobile_com)
        mob_universe.atoms.rotate(rotation)
        mob_universe.atoms.translate(ref_com)

        return {
            "superposed": [ref_universe, mob_universe],
            "scores": {"rmsd": rmsd},
            "metadata": {
                "selection": selection,
                "alignment": alignment,
                "initial_rmsd": initial_rmsd,
                "centered_rmsd": centered_rmsd,
                "translation": ref_com,
                "rotation": rotation,
            },
        }

[docs]    def matching_selection(self, reference, mobile):
        """
        Compute best matching atom sets

        Parameters
        ----------
        structures : list of opencadd.core.Structure

        Returns
        -------
        dict
            Two-element dictionary with the selection string
            to obtain the matching atoms on the original structures
        alignment : biotite.Alignment
            The sequence alignment
        """
        # Compute sequence alignment and matching atoms
        ref_sequence, ref_resids, ref_segids = self._retrieve_sequence(reference)
        mob_sequence, mob_resids, mob_segids = self._retrieve_sequence(mobile)
        alignment = self._align(ref_sequence, mob_sequence)
        with enter_temp_directory():
            fasta = FastaFile()
            fasta["ref"], fasta["mob"], *_empty = alignment.get_gapped_sequences()
            fasta.write("temp.fasta")
            selection = fasta2select(
                "temp.fasta",
                ref_resids=ref_resids,
                target_resids=mob_resids,
                ref_segids=ref_segids,
                target_segids=mob_segids,
                backbone_selection=self.per_residue_selection,
            )
        return selection, alignment

    @staticmethod
    def _retrieve_sequence(universe):
        """
        Get the amino acid sequence

        Parameters
        ----------
        universe : mdanalysis.Universe

        Returns
        -------
        str
            one-letter amino acid sequence
        tuple of int
            residue ids of the protein sequence
        """
        sequences = []
        residue_ids = []
        segment_ids = []
        backbone_atom_names = {"C", "CA", "N", "O"}
        incomplete_residues = []
        for segment in universe.segments:
            for residue in segment.residues:
                if residue.resname in canonical_inverse_aa_codes:
                    residue_atom_names = set([a.name for a in residue.atoms])
                    if not backbone_atom_names.issubset(residue_atom_names):
                        incomplete_residues.append(residue)
                    sequences.append(convert_aa_code(residue.resname))
                    residue_ids.append(residue.resid)
                    segment_ids.append(residue.segid)
        if incomplete_residues:
            _logger.warning(
                "%d residues in %s are missing backbone atoms. "
                "If this system was obtained from a larger structure using a "
                "selection, consider wrapping such selection with "
                "`same residue as (<your original selection>)` "
                "to avoid potential matching problems.",
                len(incomplete_residues),
                universe,
            )
        return "".join(sequences), residue_ids, segment_ids

    def _align(self, sequence_1, sequence_2):
        """
        Perform a global alignment, based on the the Needleman-Wunsch algorithm
        or a local alignment, based on the Smith-Waterman algorithm

        Parameters
        ----------
        sequence_1,sequence_2: str
            str of sequences

        Returns
        -------
        str
            an alignment of two sequences

        Examples
        --------
        >>> _align("seq1", "seq2","PAM250", (-5, -15))
        RKKSLVDIDLSSLRDP
        R-K-I-DLS-S-LRDP
        """

        return sequence_alignment(
            sequence_1,
            sequence_2,
            self.alignment_matrix,
            self.alignment_gap,
            local=self._align_local,
        )