Source code for opencadd.structure.superposition.engines.mda

"""
Aligner based on MDAnalysis' superposition algorithms.
"""

import logging

import numpy as np
from MDAnalysis.analysis import align as mda_align, rms
from MDAnalysis.lib.util import canonical_inverse_aa_codes, convert_aa_code
import biotite.sequence.align as align
from biotite.sequence.io.fasta import FastaFile

from .base import BaseAligner
from ..sequences import sequence_alignment, fasta2select
from ....utils import enter_temp_directory

_logger = logging.getLogger(__name__)


[docs]class MDAnalysisAligner(BaseAligner): """ Factory to configure an aligner based on MDAnalysis' superposition algorithms. Roughly, the algorithm follows these steps: 1. Sequence alignment -- using "biotite" and "MDAnalysis" 2. Structural superposition -- using "MDAnalysis" Parameters ---------- alignment_strategy : str, optional, default=global What type of algorithm will be used to calculate the sequence alignment. Choose between: - "global" (Needleman-Wunsch) - "local" (Smith-Waterman) alignment_matrix : str, optional, default=BLOSUM62 The substitution matrix used for scoring alignment_gap : int or (tuple, dtype=int), optional, default=-10 Int the value will be interpreted as general gap penalty. Tupel is provided, an affine gap penalty is used. The first integer in the tuple is the gap opening penalty, the second integer is the gap extension penalty. The values need to be negative. strict_superposition: bool, optional, default=False True: Will raise SelectionError if a single atom does not match between the two selections. False: Will try to prepare a matching selection by dropping residues with non-matching atoms. per_residue_selection: str or AtomGroup or None, optional, default=None None: Apply to mobile.universe.atoms (i.e., all atoms in the context of the selection from mobile such as the rest of a protein, ligands and the surrounding water) str: Apply to mobile.select_atoms(selection-string), e.g "protein and name CA" AtomGroup: Apply to the arbitrary group of atoms superposition_weights: {“mass”, None} or array_like, optional choose weights. With "mass" uses masses as weights; None: weigh each atom equally If a float array of the same length as mobile is provided, use each element of the array_like as a weight for the corresponding atom in mobile. superposition_delta_mass_tolerance: float, optional, default=0.1 Reject match if the atomic masses for matched atoms differ by more than tol_mass References ---------- * <https://www.mdanalysis.org/docs/documentation_pages/analysis/align.html#MDAnalysis.analysis.align.alignto> """ def __init__( self, alignment_strategy: str = "global", alignment_matrix: str = "BLOSUM62", alignment_gap: int = -10, strict_superposition: bool = False, per_residue_selection="name CA and not altloc B and not altloc C", superposition_weights=None, superposition_delta_mass_tolerance=0.1, ): self.alignment_strategy = alignment_strategy.lower() if self.alignment_strategy == "global": self._align_local = False elif self.alignment_strategy == "local": self._align_local = True else: raise ValueError("`alignment_strategy` must be one of `global, local`.") if alignment_matrix not in align.SubstitutionMatrix.list_db(): raise ValueError(f"Substitution Matrix '{alignment_matrix }' could not be found.") else: self.alignment_matrix = alignment_matrix self.alignment_gap = alignment_gap self.strict_superposition = strict_superposition self.per_residue_selection = per_residue_selection self.superposition_weights = superposition_weights self.superposition_delta_mass_tolerance = superposition_delta_mass_tolerance def _safety_checks(self): """ Check for `mda` installation passes; added here only for consistency across the engines. """ pass def _calculate(self, structures, *args, **kwargs): """ Parameters ---------- structures : list of opencadd.core.Structure First one will be the target (static structure). Following ones will be mobile. Returns ------- dict superposed models rmsd metadata """ ref_universe, mob_universe = structures # Get matching atoms selection, alignment = self.matching_selection(*structures) ref_atoms = ref_universe.select_atoms(selection["reference"]) mobile_atoms = mob_universe.select_atoms(selection["mobile"]) # Compute initial RMSD (no preprocessing) initial_rmsd = rms.rmsd(ref_atoms.positions, mobile_atoms.positions) # Compute centered RMSD (both structures share now the same center of mass) ref_weights = np.asarray([a.mass for a in ref_atoms]) mobile_weights = np.asarray([a.mass for a in mobile_atoms]) ref_com = ref_atoms.center(ref_weights) mobile_com = mobile_atoms.center(mobile_weights) ref_coordinates = ref_atoms.positions - ref_com mobile_coordinates = mobile_atoms.positions - mobile_com centered_rmsd = rms.rmsd(ref_coordinates, mobile_coordinates) # Calculate optimum rotation matrix rotation, rmsd = mda_align.rotation_matrix(mobile_coordinates, ref_coordinates) mob_universe.atoms.translate(-mobile_com) mob_universe.atoms.rotate(rotation) mob_universe.atoms.translate(ref_com) return { "superposed": [ref_universe, mob_universe], "scores": {"rmsd": rmsd}, "metadata": { "selection": selection, "alignment": alignment, "initial_rmsd": initial_rmsd, "centered_rmsd": centered_rmsd, "translation": ref_com, "rotation": rotation, }, }
[docs] def matching_selection(self, reference, mobile): """ Compute best matching atom sets Parameters ---------- structures : list of opencadd.core.Structure Returns ------- dict Two-element dictionary with the selection string to obtain the matching atoms on the original structures alignment : biotite.Alignment The sequence alignment """ # Compute sequence alignment and matching atoms ref_sequence, ref_resids, ref_segids = self._retrieve_sequence(reference) mob_sequence, mob_resids, mob_segids = self._retrieve_sequence(mobile) alignment = self._align(ref_sequence, mob_sequence) with enter_temp_directory(): fasta = FastaFile() fasta["ref"], fasta["mob"], *_empty = alignment.get_gapped_sequences() fasta.write("temp.fasta") selection = fasta2select( "temp.fasta", ref_resids=ref_resids, target_resids=mob_resids, ref_segids=ref_segids, target_segids=mob_segids, backbone_selection=self.per_residue_selection, ) return selection, alignment
@staticmethod def _retrieve_sequence(universe): """ Get the amino acid sequence Parameters ---------- universe : mdanalysis.Universe Returns ------- str one-letter amino acid sequence tuple of int residue ids of the protein sequence """ sequences = [] residue_ids = [] segment_ids = [] backbone_atom_names = {"C", "CA", "N", "O"} incomplete_residues = [] for segment in universe.segments: for residue in segment.residues: if residue.resname in canonical_inverse_aa_codes: residue_atom_names = set([a.name for a in residue.atoms]) if not backbone_atom_names.issubset(residue_atom_names): incomplete_residues.append(residue) sequences.append(convert_aa_code(residue.resname)) residue_ids.append(residue.resid) segment_ids.append(residue.segid) if incomplete_residues: _logger.warning( "%d residues in %s are missing backbone atoms. " "If this system was obtained from a larger structure using a " "selection, consider wrapping such selection with " "`same residue as (<your original selection>)` " "to avoid potential matching problems.", len(incomplete_residues), universe, ) return "".join(sequences), residue_ids, segment_ids def _align(self, sequence_1, sequence_2): """ Perform a global alignment, based on the the Needleman-Wunsch algorithm or a local alignment, based on the Smith-Waterman algorithm Parameters ---------- sequence_1,sequence_2: str str of sequences Returns ------- str an alignment of two sequences Examples -------- >>> _align("seq1", "seq2","PAM250", (-5, -15)) RKKSLVDIDLSSLRDP R-K-I-DLS-S-LRDP """ return sequence_alignment( sequence_1, sequence_2, self.alignment_matrix, self.alignment_gap, local=self._align_local, )