Source code for opencadd.structure.superposition.engines.mmligner

"""
MMLigner (Collier et al., 2017) works by minimizing the ivalue of the alignment. The ivalue is based on
the Minimum Message Length framework (Wallace and Boulton, 1968; Wallace, 2005), a Bayesian framework for
statistical inductive inference. The ivalue represents the hypothetical minimum message length needed to transmit
the computed alignment losslessly (Shannon, 1948).
Using the ivalue measure, the algorithm creates crude-but-effective strucural alignments rapidly to act as seeds.
These seeds are iteratively refined over an Expectation-Maximization approach using the I-value criterion.
By substracting the ivalue from the null model, the statistical significance of the alignment can be computed. If the
difference is positive, the alignment is significant.

Collier, J.H., Allison, L., Lesk A.M., Stuckey, P.J., Garcia de la Banda , M., Konagurthu, A.S. (2017)
Statistical inference of protein structural alignments using information and compression.
Bioinformatics, 33(7), 1005-1013

Wallace,C.S. and Boulton,D.M. (1968) An information measure for classification.
Comput. J., 11, 185–194.

Wallace,C.S. (2005) Statistical and Inductive Inference Using MinimumMessage Length.
Information Science and Statistics. SpringerVerlag, New York, NY.

Shannon,C.E. (1948) A mathematical theory of communication.
Bell Syst.Tech. J., 27, 379–423.
"""

import sys
import subprocess
import logging
from distutils.spawn import find_executable

import numpy as np
import biotite.sequence.io.fasta as fasta

from .base import BaseAligner
from ....utils import enter_temp_directory

_logger = logging.getLogger(__name__)


[docs]class MMLignerAligner(BaseAligner):
    """
    Wraps MMLigner to superpose two protein structures.

    Parameters
    ----------
    executable : str
        Path to the MMLigner executable file
    protein_selection : str, optional, default=protein
        MMLigner will not accept residues beyond the 20 standard AA.

    """

    def __init__(self, executable=None, protein_selector="protein"):
        if executable is None:
            executable = "mmligner64.exe" if sys.platform.startswith("win") else "mmligner"
        self.executable = executable
        self.protein_selector = protein_selector
        _logger.warning(
            "Current MMLigner wrappers produces accurate RMSD values but slightly shifted structures!"
        )

    def _safety_checks(self):
        """
        Check if `mmligner` is installed (executable found?).

        Raises
        ------
        OSError
            Raises error if executable `mmligner` cannot be found.
        """

        mmligner = find_executable("mmligner")
        if mmligner is None:
            raise OSError("mmligner cannot be located. Is it installed?")
        # proceed normally

    def _calculate(self, structures, *args, **kwargs):
        """
        Calculates the superposition of two protein structures.

        It is called by BaseAligner.calculate().

        Parameters
        ----------
        structures: [array like, array like]
            Sequences of two protein structures of same length

        Returns
        -------
        dict
            As returned by ``._parse_metadata(output)``.
            - ``superposed`` ([opencadd.core.Structure, opencadd.core.Structure]): the superposed models
            - ``scores`` (dict):
                - ``rmsd`` (float): RMSD value of the alignment
                - ``score`` (float): ivalue of the alignment
                - ``coverage`` (float): coverage of the alignment
            - ``metadata`` (dict):
                - ``alignment`` (biotite.alignment): computed alignment
                - ``rotation`` (array-like): 3x3 rotation matrix
                - ``translation`` (np.array): array containing the translation
                - ``quarternion`` (array-like): 4x4 quarternion matrix
        """

        with enter_temp_directory() as (cwd, tmpdir):
            # Needed because of the need of a copy of the structures.
            sys.setrecursionlimit(100000)

            path1, path2 = self._edit_pdb(structures)
            output = subprocess.check_output(
                [self.executable, path1, path2, "-o", "temp", "--superpose"]
            )
            # We need access to the temporary files at parse time!
            result = self._parse_metadata(output.decode())

            # checks if there is metadata in the dict, if not, there was no significant alignment found.
            if "metadata" in result:
                superposed_models = self._calculate_transformed(structures, result["metadata"])
                result["superposed"] = superposed_models
        return result

    def _parse_metadata(self, output):
        """
        Retrieves RMSD, score and metadata from the output of the MMLigner subprocess.

        Parameters
        ----------
        output: str
            string of containing the stdout of the mmligener call

        Returns
        -------
        dict
            As returned by ``._parse_metadata(output)``.
            - ``scores`` (dict):
                - ``rmsd`` (float): RMSD value of the alignment
                - ``score`` (float): ivalue of the alignment
                - ``coverage`` (float): coverage of the alignment
            - ``metadata`` (dict):
                - ``alignment``: (biotite.alignment): computed alignment
                - ``rotation``: (array-like): 3x3 rotation matrix
                - ``translation``: (np.array): array containing the translation
                - ``quarternion``: (array-like): 4x4 quarternion matrix
        """
        lines = iter(output.splitlines())
        for line in lines:
            if line.startswith("RMSD"):
                rmsd = float(line.split()[2])
            elif line.startswith("Coverage"):
                coverage = float(line.split()[2])
            elif line.startswith("I(A & <S,T>)"):
                ivalue = float(line.split()[4])
            elif "Print Centers of Mass of moving set:" in line:
                moving_com = np.array([float(x) for x in next(lines).split()])
            elif "Print Centers of Mass of fixed set:" in line:
                fixed_com = np.array([float(x) for x in next(lines).split()])
            elif "Print Rotation matrix" in line:
                rotation = [[float(x) for x in next(lines).split()] for _ in range(3)]
            elif "Print Quaternion matrix" in line:
                quaternion = [[float(x) for x in next(lines).split()] for _ in range(4)]

        # checks if there is a signifcant alignment
        if rmsd == 0 and coverage == 0:
            return {"scores": {"rmsd": rmsd, "score": ivalue, "coverage": coverage}}
        else:
            # fixed_com, moving_com, rotation and quaternion can only be obtained
            # if the patched mmligner is used (check /devtools/conda-recipes/mmligner)
            # -- this will fail in CI for now --
            translation = fixed_com - moving_com

            alignment = fasta.FastaFile()
            alignment.read("temp__1.afasta")

            return {
                "scores": {"rmsd": rmsd, "score": ivalue, "coverage": coverage},
                "metadata": {
                    "alignment": alignment,
                    "rotation": rotation,
                    "translation": translation,
                    "quaternion": quaternion,
                },
            }

    def _parse_scoring(self, output):
        """
        Retrieves RMSD, score and ivalue from the output of the MMLigner subprocess.

        Parameters
        ----------
        output: str
            string containing the stdout of the mmligener call

        Returns
        -------
        dict
            As returned by ``._parse_scoring(output)``.
            - ``scores`` (dict):
                - ``rmsd`` (float): RMSD value of the alignment
                - ``score`` (float): ivalue of the alignment
                - ``coverage`` (float): coverage of the alignment
        """
        lines = iter(output.splitlines())
        for line in lines:
            if line.startswith("RMSD"):
                rmsd = float(line.split()[2])
            elif line.startswith("Coverage"):
                coverage = float(line.split()[2])
            elif line.startswith("I(A & <S,T>)"):
                ivalue = float(line.split()[4])

        return {
            "scores": {"rmsd": rmsd, "score": ivalue, "coverage": coverage},
        }

    def _calculate_transformed(self, structures, metadata):
        """
        Parse back output PDBs and construct updated Structure objects.

        Parameters
        ----------
        structures: list of opencadd.core.Structure
            Original input structures

        Returns
        -------
        list of opencadd.core.Structure
            Input structures with updated coordinates
        """
        ref, mobile, *_ = structures
        translation = metadata["translation"]
        rotation = metadata["rotation"]

        mob_com = mobile.atoms.center_of_geometry()
        ref_com = ref.atoms.center_of_geometry()

        mobile.atoms.translate(-mob_com)
        mobile.atoms.rotate(rotation)
        mobile.atoms.translate(ref_com)

        return ref, mobile

[docs]    def ivalue(self, structures, alignment):
        """
        Parse back output PDBs and construct updated Structure models.

        Parameters
        ----------
        structures: [array like, array like]
            sequences of two protein structures of same length
        alignment: biotite.alignment
            alignment of the given two sequences

        Returns
        -------
        dict
            As returned by ``._parse_scoring(output)``.

            - ``scores`` (dict):
                - ``rmsd`` (float): RMSD value of the alignment
                - ``score`` (float): ivalue of the alignment
                - ``coverage`` (float): coverage of the alignment
        """

        with enter_temp_directory() as (cwd, tmpdir):
            paths = "structure1.pdb", "structure2.pdb"
            structures[0].select_atoms(self.protein_selector).write(paths[0])
            structures[1].select_atoms(self.protein_selector).write(paths[1])

            fasta_file = fasta.FastaFile()

            for header, string in alignment.items():
                fasta_file[header] = string

            fasta_file.write("temp_alignment.afasta")

            self._edit_fasta("temp_alignment.afasta")

            output = subprocess.check_output(
                [self.executable, paths[0], paths[1], "--ivalue", "temp_alignment.afasta"]
            )
            # We need access to the temporary files at parse time!
            result = self._parse_scoring(output.decode())

        return result

    def _edit_pdb(self, structures, path=("structure1.pdb", "structure2.pdb")):
        """
        Method to write Structure protein models to PDBs readable by MMLigner.

        Parameters
        ----------
        structures: [array like, array like]
            two protein structures

        path: [str, str], Optional=["structure1.pdb, "structure2.pdb"]
            Path where the pdbs should be written

        Returns
        -------
        str, str
            Paths of both structures

        .. note::

            This is a temporary workaround to fix issue #9 at:
            https://github.com/volkamerlab/opencadd/issues/9
        """
        assert len(path) == 2

        structures[0].select_atoms(self.protein_selector).write(path[0])
        structures[1].select_atoms(self.protein_selector).write(path[1])

        for i in range(len(path)):
            pdb = []
            with open(path[i], "r") as infile:
                pdb = infile.readlines()
                for j in range(1, len(pdb)):
                    if pdb[j].startswith("TER"):
                        pdb[j] = pdb[j].split()[0] + "    " + pdb[j - 1].split()[1] + "\n"

            self._write_pdb(path[i], pdb)

        return path[0], path[1]

    def _write_pdb(self, path, pdb):
        """
        Method to write Structure protein models to PDBs readable by MMLigner.

        Parameters
        ----------
        path: str
            Path where the pdb should be written

        pdb: array-like
            edited pdb file

        .. note::

            This is a temporary workaround to fix issue #9 at:
            https://github.com/volkamerlab/opencadd/issues/9
        """
        with open(path, "w") as outfile:
            for line in pdb:
                outfile.write(line)

    def _edit_fasta(self, path):
        """
        Method to edit FASTA files written by biotite to FASTA files readable by MMLigner. This is needed,
        because MMLigner expects an empty line after each sequence.

        Parameters
        ----------
        path: str
            Path to the fasta file that is to be edited.
        """
        with open(path, "r") as fasta:
            data = fasta.readlines()

        lines = iter(data)
        for line in lines:
            if next(lines).startswith(">structure2.pdb"):
                line = line + "\n"

        data[-1] = data[-1] + "\n"

        with open(path, "w") as fasta:
            for line in data:
                fasta.write(line)