Source code for opencadd.structure.superposition.engines.mmligner

"""
MMLigner (Collier et al., 2017) works by minimizing the ivalue of the alignment. The ivalue is based on
the Minimum Message Length framework (Wallace and Boulton, 1968; Wallace, 2005), a Bayesian framework for
statistical inductive inference. The ivalue represents the hypothetical minimum message length needed to transmit
the computed alignment losslessly (Shannon, 1948).
Using the ivalue measure, the algorithm creates crude-but-effective strucural alignments rapidly to act as seeds.
These seeds are iteratively refined over an Expectation-Maximization approach using the I-value criterion.
By substracting the ivalue from the null model, the statistical significance of the alignment can be computed. If the
difference is positive, the alignment is significant.

Collier, J.H., Allison, L., Lesk A.M., Stuckey, P.J., Garcia de la Banda , M., Konagurthu, A.S. (2017)
Statistical inference of protein structural alignments using information and compression.
Bioinformatics, 33(7), 1005-1013

Wallace,C.S. and Boulton,D.M. (1968) An information measure for classification.
Comput. J., 11, 185–194.

Wallace,C.S. (2005) Statistical and Inductive Inference Using MinimumMessage Length.
Information Science and Statistics. SpringerVerlag, New York, NY.

Shannon,C.E. (1948) A mathematical theory of communication.
Bell Syst.Tech. J., 27, 379–423.
"""

import sys
import subprocess
import logging
from distutils.spawn import find_executable

import numpy as np
import biotite.sequence.io.fasta as fasta

from .base import BaseAligner
from ....utils import enter_temp_directory

_logger = logging.getLogger(__name__)


[docs]class MMLignerAligner(BaseAligner): """ Wraps MMLigner to superpose two protein structures. Parameters ---------- executable : str Path to the MMLigner executable file protein_selection : str, optional, default=protein MMLigner will not accept residues beyond the 20 standard AA. """ def __init__(self, executable=None, protein_selector="protein"): if executable is None: executable = "mmligner64.exe" if sys.platform.startswith("win") else "mmligner" self.executable = executable self.protein_selector = protein_selector _logger.warning( "Current MMLigner wrappers produces accurate RMSD values but slightly shifted structures!" ) def _safety_checks(self): """ Check if `mmligner` is installed (executable found?). Raises ------ OSError Raises error if executable `mmligner` cannot be found. """ mmligner = find_executable("mmligner") if mmligner is None: raise OSError("mmligner cannot be located. Is it installed?") # proceed normally def _calculate(self, structures, *args, **kwargs): """ Calculates the superposition of two protein structures. It is called by BaseAligner.calculate(). Parameters ---------- structures: [array like, array like] Sequences of two protein structures of same length Returns ------- dict As returned by ``._parse_metadata(output)``. - ``superposed`` ([opencadd.core.Structure, opencadd.core.Structure]): the superposed models - ``scores`` (dict): - ``rmsd`` (float): RMSD value of the alignment - ``score`` (float): ivalue of the alignment - ``coverage`` (float): coverage of the alignment - ``metadata`` (dict): - ``alignment`` (biotite.alignment): computed alignment - ``rotation`` (array-like): 3x3 rotation matrix - ``translation`` (np.array): array containing the translation - ``quarternion`` (array-like): 4x4 quarternion matrix """ with enter_temp_directory() as (cwd, tmpdir): # Needed because of the need of a copy of the structures. sys.setrecursionlimit(100000) path1, path2 = self._edit_pdb(structures) output = subprocess.check_output( [self.executable, path1, path2, "-o", "temp", "--superpose"] ) # We need access to the temporary files at parse time! result = self._parse_metadata(output.decode()) # checks if there is metadata in the dict, if not, there was no significant alignment found. if "metadata" in result: superposed_models = self._calculate_transformed(structures, result["metadata"]) result["superposed"] = superposed_models return result def _parse_metadata(self, output): """ Retrieves RMSD, score and metadata from the output of the MMLigner subprocess. Parameters ---------- output: str string of containing the stdout of the mmligener call Returns ------- dict As returned by ``._parse_metadata(output)``. - ``scores`` (dict): - ``rmsd`` (float): RMSD value of the alignment - ``score`` (float): ivalue of the alignment - ``coverage`` (float): coverage of the alignment - ``metadata`` (dict): - ``alignment``: (biotite.alignment): computed alignment - ``rotation``: (array-like): 3x3 rotation matrix - ``translation``: (np.array): array containing the translation - ``quarternion``: (array-like): 4x4 quarternion matrix """ lines = iter(output.splitlines()) for line in lines: if line.startswith("RMSD"): rmsd = float(line.split()[2]) elif line.startswith("Coverage"): coverage = float(line.split()[2]) elif line.startswith("I(A & <S,T>)"): ivalue = float(line.split()[4]) elif "Print Centers of Mass of moving set:" in line: moving_com = np.array([float(x) for x in next(lines).split()]) elif "Print Centers of Mass of fixed set:" in line: fixed_com = np.array([float(x) for x in next(lines).split()]) elif "Print Rotation matrix" in line: rotation = [[float(x) for x in next(lines).split()] for _ in range(3)] elif "Print Quaternion matrix" in line: quaternion = [[float(x) for x in next(lines).split()] for _ in range(4)] # checks if there is a signifcant alignment if rmsd == 0 and coverage == 0: return {"scores": {"rmsd": rmsd, "score": ivalue, "coverage": coverage}} else: # fixed_com, moving_com, rotation and quaternion can only be obtained # if the patched mmligner is used (check /devtools/conda-recipes/mmligner) # -- this will fail in CI for now -- translation = fixed_com - moving_com alignment = fasta.FastaFile() alignment.read("temp__1.afasta") return { "scores": {"rmsd": rmsd, "score": ivalue, "coverage": coverage}, "metadata": { "alignment": alignment, "rotation": rotation, "translation": translation, "quaternion": quaternion, }, } def _parse_scoring(self, output): """ Retrieves RMSD, score and ivalue from the output of the MMLigner subprocess. Parameters ---------- output: str string containing the stdout of the mmligener call Returns ------- dict As returned by ``._parse_scoring(output)``. - ``scores`` (dict): - ``rmsd`` (float): RMSD value of the alignment - ``score`` (float): ivalue of the alignment - ``coverage`` (float): coverage of the alignment """ lines = iter(output.splitlines()) for line in lines: if line.startswith("RMSD"): rmsd = float(line.split()[2]) elif line.startswith("Coverage"): coverage = float(line.split()[2]) elif line.startswith("I(A & <S,T>)"): ivalue = float(line.split()[4]) return { "scores": {"rmsd": rmsd, "score": ivalue, "coverage": coverage}, } def _calculate_transformed(self, structures, metadata): """ Parse back output PDBs and construct updated Structure objects. Parameters ---------- structures: list of opencadd.core.Structure Original input structures Returns ------- list of opencadd.core.Structure Input structures with updated coordinates """ ref, mobile, *_ = structures translation = metadata["translation"] rotation = metadata["rotation"] mob_com = mobile.atoms.center_of_geometry() ref_com = ref.atoms.center_of_geometry() mobile.atoms.translate(-mob_com) mobile.atoms.rotate(rotation) mobile.atoms.translate(ref_com) return ref, mobile
[docs] def ivalue(self, structures, alignment): """ Parse back output PDBs and construct updated Structure models. Parameters ---------- structures: [array like, array like] sequences of two protein structures of same length alignment: biotite.alignment alignment of the given two sequences Returns ------- dict As returned by ``._parse_scoring(output)``. - ``scores`` (dict): - ``rmsd`` (float): RMSD value of the alignment - ``score`` (float): ivalue of the alignment - ``coverage`` (float): coverage of the alignment """ with enter_temp_directory() as (cwd, tmpdir): paths = "structure1.pdb", "structure2.pdb" structures[0].select_atoms(self.protein_selector).write(paths[0]) structures[1].select_atoms(self.protein_selector).write(paths[1]) fasta_file = fasta.FastaFile() for header, string in alignment.items(): fasta_file[header] = string fasta_file.write("temp_alignment.afasta") self._edit_fasta("temp_alignment.afasta") output = subprocess.check_output( [self.executable, paths[0], paths[1], "--ivalue", "temp_alignment.afasta"] ) # We need access to the temporary files at parse time! result = self._parse_scoring(output.decode()) return result
def _edit_pdb(self, structures, path=("structure1.pdb", "structure2.pdb")): """ Method to write Structure protein models to PDBs readable by MMLigner. Parameters ---------- structures: [array like, array like] two protein structures path: [str, str], Optional=["structure1.pdb, "structure2.pdb"] Path where the pdbs should be written Returns ------- str, str Paths of both structures .. note:: This is a temporary workaround to fix issue #9 at: https://github.com/volkamerlab/opencadd/issues/9 """ assert len(path) == 2 structures[0].select_atoms(self.protein_selector).write(path[0]) structures[1].select_atoms(self.protein_selector).write(path[1]) for i in range(len(path)): pdb = [] with open(path[i], "r") as infile: pdb = infile.readlines() for j in range(1, len(pdb)): if pdb[j].startswith("TER"): pdb[j] = pdb[j].split()[0] + " " + pdb[j - 1].split()[1] + "\n" self._write_pdb(path[i], pdb) return path[0], path[1] def _write_pdb(self, path, pdb): """ Method to write Structure protein models to PDBs readable by MMLigner. Parameters ---------- path: str Path where the pdb should be written pdb: array-like edited pdb file .. note:: This is a temporary workaround to fix issue #9 at: https://github.com/volkamerlab/opencadd/issues/9 """ with open(path, "w") as outfile: for line in pdb: outfile.write(line) def _edit_fasta(self, path): """ Method to edit FASTA files written by biotite to FASTA files readable by MMLigner. This is needed, because MMLigner expects an empty line after each sequence. Parameters ---------- path: str Path to the fasta file that is to be edited. """ with open(path, "r") as fasta: data = fasta.readlines() lines = iter(data) for line in lines: if next(lines).startswith(">structure2.pdb"): line = line + "\n" data[-1] = data[-1] + "\n" with open(path, "w") as fasta: for line in data: fasta.write(line)