Source code for opencadd.structure.superposition.engines.theseus

"""
Theseus superposes a set of macromolecular structures simultaneously
using the method of maximum like-lihood (ML), rather than the
conventional least-squares criterion. Theseus assumes that the
structures are distributed according to a matrix Gaussian
distribution and that the eigenvalues of the atomic
covariancematrix are hierarchically distributed according
to an inverse gamma distribution. This ML superpositioning model
produces much more accurate results by essentially downweighting
variable regions of the structuresand by correcting for correlations
among atoms.

References
----------
* https://theobald.brandeis.edu/theseus/
* Optimal simultaneous superpositioning of multiple structures with missing data.
  Theobald, Douglas L. & Steindel, Philip A. (2012) Bioinformatics 28 (15): 1972-1979
* Accurate structural correlations from maximum likelihood superpositions.
  Theobald, Douglas L. & Wuttke, Deborah S. (2008) PLOS Computational Biology 4(2):e43
"""
import os
import subprocess
import logging
from distutils.spawn import find_executable

import numpy as np

from .base import BaseAligner
from ....utils import enter_temp_directory


_logger = logging.getLogger(__name__)


[docs]class TheseusAligner(BaseAligner): """ Superpose structures with different sequences but similar structures Parameters ---------- alignment_max_iterations : int number of iterations for alignment program (muscle) statistics_only : bool if True, add `-I` flag to just compute the statistics (no superposition) """ def __init__(self, alignment_max_iterations: int = 32, statistics_only: bool = False) -> None: self.alignment_max_iterations = alignment_max_iterations self.statistics_only = statistics_only self._fastafile = "theseus.fasta" self._filemap_file = "theseus.filemap" self._alignment_file = "theseus.aln" self._alignment_log = "muscle.log" self._alignment_file_biotite = "theseus_biotite.aln" self._alignment_executable = "muscle" self._theseus_transformation_file = "theseus_transf.txt" def _safety_checks(self): """ Check if `theseus` is installed (executable found?). Raises ------ OSError Raises error if executable `theseus` cannot be found. """ theseus = find_executable("theseus") if theseus is None: raise OSError("theseus cannot be located. Is it installed?") # proceed normally def _calculate(self, structures, *args, **kwargs) -> dict: """ Align the sequences with an alignment tool (``muscle``) A mode for superpositioning macromolecules with identical sequences and numbers of residues, for instance, multiple models in an NMR family or multiple structures from different crystal forms of the same protein. Parameters ---------- structures : list list of opencadd.core.Structures objects **kwargs : dict optional parameters Returns ------- dict As returned by ``._parse_superposition(output)``. - ``scores`` (dict) ``rmsd`` (float): RMSD value of the alignment - ``metadata`` (dict) ``least_squares``: least squares ``maximum_likelihood``: maximum likelihood ``log_marginal_likelihood``: log marginal likelihood ``aic``: Akaike information criterion ``bic``: Bayesion information criterion ``omnibus_chi_square``: omnibus chi square ``hierarchical_var_chi_square``: hierarchical var chi square ``rotational_translational_covar_chi_square``: rotational translational covar chi square ``hierarchical_minimum_var``: hierarchical minimum var ``hierarchical_minimum_sigma``: hierarchical minimum sigma ``skewness``: skewness ``skewness_z``: skewness z ``kurtosis``: kurtosis ``kurtosis_z``: kurtosis z ``data_pts``: data points ``free_params``: free params ``d_p``: d_p ``median_structure``: median_structure ``n_total``: number total ``n_atoms``: number of atoms ``n_structures``: number of structures ``total_rounds``: total rounds """ with enter_temp_directory(remove=True) as (cwd, tmpdir): _logger.debug("All files are located in: %s", tmpdir) # 1st - Dump PDBs to disk filenames = [] for index, structure in enumerate(structures): filename = f"structure_{index}.pdb" structure.write(filename) filenames.append(filename) # 2nd - Prepare sequence alignment with MUSCLE seq_alignment_output = self._run_alignment(filenames) _logger.info(seq_alignment_output) # 3rd - Run theseus superposition itself theseus_output = subprocess.check_output( [ "theseus", "-f", ] + (["-I"] if self.statistics_only else []) + [ "-M", self._filemap_file, "-A", self._alignment_file, "-o", # all matrices with respect to this file filenames[0], *filenames, ], universal_newlines=True, ) _logger.debug(theseus_output) results = self._parse_superposition(theseus_output) # 4th reapply transformation to the mobile model mobile = structures[1] transformation = results["metadata"]["transformation"] rotation = transformation[:, :3] translation = transformation[:, 3:].reshape((3,)) mobile.atoms.translate(translation) mobile.atoms.rotate(rotation) results["superposed"] = structures return results def _get_fasta(self, filenames) -> str: """ Use Theseus to create fasta files Parameter --------- filenames : list list of pdb filenames Returns ------- str Output of the created fasta filenames """ return subprocess.check_output(["theseus", "-f", "-F", *filenames]) def _concatenate_fasta(self, filenames) -> None: """ Concatenate the fasta files created with Theseus into one multiple sequence fasta file Parameter --------- filenames : list list of pdb filenames """ with open(self._fastafile, "w") as outfile: for fname in filenames: with open(f"{fname}.fst") as infile: for line in infile: outfile.write(line) def _filemap(self, filenames) -> None: """ Create filemap for Theseus Parameter --------- filenames : list list of pdb filenames """ with open(self._filemap_file, "w") as outfile: for fname in filenames: outfile.write(f"{fname} {fname}\n") def _run_alignment(self, filenames): """ Run MUSCLE Returns ------- filenames : list of str Paths to PDB files containing the structures str Output of MUSCLE """ self._get_fasta(filenames) self._concatenate_fasta(filenames) self._filemap(filenames) output = subprocess.check_output( [ self._alignment_executable, "-maxiters", str(self.alignment_max_iterations), "-in", self._fastafile, "-out", self._alignment_file, "-clwstrict", "-verbose", "-log", self._alignment_log, ], stderr=subprocess.PIPE, universal_newlines=True, ) _logger.info(output) return output def _get_transformation_matrix(self) -> dict: """ Get the rotation matrix and translation vector Returns ------- dict Rotation matrix and translation vector """ translations, rotations = {}, {} if not os.path.isfile(self._theseus_transformation_file): return None with open(self._theseus_transformation_file, "r") as infile: lines = infile.readlines() for line in lines: if " R:" in line: # rotation matrix model, matrix = line.split(":") model_id = int(model.split()[1]) matrix_array = np.reshape(list(map(float, matrix.split())), (-1, 3)) rotations[model_id] = matrix_array elif " t:" in line: # translation vector model, vector = line.split(":") model_id = int(model.split()[1]) vector_array = np.array([[float(x)] for x in vector.split()]) translations[model_id] = vector_array matrices = {} for model_id, rotation in rotations.items(): translation = translations[model_id] matrix = np.empty((3, 4)) matrix[:, :3] = rotation matrix[:, 3:] = translation matrices[model_id] = matrix return matrices[2] def _parse_superposition(self, output) -> dict: """ Parse the output from theseus itself Parameters ---------- output : bytes Returns ------- dict All the information provided by Theseus """ for line in output.splitlines(): blocks = line.split() if "Classical LS pairwise <RMSD>" in line: rmsd = float(blocks[5]) elif "Least-squares <sigma>" in line: least_squares = float(blocks[3]) elif "Maximum Likelihood <sigma>" in line: maximum_likelihood = float(blocks[4]) elif "Marginal Log Likelihood" in line: log_marginal_likelihood = float(blocks[4]) elif "AIC" in line: aic = float(blocks[2]) elif "BIC" in line: bic = float(blocks[2]) elif "Omnibus chi^2" in line: omnibus_chi_square = float(blocks[3]) elif "Hierarchical var" in line: hierarchical_var_chi_square = float(blocks[6]) elif "Rotational, translational, covar" in line: rotational_translational_covar_chi_square = float(blocks[5]) elif "Hierarchical minimum var" in line: hierarchical_minimum_var = float(blocks[5]) hierarchical_minimum_sigma = float(blocks[-1][1:-1]) elif "skewness Z-value" in line: skewness_z = float(blocks[3]) elif "skewness" in line: skewness = float(blocks[2]) elif "kurtosis Z-value" in line: kurtosis_z = float(blocks[3]) elif "kurtosis" in line: kurtosis = float(blocks[2]) elif "Data pts" in line: fields = line.split(",") data_pts = float(fields[0].split()[-1]) free_params = float(fields[1].split()[-1]) d_p = float(fields[2].split()[-1]) elif "Median" in line: median_structure = float(blocks[4][1:]) elif "N(total)" in line: fields = line.split(",") n_total = float(fields[0].split()[-1]) n_atoms = float(fields[1].split()[-1]) n_structures = float(fields[2].split()[-1]) elif "Total rounds" in line: total_rounds = float(blocks[3]) return { "scores": {"rmsd": rmsd}, "metadata": { "transformation": self._get_transformation_matrix(), "least_squares": least_squares, "maximum_likelihood": maximum_likelihood, "log_marginal_likelihood": log_marginal_likelihood, "aic": aic, "bic": bic, "omnibus_chi_square": omnibus_chi_square, "hierarchical_var_chi_square": hierarchical_var_chi_square, "rotational_translational_covar_chi_square": rotational_translational_covar_chi_square, "hierarchical_minimum_var": hierarchical_minimum_var, "hierarchical_minimum_sigma": hierarchical_minimum_sigma, "skewness": skewness, "skewness_z": skewness_z, "kurtosis": kurtosis, "kurtosis_z": kurtosis_z, "data_pts": data_pts, "free_params": free_params, "d_p": d_p, "median_structure": median_structure, "n_total": n_total, "n_atoms": n_atoms, "n_structures": n_structures, "total_rounds": total_rounds, }, }