Source code for opencadd.structure.superposition.sequences

"""
Utilities for sequence alignment
"""
import logging

import numpy as np
import biotite.sequence.align as seq_align
import biotite.sequence as seq
import Bio

logger = logging.getLogger(__name__)


[docs]def matrices(name):
    """
    A SubstitutionMatrix maps each possible pairing of a symbol of a first alphabet with
    a symbol of a second alphabet to a score (int)

    Parameters
    ----------
    name: string
        Name of the matrix which is loaded from the internal matrix database.
        If the name of Substitution Matrix could not be found, the default SubstitutionMatrix
        will be BLOSUM62.

    Returns
    -------
    SubstitutionMatrix
        The class uses a 2-D (m x n) ndarray, where each element stores the score
        for a symbol pairing, indexed by the symbol codes of the respective symbols
        in an m-length alphabet 1 and an n-length alphabet 2

    """
    if name == "BLOSUM62":
        matrix = seq_align.SubstitutionMatrix.std_protein_matrix()
    else:
        alph = seq.ProteinSequence.alphabet
        matrix = seq_align.SubstitutionMatrix(alph, alph, name)

    return matrix


[docs]def sequence_alignment(seq1: str, seq2: str, matrix: str, gap: int, local: bool = False) -> str:
    """
    Perform a global alignment, based on the Needleman-Wunsch algorithm

    Parameters
    ----------
    seq1,seq2: str
        The sequences to be aligned

    matrix: SubstitutionMatrix
        The substitution matrix used for scoring

    gap: int or (tuple, dtype=int)
         Int the value will be interpreted as general gap penalty.
         Tupel is provided, an affine gap penalty is used. The first integer in the tuple is the gap opening penalty,
         the second integer is the gap extension penalty. The values need to be negative.

    local : bool, optional, default=False
        Whether to use local alignment (Smith-Waterman) or global (Needleman-Wunsch)

    Returns
    -------
    str
        An optimal alignment of two sequences
    """

    matrix = matrices(matrix)
    alignment = seq_align.align_optimal(
        seq.ProteinSequence(seq1),
        seq.ProteinSequence(seq2),
        matrix,
        gap_penalty=gap,
        local=local,
    )
    return alignment[0]


[docs]def fasta2select(
    fastafilename,
    ref_resids=None,
    ref_segids=None,
    target_resids=None,
    target_segids=None,
    backbone_selection="backbone or name CB",
):
    """Return selection strings that will select equivalent residues.

    The function takes two pre-aligned sequences in a FASTA file and
    constructs MDAnalysis selection strings of the common atoms. When
    these two strings are applied to the two different proteins they
    will generate AtomGroups of the aligned residues.


    Parameters
    ----------
    fastafilename : str, path to filename
        FASTA file with first sequence as reference and
        second the one to be aligned (ORDER IS IMPORTANT!)
    ref_resids : list (optional)
        sequence of resids as they appear in the reference structure
    target_resids : list (optional)
        sequence of resids as they appear in the target
    ref_segids : list (optional)
        sequence of segids as they appear in the reference structure
    target_segids : list (optional)
        sequence of segids as they appear in the target
    Returns
    -------
    dict
        dictionary with 'reference' and 'mobile' selection string

    """
    protein_gapped = Bio.Alphabet.Gapped(Bio.Alphabet.IUPAC.protein)
    with open(fastafilename) as fasta:
        alignment = Bio.AlignIO.read(fasta, "fasta", alphabet=protein_gapped)

    nseq = len(alignment)
    if nseq != 2:
        raise ValueError("Only two sequences in the alignment can be processed.")

    orig_resids = [ref_resids, target_resids]
    orig_segids = [np.asarray(ref_segids), np.asarray(target_segids)]
    for iseq, a in enumerate(alignment):
        # need iseq index to change orig_resids
        if orig_resids[iseq] is None:
            # build default: assume consecutive numbering of all
            # residues in the alignment
            GAP = a.seq.alphabet.gap_char
            length = len(a.seq) - a.seq.count(GAP)
            orig_resids[iseq] = np.arange(1, length + 1)
        else:
            orig_resids[iseq] = np.asarray(orig_resids[iseq])
        # repeat for segment ids
        if orig_segids[iseq] is None:
            # build default: assume consecutive numbering of all
            # residues in the alignment
            GAP = a.seq.alphabet.gap_char
            length = len(a.seq) - a.seq.count(GAP)
            orig_segids[iseq] = np.full(length, "")
        else:
            orig_segids[iseq] = np.asarray(orig_segids[iseq])
    seq2resids = list(zip(orig_resids, orig_segids))

    def resid_factory(alignment, seq2resids):
        """Return a function that gives the resid for a position ipos in
        the nseq'th alignment.

        resid = resid_factory(alignment,seq2resids)
        r = resid(nseq,ipos)

        It is based on a look up table that translates position in the
        alignment to the residue number in the original
        sequence/structure.

        The first index of resid() is the alignmment number, the
        second the position in the alignment.

        seq2resids translates the residues in the sequence to resid
        numbers in the psf. In the simplest case this is a linear map
        but if whole parts such as loops are ommitted from the protein
        the seq2resids may have big gaps.

        Format: a tuple of two numpy arrays; the first array is for
        the reference, the second for the target, The index in each
        array gives the consecutive number of the amino acid in the
        sequence, the value the resid in the structure/psf.

        Note: assumes that alignments have same length and are padded if
        necessary.
        """
        # could maybe use Bio.PDB.StructureAlignment instead?
        nseq = len(alignment)
        t = np.zeros((nseq, alignment.get_alignment_length()), dtype=int)
        s = np.zeros((nseq, alignment.get_alignment_length()), dtype=object)
        for iseq, a in enumerate(alignment):
            GAP = a.seq.alphabet.gap_char
            indices = np.cumsum(np.where(np.array(list(a.seq)) == GAP, 0, 1)) - 1
            t[iseq, :] = seq2resids[iseq][0][indices]
            s[iseq, :] = seq2resids[iseq][1][indices]
            # -1 because seq2resid is index-1 based (resids start at 1)

        def resid(nseq, ipos, t=t, s=s):
            return t[nseq, ipos], s[nseq, ipos]

        return resid

    resid = resid_factory(alignment, seq2resids)
    res_list = []  # collect individual selection string

    # should be the same for both seqs
    GAP = alignment[0].seq.alphabet.gap_char
    if GAP != alignment[1].seq.alphabet.gap_char:
        raise ValueError("Different gap characters in sequence 'target' and 'mobile'.")
    for ipos in range(alignment.get_alignment_length()):
        aligned = list(alignment[:, ipos])
        if GAP in aligned:
            continue  # skip residue if it's not matching any
        template = "( resid {:d} and segid {:s}" + f" and ( {backbone_selection} ) )"
        res_list.append([template.format(*resid(iseq, ipos)) for iseq in range(nseq)])

    sel = np.array(res_list).transpose()

    ref_selection = " or ".join(sel[0])
    target_selection = " or ".join(sel[1])
    return {"reference": ref_selection, "mobile": target_selection}