Source code for opencadd.structure.superposition.sequences

"""
Utilities for sequence alignment
"""
import logging

import numpy as np
import biotite.sequence.align as seq_align
import biotite.sequence as seq
import Bio

logger = logging.getLogger(__name__)


[docs]def matrices(name): """ A SubstitutionMatrix maps each possible pairing of a symbol of a first alphabet with a symbol of a second alphabet to a score (int) Parameters ---------- name: string Name of the matrix which is loaded from the internal matrix database. If the name of Substitution Matrix could not be found, the default SubstitutionMatrix will be BLOSUM62. Returns ------- SubstitutionMatrix The class uses a 2-D (m x n) ndarray, where each element stores the score for a symbol pairing, indexed by the symbol codes of the respective symbols in an m-length alphabet 1 and an n-length alphabet 2 """ if name == "BLOSUM62": matrix = seq_align.SubstitutionMatrix.std_protein_matrix() else: alph = seq.ProteinSequence.alphabet matrix = seq_align.SubstitutionMatrix(alph, alph, name) return matrix
[docs]def sequence_alignment(seq1: str, seq2: str, matrix: str, gap: int, local: bool = False) -> str: """ Perform a global alignment, based on the Needleman-Wunsch algorithm Parameters ---------- seq1,seq2: str The sequences to be aligned matrix: SubstitutionMatrix The substitution matrix used for scoring gap: int or (tuple, dtype=int) Int the value will be interpreted as general gap penalty. Tupel is provided, an affine gap penalty is used. The first integer in the tuple is the gap opening penalty, the second integer is the gap extension penalty. The values need to be negative. local : bool, optional, default=False Whether to use local alignment (Smith-Waterman) or global (Needleman-Wunsch) Returns ------- str An optimal alignment of two sequences """ matrix = matrices(matrix) alignment = seq_align.align_optimal( seq.ProteinSequence(seq1), seq.ProteinSequence(seq2), matrix, gap_penalty=gap, local=local, ) return alignment[0]
[docs]def fasta2select( fastafilename, ref_resids=None, ref_segids=None, target_resids=None, target_segids=None, backbone_selection="backbone or name CB", ): """Return selection strings that will select equivalent residues. The function takes two pre-aligned sequences in a FASTA file and constructs MDAnalysis selection strings of the common atoms. When these two strings are applied to the two different proteins they will generate AtomGroups of the aligned residues. Parameters ---------- fastafilename : str, path to filename FASTA file with first sequence as reference and second the one to be aligned (ORDER IS IMPORTANT!) ref_resids : list (optional) sequence of resids as they appear in the reference structure target_resids : list (optional) sequence of resids as they appear in the target ref_segids : list (optional) sequence of segids as they appear in the reference structure target_segids : list (optional) sequence of segids as they appear in the target Returns ------- dict dictionary with 'reference' and 'mobile' selection string """ protein_gapped = Bio.Alphabet.Gapped(Bio.Alphabet.IUPAC.protein) with open(fastafilename) as fasta: alignment = Bio.AlignIO.read(fasta, "fasta", alphabet=protein_gapped) nseq = len(alignment) if nseq != 2: raise ValueError("Only two sequences in the alignment can be processed.") orig_resids = [ref_resids, target_resids] orig_segids = [np.asarray(ref_segids), np.asarray(target_segids)] for iseq, a in enumerate(alignment): # need iseq index to change orig_resids if orig_resids[iseq] is None: # build default: assume consecutive numbering of all # residues in the alignment GAP = a.seq.alphabet.gap_char length = len(a.seq) - a.seq.count(GAP) orig_resids[iseq] = np.arange(1, length + 1) else: orig_resids[iseq] = np.asarray(orig_resids[iseq]) # repeat for segment ids if orig_segids[iseq] is None: # build default: assume consecutive numbering of all # residues in the alignment GAP = a.seq.alphabet.gap_char length = len(a.seq) - a.seq.count(GAP) orig_segids[iseq] = np.full(length, "") else: orig_segids[iseq] = np.asarray(orig_segids[iseq]) seq2resids = list(zip(orig_resids, orig_segids)) def resid_factory(alignment, seq2resids): """Return a function that gives the resid for a position ipos in the nseq'th alignment. resid = resid_factory(alignment,seq2resids) r = resid(nseq,ipos) It is based on a look up table that translates position in the alignment to the residue number in the original sequence/structure. The first index of resid() is the alignmment number, the second the position in the alignment. seq2resids translates the residues in the sequence to resid numbers in the psf. In the simplest case this is a linear map but if whole parts such as loops are ommitted from the protein the seq2resids may have big gaps. Format: a tuple of two numpy arrays; the first array is for the reference, the second for the target, The index in each array gives the consecutive number of the amino acid in the sequence, the value the resid in the structure/psf. Note: assumes that alignments have same length and are padded if necessary. """ # could maybe use Bio.PDB.StructureAlignment instead? nseq = len(alignment) t = np.zeros((nseq, alignment.get_alignment_length()), dtype=int) s = np.zeros((nseq, alignment.get_alignment_length()), dtype=object) for iseq, a in enumerate(alignment): GAP = a.seq.alphabet.gap_char indices = np.cumsum(np.where(np.array(list(a.seq)) == GAP, 0, 1)) - 1 t[iseq, :] = seq2resids[iseq][0][indices] s[iseq, :] = seq2resids[iseq][1][indices] # -1 because seq2resid is index-1 based (resids start at 1) def resid(nseq, ipos, t=t, s=s): return t[nseq, ipos], s[nseq, ipos] return resid resid = resid_factory(alignment, seq2resids) res_list = [] # collect individual selection string # should be the same for both seqs GAP = alignment[0].seq.alphabet.gap_char if GAP != alignment[1].seq.alphabet.gap_char: raise ValueError("Different gap characters in sequence 'target' and 'mobile'.") for ipos in range(alignment.get_alignment_length()): aligned = list(alignment[:, ipos]) if GAP in aligned: continue # skip residue if it's not matching any template = "( resid {:d} and segid {:s}" + f" and ( {backbone_selection} ) )" res_list.append([template.format(*resid(iseq, ipos)) for iseq in range(nseq)]) sel = np.array(res_list).transpose() ref_selection = " or ".join(sel[0]) target_selection = " or ".join(sel[1]) return {"reference": ref_selection, "mobile": target_selection}