rmsd.py

import numpy as np
import Bio.PDB as pdb
from os.path import exists, splitext
from typing import Optional
import BioHelpers_FABER.bio_mod as bm

# All the notation is oriented on Using Quaternions to Calculate RMSD from Coutsias


def calc_bary(x: np.ndarray) -> np.ndarray:
    """Calculation of the barycenter"""
    num_vect = np.shape(x)[1]
    return np.sum(x, axis=1) / num_vect


def mat_R(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Calculation of R matrix"""
    return np.matmul(x, np.transpose(y))


def mat_F(r: np.ndarray) -> np.ndarray:
    f = np.ndarray(shape=(4, 4), dtype=float)
    f[0, 0] = r[0, 0] + r[1, 1] + r[2, 2]
    f[0, 1] = r[1, 2] - r[2, 1]
    f[0, 2] = r[2, 0] - r[0, 2]
    f[0, 3] = r[0, 1] - r[1, 0]
    f[1, 0] = r[1, 2] - r[2, 1]
    f[1, 1] = r[0, 0] - r[1, 1] - r[2, 2]
    f[1, 2] = r[0, 1] + r[1, 0]
    f[1, 3] = r[0, 2] + r[2, 0]
    f[2, 0] = r[2, 0] - r[0, 2]
    f[2, 1] = r[0, 1] + r[1, 0]
    f[2, 2] = -r[0, 0] + r[1, 1] - r[2, 2]
    f[2, 3] = r[1, 2] + r[2, 1]
    f[3, 0] = r[0, 1] - r[1, 0]
    f[3, 1] = r[0, 2] + r[2, 0]
    f[3, 2] = r[1, 2] + r[2, 1]
    f[3, 3] = -r[0, 0] - r[1, 1] + r[2, 2]
    return f


def mat_U(q: np.ndarray) -> np.ndarray:
    """Calculation of the rotation matrix U from quaternion q

    :param q: Quaterion
    :type q: np.ndarray
    :return: 3x3 rotation matrix
    :rtype: np.ndarray
    """
    u = np.ndarray(shape=(3, 3), dtype=float)
    u[0, 0] = q[0] ** 2 + q[1] ** 2 - q[2] ** 2 - q[3] ** 2
    u[0, 1] = 2 * (q[1] * q[2] - q[0] * q[3])
    u[0, 2] = 2 * (q[1] * q[3] + q[0] * q[2])
    u[1, 0] = 2 * (q[1] * q[2] + q[0] * q[3])
    u[1, 1] = q[0] ** 2 - q[1] ** 2 + q[2] ** 2 - q[3] ** 2
    u[1, 2] = 2 * (q[2] * q[3] - q[0] * q[1])
    u[2, 0] = 2 * (q[1] * q[3] - q[0] * q[2])
    u[2, 1] = 2 * (q[2] * q[3] + q[0] * q[1])
    u[2, 2] = q[0] ** 2 - q[1] ** 2 - q[2] ** 2 + q[3] ** 2
    return u


def get_rotation_matrix(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Get the rotation matrix to a corresponding set of atom vectors

    :param x: set 1
    :type x: np.ndarray
    :param y: set 2
    :type y: np.ndarray
    :return: rotation matrix U(q_max)
    :rtype: np.ndarray
    """
    # Find the barycenter and shift input sets
    shift_x = np.transpose(np.transpose(x) - calc_bary(x))
    shift_y = np.transpose(np.transpose(y) - calc_bary(y))

    # Calculate the max eigenvalue for F
    eigenval, eigenvec = np.linalg.eig(mat_F(mat_R(shift_x, shift_y)))
    index_max = np.argmax(eigenval)
    return mat_U(eigenvec[:, index_max])


def get_translation_vector(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Get the translation vector :math:`\mathbf{r}` |br|
    **Important Note:** The translation vector differs from Kabsch algorithm, there you translate befor rotating. In Coutsias paper it is the otherway round.

    :param x: set of vectors
    :type x: np.ndarray
    :param y: set of vectors
    :type y: np.ndarray
    :return: translation vector w/ shape (3,1)
    :rtype: np.ndarray
    """
    rot_mat = get_rotation_matrix(x, y)
    r = calc_bary(y) - np.matmul(rot_mat, calc_bary(x))
    return r


def rmsd_from_rot_tran(
    x: np.ndarray, y: np.ndarray, rot: np.ndarray, trans: np.ndarray
) -> float:
    """Calculate an RMSD like value for a given set of vectors x and y with a given rotation matrix and translation vector
    Mathematical formula:

    .. math:: \mathrm{E} = \\frac{1}{M} \sum_{k=1..M} |\mathscr{U}\mathbf{x}^k + \mathbf{r} - \mathbf{y}^k|^2

    **Important Note:** The translation vector differs from Kabsch algorithm, there you translate before rotating. In Coutsias paper it is the otherway round.

    :param x: Set of vectors
    :type x: np.ndarray
    :param y: Set of vectors
    :type y: np.ndarray
    :param rot: rotation matrix (3x3)
    :type rot: np.ndarray
    :param trans: Translation vector r, see note above
    :type trans: np.ndarray
    :return: RMSD like value (:math:`\sqrt{E}`)
    :rtype: float
    """
    num_vec = x.shape[1]
    intermediate = np.matmul(rot, x) + trans[:, np.newaxis] + y
    squared = np.trace(np.matmul(np.transpose(intermediate), intermediate)) / num_vec
    return np.sqrt(squared)


def rmsd(x: np.ndarray, y: np.ndarray) -> float:
    """Calculation of Root Mean Square Displacement

    :param x: Array of all atoms, molecule 1
    :type x: np.ndarray
    :param y: Array of all atoms, molecule 2
    :type y: np.ndarray
    :return: RMSD
    :rtype: float
    """

    num_vect = np.shape(x)[1]

    # Find the barycenter and shift input sets
    shift_x = np.transpose(np.transpose(x) - calc_bary(x))
    shift_y = np.transpose(np.transpose(y) - calc_bary(y))

    # Calculate the max eigenvalue for F
    eigenval, _ = np.linalg.eig(mat_F(mat_R(shift_x, shift_y)))
    lam = np.sort(eigenval)[-1]  # Choose largest eigenvalue

    # Squared vectors
    x_square = np.trace(np.matmul(np.transpose(shift_x), shift_x))
    y_square = np.trace(np.matmul(np.transpose(shift_y), shift_y))

    # RMSD
    meanSquareDis = round((x_square + y_square - 2 * lam) / num_vect, 4)

    return np.sqrt(meanSquareDis)


def __get_chain_from_file(file: str) -> pdb.Chain.Chain:
    if not exists(file):
        raise FileNotFoundError
    _, extension = splitext(file)
    if extension == ".pdb":
        structure = pdb.PDBParser().get_structure("", file)[0]
        chain = next(structure.get_chains())
        return chain
    elif extension == ".cif":
        structure = pdb.MMCIFParser().get_structure("", file)[0]
        chain = next(structure.get_chains())
        return chain
    else:
        raise ValueError


def rmsd_pdb(file1: str, file2: str, exclude_residues: Optional[list] = None) -> float:
    """Calculate RMSD for two given PDBs

    :param file1: Filename/Path of PDB/CIF1
    :type file1: str
    :param file2: Filename/Path of PDB/CIF2
    :type file2: str
    :param exclude_residues: Which residues shouldn't be considered for RMSD (eg. [(1,3), (9,12)] for not considering residue 1,2,3 and residue 9,10,11,12)
    :type exclude_residues: list
    :return: RMSD as float, if files do not exist return 0
    :rtype: float
    """
    p1 = __get_chain_from_file(file1)
    p2 = __get_chain_from_file(file2)
    bm.cleanup_chain(p1)
    bm.cleanup_chain(p2)
    if exclude_residues != None:
        for border in exclude_residues:
            ids = list(range(border[0], border[1] + 1))
            for i in ids:
                p1.detach_child((" ", i, " "))
                p2.detach_child((" ", i, " "))
    # p1_AA = [list(a.get_vector()) for a in p1.get_atoms()]
    # p2_AA =  [list(a.get_vector()) for a in p2.get_atoms()]
    p1_AA = [
        list(a.get_vector())
        for a in p1.get_atoms()
        if a.id != "OP3" and a.element != "H"
    ]
    p2_AA = [
        list(a.get_vector())
        for a in p2.get_atoms()
        if a.id != "OP3" and a.element != "H"
    ]
    p1_AA = np.transpose(np.array(p1_AA))
    p2_AA = np.transpose(np.array(p2_AA))
    return rmsd(p1_AA, p2_AA)


def rmsd_per_residue(file1: str, file2: str) -> list[float]:
    """RMSD for all residues seperately

    :param file1: PDB/cif File 1
    :type file1: str
    :param file2: PDB/cif File 2
    :type file2: str
    :return: List of length L (number of residues) with all rmsds per residue
    :rtype: list[float]
    """
    values = []
    p1 = __get_chain_from_file(file1)
    p2 = __get_chain_from_file(file2)
    bm.cleanup_chain(p1)
    bm.cleanup_chain(p2)
    p1_AA = [
        list(a.get_vector())
        for a in p1.get_atoms()
        if a.id != "OP3" and a.element != "H"
    ]
    p2_AA = [
        list(a.get_vector())
        for a in p2.get_atoms()
        if a.id != "OP3" and a.element != "H"
    ]
    p1_AA = np.transpose(np.array(p1_AA))
    p2_AA = np.transpose(np.array(p2_AA))
    rot, trans = get_rotation_matrix(p1_AA, p2_AA), get_translation_vector(p1_AA, p2_AA)
    for res1, res2 in zip(p1.get_residues(), p2.get_residues()):
        res1_AA = [list(a.get_vector()) for a in res1.get_atoms() if a.element != "H"]
        res2_AA = [list(a.get_vector()) for a in res2.get_atoms() if a.element != "H"]
        res1_AA = np.transpose(np.array(res1_AA))
        res2_AA = np.transpose(np.array(res2_AA))
        values.append(rmsd_from_rot_tran(res1_AA, res2_AA, rot, trans))

    return values


if __name__ == "__main__":
    print("------------------")
    print("     RMSD 1.1     ")
    print("------------------\n")
    print("Module to calculate the RMSD.")