Source code for openprotein.protein

import io
from collections.abc import Sequence
from pathlib import Path
from typing import TYPE_CHECKING, Literal

import gemmi
import numpy as np
import numpy.typing as npt

from . import fasta

if TYPE_CHECKING:
    from openprotein.align import MSAFuture


# fmt: off
_ATOM_TYPES = (
    'N', 'CA', 'C', 'CB', 'O', 'CG', 'CG1', 'CG2', 'OG', 'OG1', 'SG', 'CD',
    'CD1', 'CD2', 'ND1', 'ND2', 'OD1', 'OD2', 'SD', 'CE', 'CE1', 'CE2', 'CE3',
    'NE', 'NE1', 'NE2', 'OE1', 'OE2', 'CH2', 'NH1', 'NH2', 'OH', 'CZ', 'CZ2',
    'CZ3', 'NZ', 'OXT'
)
# fmt: on
_N_ATOM = len(_ATOM_TYPES)
_ATOM_TYPE_TO_IDX = {atom_type: i for i, atom_type in enumerate(_ATOM_TYPES)}

_BACKBONE_ATOM_TYPES = ("N", "CA", "C")

_EXPERIMENTAL_METHODS = {
    "X-RAY DIFFRACTION",
    "ELECTRON MICROSCOPY",
    "SOLUTION NMR",
    "SOLID-STATE NMR",
    "NEUTRON DIFFRACTION",
    "ELECTRON CRYSTALLOGRAPHY",
    "FIBER DIFFRACTION",
    "POWDER DIFFRACTION",
    "INFRARED SPECTROSCOPY",
    "FLUORESCENCE TRANSFER",
    "EPR",
    "SOLUTION SCATTERING",
}
_NAN_BFACTOR_VALUE = 9999.75  # can't/hard to use 9999.99 due to precision issues



[docs]
class Protein:
    """
    Represents a protein with optional sequence, atomic coordinates, per-residue
    confidence scores (pLDDT), and name.

    This class supports partial or complete information: users may initialize a Protein
    with only a sequence, only a structure, or both. The class ensures that all
    provided fields have consistent residue-level lengths and provides convenient
    methods for indexing, masking, and structural comparisons.

    Attributes:
        sequence: Amino acid sequence as bytes. Unknown or masked residues are
            represented as b"X".
        coordinates: an array containing the 3D coordinates of the heavy atoms of the
            protein in atom37 format. It has shape `(L, 37, 3)`, where `L` is the
            length of the protein, `37` is the number of heavy atoms, and `3` is the
            number of coordinates (x, y, and z).
        plddt: an array of shape `(L,)`. For predicted structures, this contains the
            pLDDT of each residue, which is a measure of prediction confidence. For
            experimental structures, this should be set to `100` if the coordinates of
            the alpha carbon are known, and `NaN` otherwise.
        name: Optional identifier for the protein as a string.

    Conventions:
        - Missing or unknown residues in the sequence are denoted by b"X".
        - Missing structural data (coordinates or pLDDT) are represented by NaN.
        - Residue indices are 1-based for user-facing methods (e.g., `mask_sequence_at`),
          but internally stored as 0-based arrays.

    Examples:
        Create a Protein from sequence only:
            Protein(sequence="ACDEFGHIK")

        Create a Protein from sequence and name:
            Protein(sequence="ACDEFGHIK", name="my_protein")

        Create a Protein with sequence and structure:
            Protein(sequence="ACD", coordinates=coords_array, plddt=plddt_array)

    Raises:
        ValueError: If sequence, coordinates, or pLDDT are specified with inconsistent lengths.
        ValueError: If none of sequence, coordinates, or pLDDT are provided.
    """


[docs]
    def __init__(
        self,
        sequence: bytes | str | None = None,
        coordinates: npt.NDArray[np.float32] | None = None,
        plddt: npt.NDArray[np.float32] | None = None,
        name: bytes | str | None = None,
    ):
        lengths = {len(x) for x in (sequence, coordinates, plddt) if x is not None}
        if len(lengths) == 0:
            raise ValueError(
                "At least one of sequence, coordinates, or plddt must be specified."
            )
        elif len(lengths) > 1:
            raise ValueError(
                "Specified sequence, coordinates, and plddt must all have the same length."
            )
        length = next(iter(lengths))
        if sequence is not None:
            self._sequence = (
                sequence.encode() if isinstance(sequence, str) else sequence
            )
        else:
            self._sequence = b"X" * length
        if coordinates is not None:
            self._coordinates = coordinates
        else:
            self._coordinates = np.full((length, _N_ATOM, 3), np.nan, dtype=np.float32)
        if plddt is not None:
            self._plddt = plddt
        else:
            self._plddt = np.full((length,), np.nan, dtype=np.float32)
        if name is not None:
            self._name = name if isinstance(name, str) else name.decode()
        else:
            self._name = name
        self._tags = {}


    @property
    def name(self) -> str | None:
        return self._name

    @name.setter
    def name(self, x: bytes | str) -> None:
        self._name = x if isinstance(x, str) else x.decode()

    @property
    def sequence(self) -> bytes:
        return self._sequence

    @sequence.setter
    def sequence(self, x: bytes | str) -> None:
        assert len(x) == len(self)
        self._sequence = x.encode() if isinstance(x, str) else x

    @property
    def coordinates(self) -> npt.NDArray[np.float32]:
        return self._coordinates

    @coordinates.setter
    def coordinates(self, x: npt.NDArray[np.float32]) -> None:
        assert len(x) == len(self)
        self._coordinates = x

    @property
    def plddt(self) -> npt.NDArray[np.float32]:
        return self._plddt

    @plddt.setter
    def plddt(self, x: npt.NDArray[np.float32]) -> None:
        assert len(x) == len(self)
        self._plddt = x

    @property
    def chain_id(self) -> str | list[str] | None:
        return self._tags.get("chain_id")

    @chain_id.setter
    def chain_id(self, chain_id: str | list[str]) -> None:
        self._tags["chain_id"] = chain_id

    @property
    def cyclic(self) -> bool:
        return self._tags.get("cyclic") or False

    @cyclic.setter
    def cyclic(self, cyclic: bool) -> None:
        self._tags["cyclic"] = cyclic

    class NullMSA: ...

    single_sequence_mode = NullMSA

    @property
    def msa(self) -> "str | MSAFuture | None | NullMSA":
        return self._tags.get("msa")

    @msa.setter
    def msa(self, msa: "str | MSAFuture | None | NullMSA") -> None:
        self._tags["msa"] = msa

    def __len__(self):
        lengths = {
            len(x)
            for x in (self.sequence, self.coordinates, self.plddt)
            if x is not None
        }
        assert len(lengths) == 1
        return next(iter(lengths))

    def __getitem__(
        self, idx: int | list[int] | slice | npt.NDArray[np.integer]
    ) -> "Protein":
        """Return a new Protein object indexing into residues by `idx`."""
        if isinstance(idx, int):
            idx = np.array([idx], dtype=int)
        return Protein(
            sequence=np.frombuffer(self.sequence, dtype=np.uint8)[idx].tobytes(),
            coordinates=self.coordinates[idx].copy(),
            plddt=self.plddt[idx].copy(),
            name=self.name,
        )

    def __add__(self, tgt: "Protein") -> "Protein":
        """Return a new Protein object by concatenating with another Protein."""
        assert isinstance(tgt, Protein)
        return Protein(
            sequence=self.sequence + tgt.sequence,
            coordinates=np.concatenate((self.coordinates, tgt.coordinates)),
            plddt=np.concatenate((self.plddt, tgt.plddt)),
            name=self.name if self.name == tgt.name else None,
        )

    def at(self, positions: Sequence[int] | npt.NDArray[np.integer]) -> "Protein":
        """
        Return a new Protein object containing residues at given 1-indexed positions.
        """
        if not isinstance(positions, np.ndarray):
            positions = np.array(positions, dtype=int)
        return self[positions - 1]

    def mask_sequence_at(
        self, positions: Sequence[int] | npt.NDArray[np.integer]
    ) -> "Protein":
        """Mask sequence at given 1-indexed positions."""
        if not isinstance(positions, np.ndarray):
            positions = np.array(positions, dtype=int)
        idxs = positions - 1
        sequence = np.frombuffer(self.sequence, dtype=np.uint8).copy()
        sequence[idxs] = ord(b"X")
        return Protein(
            sequence=sequence.tobytes(),
            coordinates=self.coordinates.copy(),
            plddt=self.plddt.copy(),
            name=self.name,
        )

    def mask_sequence_except_at(
        self, positions: Sequence[int] | npt.NDArray[np.integer]
    ) -> "Protein":
        """Mask sequence at all positions except the given 1-indexed positions."""
        if not isinstance(positions, np.ndarray):
            positions = np.array(positions, dtype=int)
        idxs = positions - 1
        sequence = np.frombuffer(self.sequence, dtype=np.uint8).copy()
        mask = np.ones_like(sequence, dtype=bool)
        mask[idxs] = False
        sequence[mask] = ord(b"X")
        return Protein(
            sequence=sequence.tobytes(),
            coordinates=self.coordinates.copy(),
            plddt=self.plddt.copy(),
            name=self.name,
        )

    def mask_structure_at(
        self, positions: Sequence[int] | npt.NDArray[np.integer]
    ) -> "Protein":
        """Mask structure at given 1-indexed positions."""
        if not isinstance(positions, np.ndarray):
            positions = np.array(positions, dtype=int)
        idxs = positions - 1
        coordinates, plddt = self.coordinates.copy(), self.plddt.copy()
        coordinates[idxs], plddt[idxs] = np.nan, np.nan
        return Protein(
            sequence=self.sequence, coordinates=coordinates, plddt=plddt, name=self.name
        )

    def mask_structure_except_at(
        self, positions: Sequence[int] | npt.NDArray[np.integer]
    ) -> "Protein":
        """Mask structure at all positions except the given 1-indexed positions."""
        if not isinstance(positions, np.ndarray):
            positions = np.array(positions, dtype=int)
        idxs = positions - 1
        mask = np.ones(len(self), dtype=bool)
        mask[idxs] = False
        coordinates, plddt = self.coordinates.copy(), self.plddt.copy()
        coordinates[mask], plddt[mask] = np.nan, np.nan
        return Protein(
            sequence=self.sequence, coordinates=coordinates, plddt=plddt, name=self.name
        )

    @property
    def has_structure(self) -> bool:
        """Whether or not the structure is known at any position in the protein."""
        return (not np.isnan(self.coordinates).all()) or (
            not np.isnan(self.plddt).all()
        )

    def rmsd(
        self, tgt: "Protein", backbone_only: bool | str | Sequence[str] = False
    ) -> float:
        """
        Compute the root-mean-square deviation (RMSD) between this Protein and a target
        Protein.

        Only atoms that are present (i.e., not NaN) in both structures are included in
        the calculation.

        Args:
            tgt: The target Protein to compare against.
            backbone_only: Specifies which atoms to include in the RMSD calculation.
                - If False (default), all atom types are included.
                - If True, only backbone atoms ("N", "CA", "C") are included.
                - If a string, it must be a single atom type (e.g., "CA").
                - If a sequence of strings, it must be a non-empty list of atom types
                  (e.g., ["CA", "CB", "O"]). All specified atom types must be valid.

        Returns:
            The RMSD value between the aligned structures.

        Notes:
            This method assumes that residues in `self` and `tgt` are already aligned.
        """
        if backbone_only is False:
            atom_idxs = np.arange(len(_ATOM_TYPES))
        elif backbone_only is True:
            atom_idxs = np.arange(3)
        elif isinstance(backbone_only, str):
            atom_idxs = [_ATOM_TYPE_TO_IDX[backbone_only]]
        elif isinstance(backbone_only, Sequence):
            assert len(backbone_only) > 0 and isinstance(next(iter(backbone_only)), str)
            atom_idxs = [_ATOM_TYPE_TO_IDX[x] for x in backbone_only]
        else:
            raise ValueError(backbone_only)
        src_coords = self.coordinates[:, atom_idxs]
        tgt_coords = tgt.coordinates[:, atom_idxs]
        src_known_atoms = ~np.isnan(src_coords).any(axis=2)
        tgt_known_atoms = ~np.isnan(tgt_coords).any(axis=2)
        overlapping_known_atoms = src_known_atoms & tgt_known_atoms
        src_coords = src_coords[overlapping_known_atoms]
        tgt_coords = tgt_coords[overlapping_known_atoms]
        rmsd, _ = calc_rmsd(src_coords, tgt_coords)
        return rmsd

    def __make_gemmi(self) -> gemmi.Structure:
        # TODO: add note about _NAN_BFACTOR_VALUE
        assert (
            self.has_structure
        ), "cannot make cif string for protein with no structure data"
        # Create an empty structure and add a model with a default chain.
        structure = gemmi.Structure()
        if self.name is not None:
            structure.name = self.name
        model = structure.add_model(gemmi.Model(1))

        # Process the sequence.
        resnames = gemmi.expand_one_letter_sequence(
            self.sequence.decode(), gemmi.ResidueKind.AA
        )
        entity = gemmi.Entity("1")
        entity.full_sequence = resnames
        entity.entity_type = gemmi.EntityType.Polymer
        entity.polymer_type = gemmi.PolymerType.PeptideL
        entity.subchains = ["A"]
        structure.entities.append(entity)

        # Process the coordinates.
        n_nan_coords = np.isnan(self.coordinates).sum(axis=2)
        assert (
            (n_nan_coords == 0) | (n_nan_coords == 3)
        ).all(), "either all coords of an atom must be nan, or none are"
        # Process the plddt.
        assert (
            np.isnan(self.plddt) | (~np.isnan(self.plddt) & (n_nan_coords[:, 1] == 0))
        ).all(), "if plddt is known, coord of CA must be known"

        # Write the chain
        chain = model.add_chain(gemmi.Chain("A"))
        for i in range(len(self)):
            # Add a residue to the chain; note that residue numbering starts at 1.
            residue = gemmi.Residue()
            residue.entity_id = "1"
            residue.entity_type = gemmi.EntityType.Polymer
            residue.subchain = "A"
            residue.name = resnames[i]
            residue.label_seq = i + 1
            residue.seqid = gemmi.SeqId(str(i + 1))
            residue = chain.add_residue(residue, i + 1)
            # For each residue, add the atoms.
            for j, atom_name in enumerate(_ATOM_TYPES):
                if np.isnan(self.coordinates[i, j]).any():
                    continue
                atom = gemmi.Atom()
                atom.name = atom_name
                atom.element = gemmi.Element(atom_name[0])
                atom.pos = gemmi.Position(*self.coordinates[i, j])
                if not np.isnan(self.plddt[i]):
                    atom.b_iso = self.plddt[i]
                else:
                    atom.b_iso = _NAN_BFACTOR_VALUE
                atom = residue.add_atom(atom)
        return structure

    def make_cif_string(self) -> str:
        structure = self.__make_gemmi()
        # NB: gemmi doesn't seem to write the _chem_comp category properly... it says
        #     the type is `.`, but is should be something like `L-PEPTIDE LINKING`...
        #     see also: https://github.com/project-gemmi/gemmi/discussions/362
        block = structure.make_mmcif_block(
            groups=gemmi.MmcifOutputGroups(True, chem_comp=False)
        )
        return block.as_string()

    def make_pdb_string(self) -> str:
        structure = self.__make_gemmi()
        return structure.make_pdb_string()

    def make_fasta_bytes(self) -> bytes:
        assert self.name is not None
        data = io.BytesIO()
        data.write(b">")
        data.write(self.name.encode())
        data.write(b"\n")
        data.write(
            self.sequence.encode()
            if not isinstance(self.sequence, bytes)
            else self.sequence
        )
        data.write(b"\n")
        return data.getvalue()

    @staticmethod
    def from_filepath(
        path: str | Path,
        chain_id: str,
        use_bfactor_as_plddt: bool | None = None,
        model_idx: int = 0,
        verbose: bool = True,
    ) -> "Protein":
        """
        Create a Protein from a structure file.

        If the structure file has multiple conformers, the first conformer is always
        used.

        Args:
            path: path to structure file (e.g. pdb or cif file)
            chain_id: id of the chain in the structure file to use
            use_bfactor_as_plddt: whether or not to use the bfactor of the CA atom as
                the plddt of structure of its residue. If None, this will be set to
                true only if the resolution of the structure is unspecified or zero.
            model_idx: index of the model in the structure file to use
            verbose: whether or not to print debugging information such as oddities in
                the structure e.g. missing atoms
        """
        structure = gemmi.read_structure(str(path))
        structure.name = Path(path).stem
        return Protein.from_structure(
            structure=structure,
            chain_id=chain_id,
            use_bfactor_as_plddt=use_bfactor_as_plddt,
            model_idx=model_idx,
            verbose=verbose,
        )

    @staticmethod
    def from_string(
        filestring: bytes | str,
        format: Literal["pdb", "cif"],
        chain_id: str,
        use_bfactor_as_plddt: bool | None = None,
        model_idx: int = 0,
        verbose: bool = True,
    ) -> "Protein":
        if format == "pdb":
            structure = gemmi.read_pdb_string(filestring)
        elif format == "cif":
            structure = gemmi.make_structure_from_block(
                gemmi.cif.read_string(filestring).sole_block()
            )
        else:
            raise ValueError(f"Unknown {format=}")
        return Protein.from_structure(
            structure=structure,
            chain_id=chain_id,
            use_bfactor_as_plddt=use_bfactor_as_plddt,
            model_idx=model_idx,
            verbose=verbose,
        )

    @staticmethod
    def from_structure(
        structure: gemmi.Structure,
        chain_id: str,
        use_bfactor_as_plddt: bool | None = None,
        model_idx: int = 0,
        verbose: bool = True,
    ) -> "Protein":
        structure.setup_entities()
        structure.assign_label_seq_id()
        if use_bfactor_as_plddt is None:
            use_bfactor_as_plddt = _use_bfactor_as_plddt(structure=structure)
        model = structure[model_idx]
        chain = model.find_chain(chain_id)
        assert chain is not None
        polymer = chain.get_polymer()

        # extract sequence
        entity = structure.get_entity_of(polymer)
        if len(entity.full_sequence) > 0:
            chain_seq = entity.full_sequence
        else:
            chain_seq = [residue.name for residue in polymer]
        chain_seq = [
            gemmi.find_tabulated_residue(
                # gemmi.Entity.first_mon extracts the first conformer
                gemmi.Entity.first_mon(residue_name)
            ).one_letter_code
            for residue_name in chain_seq
        ]
        # for find_tabulated_residue: lowercase means nonstandard, " " means unknown
        chain_seq = [c.upper() if c != " " else "X" for c in chain_seq]
        # extract coordinates and plddt
        coordinates = np.full((len(chain_seq), _N_ATOM, 3), np.nan, dtype=np.float32)
        plddt = np.full(len(chain_seq), np.nan, dtype=np.float32)
        for residue_idx, residue in enumerate(polymer):
            i = residue.label_seq - 1 if residue.label_seq is not None else residue_idx
            code = gemmi.find_tabulated_residue(residue.name).one_letter_code
            code = code.upper() if code != " " else "X"
            if code != chain_seq[i]:
                if verbose:
                    # TODO: can this ever happen...? probably want to have this regardless i guess
                    # TODO: improve this message?
                    print(
                        f"Amino acid mismatch at position {i + 1}: SEQRES {chain_seq[i]} Structure {code}"
                    )
                chain_seq[i] = code
            if verbose and code == "X" and residue.name != "UNK":
                print(f"Unknown amino acid at position {i + 1}: {residue.name}")
            if verbose:
                for j, atom_name in enumerate(_BACKBONE_ATOM_TYPES):
                    if atom_name not in residue:
                        print(
                            f"Residue at position {i + 1} missing backbone atom={atom_name}"
                        )
            for atom in residue.first_conformer():
                atom_name = atom.name
                if residue.name == "MSE" and atom_name == "SE":
                    atom_name = "SD"
                if (j := _ATOM_TYPE_TO_IDX.get(atom.name)) is None:
                    continue
                coordinates[i, j] = atom.pos.tolist()
                if use_bfactor_as_plddt and atom_name == "CA":
                    plddt[i] = (
                        atom.b_iso if atom.b_iso != _NAN_BFACTOR_VALUE else np.nan
                    )
            # TODO: we should experiment and see if this is the behavior we want
            if (
                not use_bfactor_as_plddt
                and np.isfinite(coordinates[i, _ATOM_TYPE_TO_IDX["CA"]]).all()
            ):
                plddt[i] = 100.0
        assert np.isnan(plddt).all() or (
            (np.nanmin(plddt) >= 0) and (np.nanmax(plddt) <= 100)
        )
        return Protein(
            sequence="".join(chain_seq),
            coordinates=coordinates,
            plddt=plddt,
            name=structure.name if structure.name != "" else None,
        )



def parse_fasta_as_proteins(path: str | Path) -> list[Protein]:
    proteins = []
    with open(path, "rb") as fp:
        for name, sequence in fasta.parse_stream(fp):
            proteins.append(Protein(name=name, sequence=sequence))
    return proteins


def _is_experimental_structure(structure: gemmi.Structure) -> bool:
    """
    This heuristic decides whether the structure is an experimental structure.
    This heuristic may be changed in the future.
    """
    if structure.resolution > 0:
        return True
    else:
        return ("_exptl.method" in structure.info) and (
            structure.info["_exptl.method"] in _EXPERIMENTAL_METHODS
        )


def _use_bfactor_as_plddt(structure: gemmi.Structure) -> bool:
    """
    This heuristic decides whether to use B-factor as pLDDT.
    This heuristic may be changed in the future.
    """
    return not _is_experimental_structure(structure=structure)


def calc_rmsd(
    xyz1: npt.NDArray[np.floating], xyz2: npt.NDArray[np.floating], eps: float = 1e-6
) -> tuple[float, npt.NDArray[np.floating]]:
    """
    Calculates RMSD between two sets of atoms (L, 3)
    Adapted from https://github.com/RosettaCommons/RFdiffusion/blob/b44206a2a79f219bb1a649ea50603a284c225050/rfdiffusion/util.py#L719
    """
    # center to CA centroid
    xyz1 = xyz1 - xyz1.mean(0)
    xyz2 = xyz2 - xyz2.mean(0)

    # Computation of the covariance matrix
    C = xyz2.T @ xyz1

    # Compute otimal rotation matrix using SVD
    V, S, W = np.linalg.svd(C)

    # get sign to ensure right-handedness
    d = np.ones([3, 3])
    d[:, -1] = np.sign(np.linalg.det(V) * np.linalg.det(W))

    # Rotation matrix U
    U = (d * V) @ W

    # Rotate xyz2
    xyz2_ = xyz2 @ U
    L = xyz2_.shape[0]
    rmsd = np.sqrt(np.sum((xyz2_ - xyz1) * (xyz2_ - xyz1), axis=(0, 1)) / L + eps)

    return rmsd, U