from typing import Iterator
from openprotein import config
from openprotein.api import align
from openprotein.base import APISession
from openprotein.schemas import JobType, MSAJob, MSASamplingMethod, MafftJob, ClustalOJob, AbNumberJob
from ..futures import Future, InvalidFutureError
from .base import AlignFuture
from .prompt import PromptFuture
# TODO: AbNumber should probably be a separate subclass, because it supports an additional `get` API for the antibody numbering.
class MSAFuture(AlignFuture, Future):
    """
    Represents a result of a MSA job.

    Attributes
    ----------
    session : APISession
        An instance of APISession for API interactions.
    job : MSAJob | MafftJob | ClustalOJob | AbNumberJob
        The MSA job.
    page_size : int
        The number of results to fetch in a single page.
    msa_id
        Identifier of the MSA, taken from the job's ``job_id``.

    Methods
    -------
    get(verbose=False)
        Get the MSA. Returns an ``Iterator[list[str]]``: a CSV reader for
        the MSA data.
    sample_prompt(...)
        Create a prompt from this MSA and return it as a ``PromptFuture``.
    """

    job: MSAJob | MafftJob | ClustalOJob | AbNumberJob

    def __init__(
        self,
        session: APISession,
        job: MSAJob | MafftJob | ClustalOJob | AbNumberJob,
        page_size: int = config.POET_PAGE_SIZE,
    ):
        """
        Init a MSAFuture instance.

        Parameters
        ----------
        session : APISession
            An instance of APISession for API interactions.
        job : MSAJob | MafftJob | ClustalOJob | AbNumberJob
            The MSA job. Any of the job types declared on the class-level
            ``job`` attribute is accepted.
        page_size : int
            The number of results to fetch in a single page. Defaults to
            ``config.POET_PAGE_SIZE``.
        """
        super().__init__(session, job)
        self.page_size = page_size
        # The MSA is addressed by the id of the job that produced it.
        self.msa_id = self.job.job_id

    def get(self, verbose: bool = False) -> Iterator[list[str]]:
        """
        Get the MSA.

        Parameters
        ----------
        verbose : bool, optional
            Accepted as part of the method signature but not used by this
            implementation. Defaults to False.

        Returns
        -------
        Iterator[list[str]]
            The result of ``align.get_msa`` for this future's session and
            job; a CSV reader for the MSA data.
        """
        return align.get_msa(self.session, self.job)

    def sample_prompt(
        self,
        num_sequences: int | None = None,
        num_residues: int | None = None,
        method: MSASamplingMethod = MSASamplingMethod.NEIGHBORS_NONGAP_NORM_NO_LIMIT,
        homology_level: float = 0.8,
        max_similarity: float = 1.0,
        min_similarity: float = 0.0,
        always_include_seed_sequence: bool = False,
        num_ensemble_prompts: int = 1,
        random_seed: int | None = None,
    ) -> PromptFuture:
        """
        Create a protein sequence prompt from a linked MSA (Multiple Sequence Alignment) for PoET Jobs.

        Parameters
        ----------
        num_sequences : int, optional
            Maximum number of sequences in the prompt. Must be <100.
        num_residues : int, optional
            Maximum number of residues (tokens) in the prompt. Must be less than 24577.
        method : MSASamplingMethod, optional
            Method to use for MSA sampling. Defaults to NEIGHBORS_NONGAP_NORM_NO_LIMIT.
        homology_level : float, optional
            Level of homology for sequences in the MSA (neighbors methods only). Must be between 0 and 1. Defaults to 0.8.
        max_similarity : float, optional
            Maximum similarity between sequences in the MSA and the seed. Must be between 0 and 1. Defaults to 1.0.
        min_similarity : float, optional
            Minimum similarity between sequences in the MSA and the seed. Must be between 0 and 1. Defaults to 0.0.
        always_include_seed_sequence : bool, optional
            Whether to always include the seed sequence in the MSA. Defaults to False.
        num_ensemble_prompts : int, optional
            Number of ensemble jobs to run. Defaults to 1.
        random_seed : int, optional
            Seed for random number generation. Defaults to a random number between 0 and 2**32-1.

        Raises
        ------
        InvalidParameterError
            If provided parameter values are not in the allowed range.
        MissingParameterError
            If both or none of 'num_sequences', 'num_residues' is specified.

        Returns
        -------
        PromptFuture
            A future created from the prompt job returned by
            ``align.prompt_post``.
        """
        # All sampling options are forwarded unchanged; this method performs
        # no validation of its own before the request is made.
        prompt_job = align.prompt_post(
            self.session,
            msa_id=self.msa_id,
            num_sequences=num_sequences,
            num_residues=num_residues,
            method=method,
            homology_level=homology_level,
            max_similarity=max_similarity,
            min_similarity=min_similarity,
            always_include_seed_sequence=always_include_seed_sequence,
            num_ensemble_prompts=num_ensemble_prompts,
            random_seed=random_seed,
        )
        return PromptFuture.create(session=self.session, job=prompt_job)