# Source code for openprotein.app.services.align

from collections.abc import Sequence
from typing import BinaryIO, Iterator

from openprotein.api import align
from openprotein.app.models import MSAFuture, PromptFuture
from openprotein.base import APISession
from openprotein.schemas import AlignType, Job

from io import BytesIO


class AlignAPI:
    """API interface for calling Poet and Align endpoints."""

    def __init__(self, session: APISession):
        """
        Create the interface around an existing API session.

        Parameters
        ----------
        session : APISession
            Session passed to every `openprotein.api.align` call made by
            this object.
        """
        self.session = session

    @staticmethod
    def _encode_sequences(
        sequences: Sequence[bytes | str],
        names: Sequence[bytes | str] | None = None,
    ) -> bytes:
        """
        Serialize sequences (and optional names) into an upload payload.

        Without names the payload is one sequence per line (headerless,
        single-column CSV). With names it is fasta: a ``>name`` line followed
        by the sequence line for each entry. ``str`` values are encoded with
        the default ``str.encode()`` codec; ``bytes`` values are used as is.

        Parameters
        ----------
        sequences : Sequence[bytes | str]
            Sequences to serialize.
        names : Sequence[bytes | str], optional
            Sequence names, must be same length as sequences if provided.

        Raises
        ------
        ValueError
            If names is provided with a different length than sequences.

        Returns
        -------
        bytes
            Newline-joined payload (no trailing newline).
        """
        if names is not None and len(names) != len(sequences):
            raise ValueError(
                "Names and sequences must be same length, "
                f"but were {len(names)} and {len(sequences)}"
            )

        def as_bytes(value: bytes | str) -> bytes:
            return value.encode() if isinstance(value, str) else value

        if names is None:
            # as CSV
            lines = [as_bytes(sequence) for sequence in sequences]
        else:
            # as fasta
            lines = []
            for name, sequence in zip(names, sequences):
                lines.append(b">" + as_bytes(name))
                lines.append(as_bytes(sequence))
        return b"\n".join(lines)

    # TODO - confirm the documented meaning of the `ep` and `op` parameters
    def mafft(
        self,
        sequences: Sequence[bytes | str],
        names: Sequence[str] | None = None,
        auto: bool = True,
        ep: float | None = None,
        op: float | None = None,
    ) -> MSAFuture:
        """
        Align sequences using the `mafft` algorithm.

        Set auto to True to automatically attempt the best params. Leave a
        parameter as None to use system defaults.

        Parameters
        ----------
        sequences : Sequence[bytes | str]
            Sequences to align
        names : Sequence[string], optional
            Optional list of sequence names, must be same length as sequences
            if provided.
        auto : bool = True, optional
            Set to true to automatically set algorithm parameters.
        ep : float, optional
            mafft parameter (presumably the gap-extension offset, mafft's
            ``--ep`` - TODO confirm against the backend)
        op : float, optional
            mafft parameter (presumably the gap-opening penalty, mafft's
            ``--op`` - TODO confirm against the backend)

        Raises
        ------
        ValueError
            If names is provided with a different length than sequences.

        Returns
        -------
        MSAFuture
            Future object awaiting the contents of the MSA upload.
        """
        content = self._encode_sequences(sequences, names)
        return self.mafft_file(BytesIO(content), auto=auto, ep=ep, op=op)

    # TODO - confirm the documented meaning of the `ep` and `op` parameters
    def mafft_file(
        self,
        file,
        auto: bool = True,
        ep: float | None = None,
        op: float | None = None,
    ) -> MSAFuture:
        """
        Align sequences using the `mafft` algorithm.

        Sequences can be provided as `fasta` or `csv` formats. If `csv`, the
        file must be headerless with either a single sequence column or name,
        sequence columns. Set auto to True to automatically attempt the best
        params. Leave a parameter as None to use system defaults.

        Parameters
        ----------
        file : File
            Sequences to align in fasta or csv format.
        auto : bool = True, optional
            Set to true to automatically set algorithm parameters.
        ep : float, optional
            mafft parameter (presumably the gap-extension offset, mafft's
            ``--ep`` - TODO confirm against the backend)
        op : float, optional
            mafft parameter (presumably the gap-opening penalty, mafft's
            ``--op`` - TODO confirm against the backend)

        Returns
        -------
        MSAFuture
            Future object awaiting the contents of the MSA upload.
        """
        job = align.mafft_post(self.session, file, auto=auto, ep=ep, op=op)
        return MSAFuture.create(session=self.session, job=job)

    # TODO - confirm the documented meaning of the parameters
    def clustalo(
        self,
        sequences: Sequence[bytes | str],
        names: Sequence[str] | None = None,
        clustersize: int | None = None,
        iterations: int | None = None,
    ) -> MSAFuture:
        """
        Align sequences using the `clustal omega` algorithm.

        Sequences are uploaded as fasta when names are given, otherwise as a
        headerless single-column csv. Leave a parameter as None to use system
        defaults.

        Parameters
        ----------
        sequences : Sequence[bytes | str]
            Sequences to align
        names : Sequence[string], optional
            Optional list of sequence names, must be same length as sequences
            if provided.
        clustersize : int, optional
            clustal omega parameter (presumably ``--cluster-size`` - TODO
            confirm against the backend)
        iterations : int, optional
            clustal omega parameter (presumably ``--iterations`` - TODO
            confirm against the backend)

        Raises
        ------
        ValueError
            If names is provided with a different length than sequences.

        Returns
        -------
        MSAFuture
            Future object awaiting the contents of the MSA upload.
        """
        # Shared encoder so `str` sequences work here exactly as in `mafft`.
        content = self._encode_sequences(sequences, names)
        return self.clustalo_file(
            BytesIO(content), clustersize=clustersize, iterations=iterations
        )

    # TODO - confirm the documented meaning of the parameters
    def clustalo_file(
        self,
        file,
        clustersize: int | None = None,
        iterations: int | None = None,
    ) -> MSAFuture:
        """
        Align sequences using the `clustal omega` algorithm.

        Sequences can be provided as `fasta` or `csv` formats. If `csv`, the
        file must be headerless with either a single sequence column or name,
        sequence columns. Leave a parameter as None to use system defaults.

        Parameters
        ----------
        file : File
            Sequences to align in fasta or csv format.
        clustersize : int, optional
            clustal omega parameter (presumably ``--cluster-size`` - TODO
            confirm against the backend)
        iterations : int, optional
            clustal omega parameter (presumably ``--iterations`` - TODO
            confirm against the backend)

        Returns
        -------
        MSAFuture
            Future object awaiting the contents of the MSA upload.
        """
        job = align.clustalo_post(
            self.session, file, clustersize=clustersize, iterations=iterations
        )
        return MSAFuture.create(session=self.session, job=job)

    def abnumber(
        self,
        sequences: Sequence[bytes | str],
        names: Sequence[str] | None = None,
        scheme: str = "imgt",
    ) -> MSAFuture:
        """
        Align antibody using `AbNumber`.

        Sequences are uploaded as fasta when names are given, otherwise as a
        headerless single-column csv. The antibody numbering scheme can be
        specified from `imgt` (default), `chothia`, `kabat`, or `aho`.

        Parameters
        ----------
        sequences : Sequence[bytes | str]
            Sequences to align
        names : Sequence[string], optional
            Optional list of sequence names, must be same length as sequences
            if provided.
        scheme : str = 'imgt'
            Antibody numbering scheme. Can be one of 'imgt', 'chothia',
            'kabat', or 'aho'

        Raises
        ------
        ValueError
            If names is provided with a different length than sequences.

        Returns
        -------
        MSAFuture
            Future object awaiting the contents of the MSA upload.
        """
        # Shared encoder so `str` sequences work here exactly as in `mafft`.
        content = self._encode_sequences(sequences, names)
        return self.abnumber_file(BytesIO(content), scheme=scheme)

    # TODO - properly test me and add new AbNumberFuture to support additional GET endpoint
    def abnumber_file(self, file, scheme: str = "imgt") -> MSAFuture:
        """
        Align antibody using `AbNumber`.

        Sequences can be provided as `fasta` or `csv` formats. If `csv`, the
        file must be headerless with either a single sequence column or name,
        sequence columns. The antibody numbering scheme can be specified from
        `imgt` (default), `chothia`, `kabat`, or `aho`.

        Parameters
        ----------
        file : File
            Sequences to align in fasta or csv format.
        scheme : str = 'imgt'
            Antibody numbering scheme. Can be one of 'imgt', 'chothia',
            'kabat', or 'aho'

        Returns
        -------
        MSAFuture
            Future object awaiting the contents of the MSA upload.
        """
        job = align.abnumber_post(self.session, file, scheme=scheme)
        return MSAFuture.create(session=self.session, job=job)

    def upload_msa(self, msa_file) -> MSAFuture:
        """
        Upload an MSA from file.

        Parameters
        ----------
        msa_file : File
            Ready-made MSA. Required; it is forwarded unchanged to
            `align.msa_post` (accepted types are defined there - TODO
            confirm).

        Raises
        ------
        APIError
            If there is an issue with the API request.

        Returns
        -------
        MSAFuture
            Future object awaiting the contents of the MSA upload.
        """
        job = align.msa_post(self.session, msa_file=msa_file)
        return MSAFuture.create(session=self.session, job=job)

    def create_msa(self, seed: bytes) -> MSAFuture:
        """
        Construct an MSA via homology search with the seed sequence.

        Parameters
        ----------
        seed : bytes
            Seed sequence for the MSA construction.

        Raises
        ------
        APIError
            If there is an issue with the API request.

        Returns
        -------
        MSAFuture
            Future object awaiting the result of the MSA construction job.
        """
        job = align.msa_post(self.session, seed=seed)
        return MSAFuture.create(session=self.session, job=job)

    def upload_prompt(self, prompt_file: BinaryIO) -> PromptFuture:
        """
        Directly upload a prompt.

        Bypass post_msa and prompt_post steps entirely. In this case PoET
        will use the prompt as is. You can specify multiple prompts (one per
        replicate) with an <END_PROMPT> and newline between CSVs.

        Parameters
        ----------
        prompt_file : BinaryIO
            Binary I/O object representing the prompt file.

        Raises
        ------
        APIError
            If there is an issue with the API request.

        Returns
        -------
        PromptFuture
            Future object representing the status and results of the prompt
            job.
        """
        job = align.upload_prompt_post(
            session=self.session, prompt_file=prompt_file
        )
        return PromptFuture.create(session=self.session, job=job)

    def get_prompt(
        self, job: Job, prompt_index: int | None = None
    ) -> Iterator[list[str]]:
        """
        Get prompts for a given job.

        Parameters
        ----------
        job : Job
            The job for which to retrieve data.
        prompt_index : Optional[int]
            The replicate number for the prompt (only meaningful for prompt
            inputs). Forwarded as is, including None.

        Returns
        -------
        csv.reader
            A CSV reader for the response data.
        """
        return align.get_input(
            session=self.session,
            job=job,
            input_type=AlignType.PROMPT,
            prompt_index=prompt_index,
        )

    def get_seed(self, job: Job) -> Iterator[list[str]]:
        """
        Get input data for a given msa job.

        Parameters
        ----------
        job : Job
            The job for which to retrieve data.

        Returns
        -------
        csv.reader
            A CSV reader for the response data.
        """
        return align.get_input(
            session=self.session, job=job, input_type=AlignType.INPUT
        )

    def get_msa(self, job: Job) -> Iterator[list[str]]:
        """
        Get generated MSA for a given job.

        Parameters
        ----------
        job : Job
            The job for which to retrieve data.

        Returns
        -------
        csv.reader
            A CSV reader for the response data.
        """
        return align.get_input(
            session=self.session, job=job, input_type=AlignType.MSA
        )