from collections.abc import Sequence
from typing import BinaryIO, Iterator
from openprotein.api import align
from openprotein.app.models import MSAFuture, PromptFuture
from openprotein.base import APISession
from openprotein.schemas import AlignType, Job
from io import BytesIO
[docs]
class AlignAPI:
    """API interface for calling PoET and Align endpoints.

    Every method delegates to a function of the ``openprotein.api.align``
    module using the session supplied at construction time. Methods that
    submit work wrap the returned job in an ``MSAFuture`` or
    ``PromptFuture``.
    """

    def __init__(self, session: APISession):
        """
        Bind this interface to an API session.

        Parameters
        ----------
        session : APISession
            Session used for every request issued by this object.
        """
        self.session = session

    @staticmethod
    def _build_upload_stream(
        sequences: Sequence[bytes | str],
        names: Sequence[bytes | str] | None = None,
    ) -> BytesIO:
        """
        Serialize sequences into an in-memory upload payload.

        Without ``names`` the payload has one sequence per line (headerless
        single-column CSV). With ``names`` it is FASTA: a ``>name`` line
        followed by the sequence line for each pair. ``str`` values are
        encoded with ``str.encode()``; ``bytes`` values are used unchanged.

        Parameters
        ----------
        sequences : Sequence[bytes | str]
            Sequences to serialize.
        names : Sequence[bytes | str], optional
            Sequence names, must be same length as sequences if provided.

        Returns
        -------
        BytesIO
            Stream positioned at the start of the serialized payload.

        Raises
        ------
        ValueError
            If ``names`` is provided and its length differs from
            ``sequences``.
        """
        if names is not None and len(names) != len(sequences):
            raise ValueError(
                f"Names and sequences must be same length, but were {len(names)} and {len(sequences)}"
            )

        def as_bytes(value: bytes | str) -> bytes:
            # b"\n".join() rejects str items, so normalise everything first.
            return value.encode() if isinstance(value, str) else value

        if names is None:
            # as CSV
            lines = [as_bytes(sequence) for sequence in sequences]
        else:
            # as fasta
            lines = []
            for name, sequence in zip(names, sequences):
                lines.append(b">" + as_bytes(name))
                lines.append(as_bytes(sequence))
        return BytesIO(b"\n".join(lines))

    # TODO - document the `ep` and `op` parameters
    def mafft(
        self,
        sequences: Sequence[bytes | str],
        names: Sequence[str] | None = None,
        auto: bool = True,
        ep: float | None = None,
        op: float | None = None,
    ) -> MSAFuture:
        """
        Align sequences using the `mafft` algorithm.

        The sequences are serialized in memory (one per line when no names
        are given, FASTA otherwise) and submitted through `mafft_file`.

        Set auto to True to automatically attempt the best params. Leave a
        parameter as None to use system defaults.

        Parameters
        ----------
        sequences : Sequence[bytes | str]
            Sequences to align
        names : Sequence[str], optional
            Optional list of sequence names, must be same length as
            sequences if provided.
        auto : bool = True, optional
            Set to true to automatically set algorithm parameters.
        ep : float, optional
            mafft parameter, forwarded unchanged (presumably MAFFT's
            ``--ep`` value - TODO confirm).
        op : float, optional
            mafft parameter, forwarded unchanged (presumably MAFFT's
            ``--op`` value - TODO confirm).

        Returns
        -------
        MSAFuture
            Future object awaiting the contents of the MSA upload.

        Raises
        ------
        ValueError
            If names is provided with a different length than sequences.
        """
        stream = self._build_upload_stream(sequences, names)
        return self.mafft_file(stream, auto=auto, ep=ep, op=op)

    # TODO - document the `ep` and `op` parameters
    def mafft_file(
        self,
        file: BinaryIO,
        auto: bool = True,
        ep: float | None = None,
        op: float | None = None,
    ) -> MSAFuture:
        """
        Align sequences using the `mafft` algorithm.

        Sequences can be provided as `fasta` or `csv` formats. If `csv`, the
        file must be headerless with either a single sequence column or
        name, sequence columns.

        Set auto to True to automatically attempt the best params. Leave a
        parameter as None to use system defaults.

        Parameters
        ----------
        file : BinaryIO
            Sequences to align in fasta or csv format.
        auto : bool = True, optional
            Set to true to automatically set algorithm parameters.
        ep : float, optional
            mafft parameter, forwarded unchanged (presumably MAFFT's
            ``--ep`` value - TODO confirm).
        op : float, optional
            mafft parameter, forwarded unchanged (presumably MAFFT's
            ``--op`` value - TODO confirm).

        Returns
        -------
        MSAFuture
            Future object awaiting the contents of the MSA upload.
        """
        job = align.mafft_post(self.session, file, auto=auto, ep=ep, op=op)
        return MSAFuture.create(session=self.session, job=job)

    # TODO - document the parameters
    def clustalo(
        self,
        sequences: Sequence[bytes | str],
        names: Sequence[str] | None = None,
        clustersize: int | None = None,
        iterations: int | None = None,
    ) -> MSAFuture:
        """
        Align sequences using the `clustal omega` algorithm.

        The sequences are serialized in memory (one per line when no names
        are given, FASTA otherwise) and submitted through `clustalo_file`.
        Both `str` and `bytes` sequences are accepted.

        Leave a parameter as None to use system defaults.

        Parameters
        ----------
        sequences : Sequence[bytes | str]
            Sequences to align
        names : Sequence[str], optional
            Optional list of sequence names, must be same length as
            sequences if provided.
        clustersize : int, optional
            clustal omega parameter, forwarded unchanged.
        iterations : int, optional
            clustal omega parameter, forwarded unchanged.

        Returns
        -------
        MSAFuture
            Future object awaiting the contents of the MSA upload.

        Raises
        ------
        ValueError
            If names is provided with a different length than sequences.
        """
        stream = self._build_upload_stream(sequences, names)
        return self.clustalo_file(
            stream, clustersize=clustersize, iterations=iterations
        )

    # TODO - document the parameters
    def clustalo_file(
        self,
        file: BinaryIO,
        clustersize: int | None = None,
        iterations: int | None = None,
    ) -> MSAFuture:
        """
        Align sequences using the `clustal omega` algorithm.

        Sequences can be provided as `fasta` or `csv` formats. If `csv`, the
        file must be headerless with either a single sequence column or
        name, sequence columns.

        Leave a parameter as None to use system defaults.

        Parameters
        ----------
        file : BinaryIO
            Sequences to align in fasta or csv format.
        clustersize : int, optional
            clustal omega parameter, forwarded unchanged.
        iterations : int, optional
            clustal omega parameter, forwarded unchanged.

        Returns
        -------
        MSAFuture
            Future object awaiting the contents of the MSA upload.
        """
        job = align.clustalo_post(
            self.session, file, clustersize=clustersize, iterations=iterations
        )
        return MSAFuture.create(session=self.session, job=job)

    def abnumber(
        self,
        sequences: Sequence[bytes | str],
        names: Sequence[str] | None = None,
        scheme: str = "imgt",
    ) -> MSAFuture:
        """
        Align antibody using `AbNumber`.

        The sequences are serialized in memory (one per line when no names
        are given, FASTA otherwise) and submitted through `abnumber_file`.
        Both `str` and `bytes` sequences are accepted.

        The antibody numbering scheme can be specified from `imgt`
        (default), `chothia`, `kabat`, or `aho`.

        Parameters
        ----------
        sequences : Sequence[bytes | str]
            Sequences to align
        names : Sequence[str], optional
            Optional list of sequence names, must be same length as
            sequences if provided.
        scheme : str = 'imgt'
            Antibody numbering scheme. Can be one of 'imgt', 'chothia',
            'kabat', or 'aho'

        Returns
        -------
        MSAFuture
            Future object awaiting the contents of the MSA upload.

        Raises
        ------
        ValueError
            If names is provided with a different length than sequences.
        """
        stream = self._build_upload_stream(sequences, names)
        return self.abnumber_file(stream, scheme=scheme)

    # TODO - properly test me and add new AbNumberFuture to support additional GET endpoint
    def abnumber_file(self, file: BinaryIO, scheme: str = "imgt") -> MSAFuture:
        """
        Align antibody using `AbNumber`.

        Sequences can be provided as `fasta` or `csv` formats. If `csv`, the
        file must be headerless with either a single sequence column or
        name, sequence columns.

        The antibody numbering scheme can be specified from `imgt`
        (default), `chothia`, `kabat`, or `aho`.

        Parameters
        ----------
        file : BinaryIO
            Sequences to align in fasta or csv format.
        scheme : str = 'imgt'
            Antibody numbering scheme. Can be one of 'imgt', 'chothia',
            'kabat', or 'aho'

        Returns
        -------
        MSAFuture
            Future object awaiting the contents of the MSA upload.
        """
        job = align.abnumber_post(self.session, file, scheme=scheme)
        return MSAFuture.create(session=self.session, job=job)

    def upload_msa(self, msa_file) -> MSAFuture:
        """
        Upload an MSA from file.

        Parameters
        ----------
        msa_file
            Ready-made MSA, passed unchanged to ``align.msa_post``
            (presumably a binary file object - TODO confirm).

        Raises
        ------
        APIError
            If there is an issue with the API request.

        Returns
        -------
        MSAFuture
            Future object awaiting the contents of the MSA upload.
        """
        job = align.msa_post(self.session, msa_file=msa_file)
        return MSAFuture.create(session=self.session, job=job)

    def create_msa(self, seed: bytes) -> MSAFuture:
        """
        Construct an MSA via homology search with the seed sequence.

        Parameters
        ----------
        seed : bytes
            Seed sequence for the MSA construction.

        Raises
        ------
        APIError
            If there is an issue with the API request.

        Returns
        -------
        MSAFuture
            Future object wrapping the MSA construction job.
        """
        job = align.msa_post(self.session, seed=seed)
        return MSAFuture.create(session=self.session, job=job)

    def upload_prompt(self, prompt_file: BinaryIO) -> PromptFuture:
        """
        Directly upload a prompt.

        Bypass post_msa and prompt_post steps entirely. In this case PoET
        will use the prompt as is. You can specify multiple prompts (one per
        replicate) with an <END_PROMPT> and newline between CSVs.

        Parameters
        ----------
        prompt_file : BinaryIO
            Binary I/O object representing the prompt file.

        Raises
        ------
        APIError
            If there is an issue with the API request.

        Returns
        -------
        PromptFuture
            Future object wrapping the prompt upload job.
        """
        job = align.upload_prompt_post(
            session=self.session, prompt_file=prompt_file
        )
        return PromptFuture.create(session=self.session, job=job)

    def get_prompt(
        self, job: Job, prompt_index: int | None = None
    ) -> Iterator[list[str]]:
        """
        Get prompts for a given job.

        Parameters
        ----------
        job : Job
            The job for which to retrieve data.
        prompt_index : int, optional
            The replicate number for the prompt (only meaningful for the
            PROMPT input type, which is what this method requests).

        Returns
        -------
        Iterator[list[str]]
            A CSV reader for the response data.
        """
        return align.get_input(
            session=self.session,
            job=job,
            input_type=AlignType.PROMPT,
            prompt_index=prompt_index,
        )

    def get_seed(self, job: Job) -> Iterator[list[str]]:
        """
        Get input data for a given msa job.

        Parameters
        ----------
        job : Job
            The job for which to retrieve data.

        Returns
        -------
        Iterator[list[str]]
            A CSV reader for the response data.
        """
        return align.get_input(
            session=self.session, job=job, input_type=AlignType.INPUT
        )

    def get_msa(self, job: Job) -> Iterator[list[str]]:
        """
        Get generated MSA for a given job.

        Parameters
        ----------
        job : Job
            The job for which to retrieve data.

        Returns
        -------
        Iterator[list[str]]
            A CSV reader for the response data.
        """
        return align.get_input(
            session=self.session, job=job, input_type=AlignType.MSA
        )