# Source code for proteopy.get.proteoforms

from __future__ import annotations

from collections.abc import Sequence

import pandas as pd
from anndata import AnnData

from proteopy.utils.anndata import check_proteodata



# [docs]
def proteoforms_df(
    adata: AnnData,
    proteins: Sequence[str] | str | None = None,
    *,
    only_proteins: bool = False,
    score_threshold: float | None = None,
    pval_threshold: float | None = None,
    pval_adj_threshold: float | None = None,
) -> pd.DataFrame:
    """
    Return proteoform peptide assignment results as a tidy dataframe.

    The proteoform annotation columns are read from ``adata.var``. Rows
    without a raw p-value (``proteoform_score_pval`` is missing) are
    dropped, and the remaining rows are sorted in ascending order by
    ``proteoform_score_pval_adj``, then ``proteoform_score``, then
    ``cluster_id`` before the optional thresholds are applied.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data object containing proteoform annotations in ``.var``.
        It is passed to ``check_proteodata`` before anything else is done.
    proteins : str | Sequence[str] | None
        Optional subset of protein identifiers to include. A single string
        selects one protein. Any other iterable of strings is accepted as
        well (for example a ``set``, a :class:`numpy.ndarray` or a
        :class:`pandas.Index`). ``None`` keeps every protein.
    only_proteins : bool
        When ``True``, drop the ``peptide_id`` and ``cluster_id`` columns and
        return the de-duplicated protein-level rows.
    score_threshold : float | None
        Minimum proteoform score to retain (inclusive).
    pval_threshold : float | None
        Maximum raw p-value allowed (inclusive).
    pval_adj_threshold : float | None
        Maximum adjusted p-value allowed (inclusive).

    Returns
    -------
    pandas.DataFrame
        Proteoform assignments filtered according to the provided arguments,
        with a fresh ``0..n-1`` index.

    Raises
    ------
    TypeError
        If ``proteins`` is neither a string nor an iterable of strings.
    KeyError
        If the expected proteoform columns are not present in ``adata.var``.
    """
    check_proteodata(adata)

    proteoform_columns = [
        "protein_id",
        "peptide_id",
        "cluster_id",
        "proteoform_score",
        "proteoform_score_pval",
        "proteoform_score_pval_adj",
        "is_proteoform",
    ]

    missing_columns = [
        column for column in proteoform_columns
        if column not in adata.var.columns
    ]

    if missing_columns:
        missing = ", ".join(missing_columns)
        raise KeyError(
            "Missing required proteoform annotation columns in `adata.var`: "
            f"{missing}"
        )

    if proteins is None:
        # No subset requested: every row is kept, so there is no need to
        # build a list of all protein ids only to match the column against
        # itself.
        selected_proteins = None
    elif isinstance(proteins, str):
        # A bare string is one identifier, not an iterable of characters.
        selected_proteins = [proteins]
    else:
        # Array-likes such as numpy arrays, pandas Index/Series and sets are
        # not `collections.abc.Sequence` instances, yet they are what callers
        # typically hold. Materialise any iterable and validate its elements
        # instead of rejecting on the container type.
        try:
            selected_proteins = list(proteins)
        except TypeError:
            raise TypeError(
                "`proteins` must be a string or a sequence of strings, "
                f"received {type(proteins)!r}."
            ) from None
        if not all(isinstance(protein, str) for protein in selected_proteins):
            raise TypeError(
                "`proteins` must contain only strings; received "
                f"{proteins!r}."
            )

    if selected_proteins is None:
        proteoforms = adata.var[proteoform_columns].copy()
    else:
        selection = adata.var["protein_id"].isin(selected_proteins)
        proteoforms = adata.var.loc[selection, proteoform_columns].copy()

    # Keep only rows that carry a raw p-value, then order them by adjusted
    # p-value, score and cluster.
    proteoforms = proteoforms[
        proteoforms["proteoform_score_pval"].notna()
    ].sort_values(
        ["proteoform_score_pval_adj", "proteoform_score", "cluster_id"]
    )

    if score_threshold is not None:
        proteoforms = proteoforms[
            proteoforms["proteoform_score"] >= score_threshold
        ]

    if pval_threshold is not None:
        proteoforms = proteoforms[
            proteoforms["proteoform_score_pval"] <= pval_threshold
        ]

    if pval_adj_threshold is not None:
        proteoforms = proteoforms[
            proteoforms["proteoform_score_pval_adj"] <= pval_adj_threshold
        ]

    if only_proteins:
        # Peptide- and cluster-level columns are removed so that rows
        # collapse to one entry per distinct protein-level record.
        return (
            proteoforms.drop(columns=["peptide_id", "cluster_id"])
            .drop_duplicates(ignore_index=True)
        )

    return proteoforms.reset_index(drop=True)