Source code for proteopy.get.proteoforms

from __future__ import annotations

from collections.abc import Sequence

import pandas as pd
from anndata import AnnData

from proteopy.utils.anndata import check_proteodata


def proteoforms_df(
    adata: AnnData,
    proteins: Sequence[str] | str | None = None,
    *,
    only_proteins: bool = False,
    score_threshold: float | None = None,
    pval_threshold: float | None = None,
    pval_adj_threshold: float | None = None,
) -> pd.DataFrame:
    """
    Collect the proteoform annotations stored in ``adata.var`` as a table.

    ``adata`` is first handed to ``check_proteodata``. The proteoform
    columns are then copied out of ``adata.var``, restricted to the
    requested proteins, stripped of rows whose raw p-value is missing,
    sorted ascending by adjusted p-value, score and cluster id, and
    finally narrowed by whichever thresholds were supplied.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Object whose ``.var`` holds the columns ``protein_id``,
        ``peptide_id``, ``cluster_id``, ``proteoform_score``,
        ``proteoform_score_pval``, ``proteoform_score_pval_adj`` and
        ``is_proteoform``.
    proteins : str | Sequence[str] | None
        Protein identifier(s) to keep. ``None`` keeps every protein.
    only_proteins : bool
        When ``True``, ``peptide_id`` and ``cluster_id`` are dropped and
        the remaining rows are de-duplicated.
    score_threshold : float | None
        Keep rows with ``proteoform_score`` greater than or equal to this.
    pval_threshold : float | None
        Keep rows with ``proteoform_score_pval`` less than or equal to
        this.
    pval_adj_threshold : float | None
        Keep rows with ``proteoform_score_pval_adj`` less than or equal to
        this.

    Returns
    -------
    pandas.DataFrame
        The filtered, sorted annotations with a fresh ``0..n-1`` index.

    Raises
    ------
    KeyError
        If any of the required columns is absent from ``adata.var``.
    TypeError
        If ``proteins`` is not a string or a sequence of strings.
    """
    check_proteodata(adata)

    var = adata.var

    # Kept as a list: it is also used as the column selector for ``.loc``.
    required = [
        "protein_id",
        "peptide_id",
        "cluster_id",
        "proteoform_score",
        "proteoform_score_pval",
        "proteoform_score_pval_adj",
        "is_proteoform",
    ]

    absent = []
    for name in required:
        if name not in var.columns:
            absent.append(name)
    if absent:
        missing = ", ".join(absent)
        raise KeyError(
            "Missing required proteoform annotation columns in `adata.var`: "
            f"{missing}"
        )

    # Normalise ``proteins`` into a plain list of identifiers.
    if proteins is None:
        wanted = var["protein_id"].tolist()
    elif isinstance(proteins, str):
        # Checked before ``Sequence`` because ``str`` is itself a Sequence.
        wanted = [proteins]
    elif isinstance(proteins, Sequence):
        if any(not isinstance(entry, str) for entry in proteins):
            raise TypeError(
                "`proteins` must contain only strings; received "
                f"{proteins!r}."
            )
        wanted = list(proteins)
    else:
        raise TypeError(
            "`proteins` must be a string or a sequence of strings, "
            f"received {type(proteins)!r}."
        )

    row_mask = var["protein_id"].isin(wanted)
    table = var.loc[row_mask, required].copy()

    # Rows without a raw p-value are discarded before sorting.
    has_pval = table["proteoform_score_pval"].notna()
    table = table[has_pval]
    table = table.sort_values(
        ["proteoform_score_pval_adj", "proteoform_score", "cluster_id"]
    )

    # (column, bound, True if the bound is a minimum rather than a maximum)
    cutoffs = (
        ("proteoform_score", score_threshold, True),
        ("proteoform_score_pval", pval_threshold, False),
        ("proteoform_score_pval_adj", pval_adj_threshold, False),
    )
    for column, bound, is_minimum in cutoffs:
        if bound is None:
            continue
        values = table[column]
        keep = values.ge(bound) if is_minimum else values.le(bound)
        table = table[keep]

    if not only_proteins:
        return table.reset_index(drop=True)

    protein_level = table.drop(columns=["peptide_id", "cluster_id"])
    return protein_level.drop_duplicates(ignore_index=True)