Source code for proteopy.get.stat_tests

from __future__ import annotations

from collections.abc import Sequence

import pandas as pd
from anndata import AnnData

from proteopy.utils.anndata import check_proteodata
from proteopy.utils.parsers import parse_stat_test_varm_slot


def differential_abundance_df(
    adata: AnnData,
    keys: Sequence[str] | str | None = None,
    key_group: str | None = None,
    min_logfc: float | None = None,
    max_logfc: float | None = None,
    max_pval: float | None = None,
    sort_by: str | None = None,
) -> pd.DataFrame:
    """
    Retrieve differential abundance results from ``.varm`` as a long-format
    DataFrame.

    Merges one or more test result DataFrames stored in ``adata.varm`` into
    a single tidy DataFrame with added columns identifying the source test.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data object containing differential abundance results in
        ``.varm``.
    keys : str | Sequence[str] | None
        One or more keys in ``adata.varm`` corresponding to differential
        abundance test results (e.g.,
        ``"ttest_two_sample_treated-control"`` or ``"welch_A-vs-rest"``).
        Mutually exclusive with ``key_group``.
    key_group : str | None
        Alternative to ``keys``. A key group identifier (e.g.,
        ``"welch_one_vs_rest"``) that selects all ``.varm`` keys belonging
        to that group. Use :func:`tests` to see available key groups.
        Mutually exclusive with ``keys``.
    min_logfc : float | None
        If provided, filter to rows where ``logfc >= min_logfc``.
    max_logfc : float | None
        If provided, filter to rows where ``logfc <= max_logfc``.
    max_pval : float | None
        If provided, filter to rows where adjusted p-value <= ``max_pval``.
        Uses ``pval_adj`` column if present, otherwise falls back to
        ``pval``.
    sort_by : str | None
        Column name to sort by in descending order (e.g., ``"logfc"``).

    Returns
    -------
    pandas.DataFrame
        Long-format DataFrame. The columns below come first (those that are
        present in the stored results), followed by any extra columns:

        - ``var_id``: Variable identifier (index of the ``.varm`` frame).
        - ``test_type``: The statistical test method (e.g., ``"welch"``).
        - ``group_by``: The ``.obs`` column used for grouping.
        - ``design``: Underscore-separated design identifier
          (e.g., ``"A_vs_rest"``).
        - ``design_label``: Human-readable description of what the test
          compares.
        - ``mean1``: Mean expression in group 1.
        - ``mean2``: Mean expression in group 2.
        - ``logfc``: Log fold change.
        - ``tstat``: t-statistic.
        - ``pval``: Raw p-value.
        - ``pval_adj``: Adjusted p-value.
        - ``is_diff_abundant``: Boolean indicating significance.

    Raises
    ------
    ValueError
        If both ``keys`` and ``key_group`` are provided, if neither is
        provided, or if ``keys`` is an empty sequence.
    TypeError
        If ``keys`` is neither a string nor a sequence of strings.
    KeyError
        If any specified key is not found in ``adata.varm``, if
        ``key_group`` does not match any test group, or if ``sort_by`` is
        not a column of the result.

    Examples
    --------
    >>> import proteopy as pp
    >>> # Using explicit keys
    >>> df = pp.get.differential_abundance_df(
    ...     adata,
    ...     keys=["welch_treated-control", "welch_A-vs-rest"],
    ... )
    >>> sig_proteins = df[df["is_diff_abundant"]]
    >>>
    >>> # Using key_group to select all tests in a group
    >>> df = pp.get.differential_abundance_df(
    ...     adata,
    ...     key_group="welch_one_vs_rest",
    ... )
    """
    check_proteodata(adata)

    # Validate mutually exclusive parameters
    if keys is not None and key_group is not None:
        raise ValueError(
            "Cannot specify both `keys` and `key_group`. "
            "Please provide only one."
        )
    if keys is None and key_group is None:
        raise ValueError(
            "Must specify either `keys` or `key_group`."
        )

    # Resolve the list of .varm keys from whichever selector was given
    if key_group is not None:
        tests_df = tests(adata)
        matching = tests_df[tests_df["key_group"] == key_group]
        if matching.empty:
            available_groups = tests_df["key_group"].unique().tolist()
            raise KeyError(
                f"key_group '{key_group}' not found. "
                f"Available key groups: {available_groups}"
            )
        keys_list = matching["key"].tolist()
    elif isinstance(keys, str):
        # A bare string is itself a Sequence, so it must be handled first
        keys_list = [keys]
    elif isinstance(keys, Sequence):
        if not all(isinstance(k, str) for k in keys):
            raise TypeError(
                "`keys` must contain only strings; received "
                f"{keys!r}."
            )
        keys_list = list(keys)
    else:
        raise TypeError(
            "`keys` must be a string or a sequence of strings, "
            f"received {type(keys)!r}."
        )

    # An empty selection would otherwise surface as pandas'
    # "No objects to concatenate" error further down.
    if not keys_list:
        raise ValueError("`keys` must contain at least one key.")

    # Validate all keys exist in varm
    missing_keys = [k for k in keys_list if k not in adata.varm]
    if missing_keys:
        available = list(adata.varm.keys())
        raise KeyError(
            f"Keys not found in adata.varm: {missing_keys}. "
            f"Available keys: {available}"
        )

    # Merge DataFrames, tagging each with the metadata parsed from its key
    frames = []
    for key in keys_list:
        df = adata.varm[key].copy()
        df["var_id"] = df.index
        parsed = parse_stat_test_varm_slot(key, adata=adata)
        df["test_type"] = parsed["test_type"]
        df["group_by"] = parsed["group_by"]
        df["design"] = parsed["design"]
        df["design_label"] = parsed["design_label"]
        frames.append(df)

    result = pd.concat(frames, ignore_index=True)

    # Reorder columns: identifying columns and standard statistics first,
    # then any extra columns. Only columns actually present are selected so
    # that results lacking e.g. ``pval_adj`` do not raise here.
    col_order = [
        "var_id", "test_type", "group_by", "design", "design_label",
        "mean1", "mean2", "logfc", "tstat", "pval", "pval_adj",
        "is_diff_abundant",
    ]
    present_cols = [c for c in col_order if c in result.columns]
    extra_cols = [c for c in result.columns if c not in col_order]
    result = result[present_cols + extra_cols]

    # Apply filters
    if min_logfc is not None:
        result = result[result["logfc"] >= min_logfc]
    if max_logfc is not None:
        result = result[result["logfc"] <= max_logfc]
    if max_pval is not None:
        pval_col = "pval_adj" if "pval_adj" in result.columns else "pval"
        result = result[result[pval_col] <= max_pval]

    # Apply sorting (descending, as documented)
    if sort_by is not None:
        if sort_by not in result.columns:
            raise KeyError(
                f"sort_by column '{sort_by}' not found in result. "
                f"Available columns: {result.columns.tolist()}"
            )
        result = result.sort_values(by=sort_by, ascending=False)

    result = result.reset_index(drop=True)
    return result


def tests(adata: AnnData) -> pd.DataFrame:
    """
    Summarize the differential abundance tests stored in ``.varm``.

    Every ``.varm`` slot name is passed to
    ``parse_stat_test_varm_slot``; slots for which the parser raises
    ``ValueError`` are treated as non-test slots and skipped. The remaining
    slots are returned as one row each.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data object containing differential abundance results in
        ``.varm``.

    Returns
    -------
    pandas.DataFrame
        DataFrame (empty, but with the same columns, when no test slot is
        found) with columns:

        - ``key``: The ``.varm`` slot name.
        - ``key_group``: String identifier for the test group in format
          ``"<test_type>;<group_by>;<design_mode>"`` or
          ``"<test_type>;<group_by>;<design_mode>;<layer>"`` if a layer
          was used.
        - ``test_type``: The statistical test type
          (e.g., ``"ttest_two_sample"``).
        - ``group_by``: The ``.obs`` column used for grouping.
        - ``design``: Underscore-separated design identifier
          (e.g., ``"A_vs_rest"``).
        - ``design_label``: Human-readable description of what the test
          compares.
        - ``design_mode``: Either ``"one_vs_rest"`` or ``"one_vs_one"``.
        - ``layer``: The layer used for the test, or ``None`` if ``.X``
          was used.

    Examples
    --------
    >>> import proteopy as pp
    >>> # After running differential abundance tests
    >>> tests_df = pp.get.tests(adata)
    >>> tests_df
                             key                    key_group  ...  design_mode
    0  welch;condition;A_vs_rest  welch;condition;one_vs_rest  ...  one_vs_rest
    1  welch;condition;B_vs_rest  welch;condition;one_vs_rest  ...  one_vs_rest
    """
    from proteopy.utils.parsers import parse_stat_test_varm_slot

    check_proteodata(adata)

    # Single source of truth for the output column layout.
    column_order = [
        "key", "key_group", "test_type", "group_by",
        "design", "design_label", "design_mode", "layer",
    ]

    rows = []
    for slot in adata.varm.keys():
        try:
            info = parse_stat_test_varm_slot(slot, adata=adata)
        except ValueError:
            # Not a stat-test slot, skip
            continue

        design = info["design"]
        if design.endswith("_vs_rest"):
            mode = "one_vs_rest"
        else:
            mode = "one_vs_one"

        rows.append({
            "key": slot,
            "test_type": info["test_type"],
            "group_by": info["group_by"],
            "design": design,
            "design_label": info["design_label"],
            "design_mode": mode,
            "layer": info["layer"],
        })

    if not rows:
        return pd.DataFrame(columns=column_order)

    summary = pd.DataFrame(rows)

    # key_group is "<test_type>;<group_by>;<design_mode>", with ";<layer>"
    # appended only when a layer was recorded for the test.
    key_groups = []
    for test_type, group_by, mode, layer in zip(
        summary["test_type"],
        summary["group_by"],
        summary["design_mode"],
        summary["layer"],
    ):
        pieces = [test_type, group_by, mode]
        if layer is not None:
            pieces.append(layer)
        key_groups.append(";".join(pieces))
    summary["key_group"] = key_groups

    return summary[column_order]