Source code for proteopy.get.stat_tests

from __future__ import annotations

from collections.abc import Sequence

import pandas as pd
from anndata import AnnData

from proteopy.utils.anndata import check_proteodata
from proteopy.utils.parsers import parse_stat_test_varm_slot



[docs]
def differential_abundance_df(
    adata: AnnData,
    keys: Sequence[str] | str | None = None,
    key_group: str | None = None,
    min_logfc: float | None = None,
    max_logfc: float | None = None,
    max_pval: float | None = None,
    sort_by: str | None = None,
) -> pd.DataFrame:
    """
    Retrieve differential abundance results from ``.varm`` as a long-format DataFrame.

    Merges one or more test result DataFrames stored in ``adata.varm`` into a
    single tidy DataFrame with an added column identifying the source test.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data object containing differential abundance results in
        ``.varm``.
    keys : str | Sequence[str] | None
        One or more keys in ``adata.varm`` corresponding to differential
        abundance test results (e.g., ``"ttest_two_sample_treated-control"``
        or ``"welch_A-vs-rest"``). Mutually exclusive with ``key_group``.
    key_group : str | None
        Alternative to ``keys``. A key group identifier (e.g.,
        ``"welch_one_vs_rest"``) that selects all ``.varm`` keys belonging
        to that group. Use :func:`tests` to see available key groups.
        Mutually exclusive with ``keys``.
    min_logfc : float | None
        If provided, filter to rows where ``logfc >= min_logfc``.
    max_logfc : float | None
        If provided, filter to rows where ``logfc <= max_logfc``.
    max_pval : float | None
        If provided, filter to rows where adjusted p-value <= ``max_pval``.
        Uses ``pval_adj`` column if present, otherwise falls back to ``pval``.
    sort_by : str | None
        Column name to sort by in descending order (e.g., ``"logfc"``).

    Returns
    -------
    pandas.DataFrame
        Long-format DataFrame with columns:

        - ``var_id``: Variable identifier (from ``adata.var_names``).
        - ``test_type``: The statistical test method (e.g., ``"welch"``).
        - ``group_by``: The ``.obs`` column used for grouping.
        - ``design``: Underscore-separated design identifier (e.g., ``"A_vs_rest"``).
        - ``design_label``: Human-readable description of what the test compares.
        - ``mean1``: Mean expression in group 1.
        - ``mean2``: Mean expression in group 2.
        - ``logfc``: Log fold change.
        - ``tstat``: t-statistic.
        - ``pval``: Raw p-value.
        - ``pval_adj``: Adjusted p-value.
        - ``is_diff_abundant``: Boolean indicating significance.

    Raises
    ------
    ValueError
        If both ``keys`` and ``key_group`` are provided, or if neither is
        provided.
    TypeError
        If ``keys`` is neither a string nor a sequence of strings.
    KeyError
        If any specified key is not found in ``adata.varm``, or if
        ``key_group`` does not match any test group.

    Examples
    --------
    >>> import proteopy as pp
    >>> # Using explicit keys
    >>> df = pp.get.differential_abundance_df(
    ...     adata,
    ...     keys=["welch_treated-control", "welch_A-vs-rest"],
    ... )
    >>> sig_proteins = df[df["is_diff_abundant"]]
    >>>
    >>> # Using key_group to select all tests in a group
    >>> df = pp.get.differential_abundance_df(
    ...     adata,
    ...     key_group="welch_one_vs_rest",
    ... )
    """
    check_proteodata(adata)

    # Validate mutually exclusive parameters
    if keys is not None and key_group is not None:
        raise ValueError(
            "Cannot specify both `keys` and `key_group`. "
            "Please provide only one."
        )
    if keys is None and key_group is None:
        raise ValueError(
            "Must specify either `keys` or `key_group`."
        )

    # Resolve keys from key_group if provided
    if key_group is not None:
        tests_df = tests(adata)
        matching = tests_df[tests_df["key_group"] == key_group]
        if matching.empty:
            available_groups = tests_df["key_group"].unique().tolist()
            raise KeyError(
                f"key_group '{key_group}' not found. "
                f"Available key groups: {available_groups}"
            )
        keys_list = matching["key"].tolist()
    elif isinstance(keys, str):
        keys_list = [keys]
    elif isinstance(keys, Sequence):
        if not all(isinstance(k, str) for k in keys):
            raise TypeError(
                "`keys` must contain only strings; received "
                f"{keys!r}."
            )
        keys_list = list(keys)
    else:
        raise TypeError(
            "`keys` must be a string or a sequence of strings, "
            f"received {type(keys)!r}."
        )

    # Validate all keys exist in varm
    missing_keys = [k for k in keys_list if k not in adata.varm]
    if missing_keys:
        available = list(adata.varm.keys())
        raise KeyError(
            f"Keys not found in adata.varm: {missing_keys}. "
            f"Available keys: {available}"
        )

    # Merge DataFrames
    frames = []
    for key in keys_list:
        df = adata.varm[key].copy()
        df["var_id"] = df.index
        parsed = parse_stat_test_varm_slot(key, adata=adata)
        df["test_type"] = parsed["test_type"]
        df["group_by"] = parsed["group_by"]
        df["design"] = parsed["design"]
        frames.append(df)

    result = pd.concat(frames, ignore_index=True)

    # Reorder columns: var_id, test_type, group_by, design, then the rest
    col_order = ["var_id", "test_type", "group_by",
                 "design", "mean1", "mean2", "logfc", "tstat",
                 "pval", "pval_adj", "is_diff_abundant"]
    # Include any extra columns that might be present
    extra_cols = [c for c in result.columns if c not in col_order]
    result = result[col_order + extra_cols]

    # Apply filters
    if min_logfc is not None:
        result = result[result["logfc"] >= min_logfc]
    if max_logfc is not None:
        result = result[result["logfc"] <= max_logfc]
    if max_pval is not None:
        pval_col = "pval_adj" if "pval_adj" in result.columns else "pval"
        result = result[result[pval_col] <= max_pval]

    # Apply sorting
    if sort_by is not None:
        if sort_by not in result.columns:
            raise KeyError(
                f"sort_by column '{sort_by}' not found in result. "
                f"Available columns: {result.columns.tolist()}"
            )
        result = result.sort_values(by=sort_by, ascending=True)
        result = result.reset_index(drop=True)

    return result




[docs]
def tests(adata: AnnData) -> pd.DataFrame:
    """
    Retrieve a summary of all differential abundance tests stored in ``.varm``.

    Scans the ``.varm`` slots of the AnnData object for statistical test results
    and returns a DataFrame summarizing the tests performed.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data object containing differential abundance results in
        ``.varm``.

    Returns
    -------
    pandas.DataFrame
        DataFrame with columns:

        - ``key``: The ``.varm`` slot name.
        - ``key_group``: String identifier for the test group in format
          ``"<test_type>;<group_by>;<design_mode>"`` or
          ``"<test_type>;<group_by>;<design_mode>;<layer>"``
          if a layer was used.
        - ``test_type``: The statistical test type (e.g., ``"ttest_two_sample"``).
        - ``group_by``: The ``.obs`` column used for grouping.
        - ``design``: Underscore-separated design identifier (e.g., ``"A_vs_rest"``).
        - ``design_label``: Human-readable description of what the test compares.
        - ``design_mode``: Either ``"one_vs_rest"`` or ``"one_vs_one"``.
        - ``layer``: The layer used for the test, or ``None`` if ``.X`` was used.

    Examples
    --------
    >>> import proteopy as pp
    >>> # After running differential abundance tests
    >>> tests_df = pp.get.tests(adata)
    >>> tests_df
                             key               key_group  ...  design_mode
    0  welch;condition;A_vs_rest  welch;condition;one_vs_rest  ...  one_vs_rest
    1  welch;condition;B_vs_rest  welch;condition;one_vs_rest  ...  one_vs_rest
    """
    from proteopy.utils.parsers import parse_stat_test_varm_slot

    check_proteodata(adata)

    records = []
    for key in adata.varm.keys():
        try:
            parsed = parse_stat_test_varm_slot(key, adata=adata)
            design = parsed["design"]
            design_mode = (
                "one_vs_rest" if design.endswith("_vs_rest") else "one_vs_one"
            )
            records.append({
                "key": key,
                "test_type": parsed["test_type"],
                "group_by": parsed["group_by"],
                "design": design,
                "design_label": parsed["design_label"],
                "design_mode": design_mode,
                "layer": parsed["layer"],
            })
        except ValueError:
            # Not a stat-test slot, skip
            continue

    if not records:
        return pd.DataFrame(
            columns=["key", "key_group", "test_type", "group_by", "design",
                     "design_label", "design_mode", "layer"]
        )

    df = pd.DataFrame(records)

    # Build key_group string: "<test_type>;<group_by>;<design_mode>" or
    # "<test_type>;<group_by>;<design_mode>;<layer>" if layer is not None
    def build_key_group(row):
        parts = [row["test_type"], row["group_by"], row["design_mode"]]
        if row["layer"] is not None:
            parts.append(row["layer"])
        return ";".join(parts)

    df["key_group"] = df.apply(build_key_group, axis=1)
    df = df[["key", "key_group", "test_type", "group_by", "design",
             "design_label", "design_mode", "layer"]]

    return df