Source code for proteopy.pl.intensities

import warnings
from functools import partial
from typing import Any, Sequence
from collections.abc import Sequence as SequenceABC
from numbers import Real

import numpy as np
import pandas as pd
from pandas.api.types import is_categorical_dtype
import matplotlib.pyplot as plt
from matplotlib.axes import Axes
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.patches import Patch
from adjustText import adjust_text
import seaborn as sns
import matplotlib as mpl
import anndata as ad
import math
import os
import sys
from scipy import sparse

from proteopy.utils.anndata import check_proteodata, is_proteodata
from proteopy.utils.matplotlib import _resolve_color_scheme
from proteopy.utils.array import is_log_transformed
from proteopy.utils.functools import partial_with_docsig


[docs] def peptide_intensities( adata: ad.AnnData, protein_ids: str | Sequence[str] | None = None, order_by: str | None = None, order: Sequence[str] | None = None, groups: str | Sequence[str] | None = None, color: str | None = None, group_by: str | None = None, log_transform: float | None = None, fill_na: float | None = None, z_transform: bool = False, show_zeros: bool = True, xlab_rotation: float = 0, order_by_label_rotation: float = 0, figsize: tuple[float, float] = (15, 6), show: bool = True, save: str | os.PathLike[str] | None = None, ax: bool = False, color_scheme: Any = None, ) -> Axes | list[Axes] | None: """ Plot peptide intensities across samples for the requested proteins. Parameters ---------- adata : AnnData Proteomics :class:`~anndata.AnnData`. protein_ids : str | Sequence[str] Show peptides mapping to this protein_id. order_by : str, optional Column in ``adata.obs`` used to group and order observations on the x-axis. When ``None``, observations follow ``adata.obs_names``. order : Sequence[str], optional Explicit order of groups (when ``order_by`` is set) or observations (when ``order_by`` is ``None``). groups : str | Sequence[str], optional Restrict ``order_by`` to selected categorical levels (requires ``order_by``). The provided order determines the plotting order unless ``order`` is supplied. color : str, optional ``adata.var`` column used for per-peptide coloring. group_by : str, optional ``adata.var`` column whose categories are aggregated into a single line. Mutually exclusive with ``color``; each group is colored via ``color_scheme``. log_transform : float, optional Logarithm base (>0 and !=1). Values are transformed as ``log(value + 1, base)``. fill_na : float, optional Replace missing intensities before zero/log/z transformations when set. z_transform : bool, optional Standardize each peptide across observations after optional log transform. Skips NA. show_zeros : bool, optional Display zero intensities when ``True``; otherwise zeros become ``NaN``. xlab_rotation : float, optional Rotation angle (degrees) applied to x-axis tick labels. order_by_label_rotation : float, optional Rotation angle for the group labels drawn above grouped sections. figsize : tuple[float, float], optional Size of each generated figure passed to :func:`matplotlib.pyplot.subplots`. color_scheme : Any, optional Palette specification forwarded to :func:`proteopy.utils.matplotlib._resolve_color_scheme` for either the per-peptide ``color`` or aggregated ``group_by`` categories. show : bool, optional Display the generated figure(s) with :func:`matplotlib.pyplot.show`. save : str | os.PathLike, optional Path for saving the figure(s). Multi-protein selections are written to a PDF stack. ax : bool, optional When ``True``, return the underlying Axes objects instead of closing them. Returns ------- Axes | list[Axes] | None Axes handle(s) when ``ax`` is ``True``; otherwise ``None``. """ # Check input check_proteodata(adata) if protein_ids is None: raise ValueError( "peptide_intensities requires at least one protein_id; " "pass a string or an iterable of IDs." ) if isinstance(protein_ids, str): protein_ids = [protein_ids] if not protein_ids: raise ValueError("protein_ids cannot be empty.") if color and group_by: raise ValueError("`color` and `group_by` are mutually exclusive.") if groups is not None and order_by is None: raise ValueError("`groups` can only be used when `order_by` is provided.") if groups is None: group_levels = None elif isinstance(groups, str): group_levels = [groups] elif isinstance(groups, SequenceABC): group_levels = list(groups) else: raise TypeError("`groups` must be a string or a sequence of strings.") if group_levels is not None: if not group_levels: raise ValueError("`groups` cannot be empty.") seen_groups: set[Any] = set() deduped_groups: list[Any] = [] for grp in group_levels: if grp in seen_groups: continue seen_groups.add(grp) deduped_groups.append(grp) group_levels = deduped_groups # Format input if log_transform is not None: if log_transform <= 0: raise ValueError("log_transform must be positive.") if log_transform == 1: raise ValueError("log_transform cannot be 1.") log_base = float(log_transform) else: log_base = None var_cols = ['protein_id'] if color: if color not in adata.var.columns: raise KeyError( f"Column '{color}' is not present in adata.var; " "peptide coloring must use a .var annotation." ) var_cols.append(color) if group_by: if group_by not in adata.var.columns: raise KeyError( f"Column '{group_by}' is not present in adata.var; " "grouping requires a .var annotation." ) if group_by not in var_cols: var_cols.append(group_by) var = adata.var[var_cols].copy() var = var.reset_index(names='var_index') var = var[var['protein_id'].isin(protein_ids)] if color and color in var and is_categorical_dtype(var[color]): var[color] = var[color].cat.remove_unused_categories() if group_by and group_by in var and is_categorical_dtype(var[group_by]): var[group_by] = var[group_by].cat.remove_unused_categories() selected_vars = var['var_index'].tolist() palette_map = None if group_by: hue_labels = ( pd.Series(pd.unique(var[group_by])) .dropna() .tolist() ) elif color: hue_labels = pd.Series(pd.unique(var[color])).dropna().tolist() else: hue_labels = pd.Series(pd.unique(var['var_index'])).dropna().tolist() if hue_labels: palette_values = _resolve_color_scheme(color_scheme, hue_labels) if palette_values: palette_map = dict(zip(hue_labels, palette_values)) obs = adata.obs.reset_index(names='obs_index') if order_by: if order_by not in obs.columns: raise KeyError(f"'{order_by}' is not present in adata.obs") if not is_categorical_dtype(obs[order_by]): obs[order_by] = obs[order_by].astype('category') obs = obs[['obs_index', order_by]] if group_levels is not None: available_groups = set(obs[order_by].dropna().unique()) missing_groups = [grp for grp in group_levels if grp not in available_groups] if missing_groups: raise ValueError( "Items in 'groups' are not present in the selected " f"'{order_by}' categories: {sorted(missing_groups)}" ) obs = obs[obs[order_by].isin(group_levels)].copy() if obs.empty: raise ValueError( "No observations remain after filtering with `groups`." ) if is_categorical_dtype(obs[order_by]): obs[order_by] = obs[order_by].cat.remove_unused_categories() else: obs = obs[['obs_index']] if selected_vars: adata_subset = adata[:, selected_vars] X_matrix = adata_subset.X was_sparse = sparse.issparse(X_matrix) if was_sparse: data_matrix = X_matrix.toarray() else: data_matrix = np.asarray(X_matrix) data_matrix = np.array(data_matrix, dtype=float, copy=True) var_names = list(adata_subset.var_names) else: data_matrix = np.empty((adata.n_obs, 0), dtype=float) var_names = [] if fill_na is not None: if not np.isfinite(fill_na): raise ValueError("fill_na must be a finite float.") data_matrix = data_matrix.copy() data_matrix[np.isnan(data_matrix)] = float(fill_na) zero_mask = data_matrix == 0 X_processed = data_matrix.copy() if log_base is not None: with np.errstate(divide='ignore', invalid='ignore'): X_processed = np.log1p(X_processed) / np.log(log_base) if z_transform and selected_vars: with np.errstate(divide='ignore', invalid='ignore'): arr_mean = np.nanmean(X_processed, axis=0, keepdims=True) arr_std = np.nanstd(X_processed, axis=0, keepdims=True) arr_std[arr_std == 0] = 1.0 X_processed = (X_processed - arr_mean) / arr_std if not show_zeros and zero_mask.size: X_processed[zero_mask] = np.nan expr_df = pd.DataFrame( X_processed, columns=var_names, index=adata.obs_names, ) expr_df = expr_df.reset_index(names='obs_index') df = expr_df.melt( id_vars='obs_index', var_name='var_index', value_name='intensity', ) df = pd.merge(df, var, on='var_index', how='left') df = pd.merge(df, obs, on='obs_index', how='left') # Explicitly order the x axis observations cat_index_map = {} cats_ordered = [] if order_by: if is_categorical_dtype(obs[order_by]): base_categories = list(obs[order_by].cat.categories) else: base_categories = list(pd.unique(obs[order_by])) base_categories_set = set(base_categories) if group_levels is not None: categories = [ cat for cat in group_levels if cat in base_categories_set ] else: categories = base_categories cat_index_map = { cat: obs.loc[obs[order_by] == cat, 'obs_index'].to_list() for cat in categories } if order: missing = set(order) - set(cat_index_map) if missing: raise ValueError( "Items in 'order' are not present in the selected " f"'{order_by}' categories: {sorted(missing)}" ) cats_ordered = list(order) seen_order = set(cats_ordered) cats_ordered.extend( cat for cat in categories if cat not in seen_order ) else: cats_ordered = categories obs_index_ordered = [ idx for cat in cats_ordered for idx in cat_index_map[cat] ] else: if order: missing = set(order) - set(obs['obs_index']) if missing: raise ValueError( "Items in 'order' are not present in adata.obs_names: " f"{sorted(missing)}" ) obs_index_base = obs['obs_index'].tolist() seen_order = set(order) obs_index_ordered = list(order) + [ idx for idx in obs_index_base if idx not in seen_order ] else: obs_index_ordered = obs['obs_index'].tolist() df['obs_index'] = pd.Categorical( df['obs_index'], categories=obs_index_ordered, ordered=True) axes = [] if save and len(protein_ids) > 1: save_path = save if save.endswith('.pdf') else f'{save}.pdf' pdf_pages = PdfPages(save_path) for prot_id in protein_ids: sub_df = df[df['protein_id'] == prot_id] fig, _ax = plt.subplots(figsize=figsize) if sub_df.empty: warnings.warn(f'No data found for protein: {prot_id}') _ax.text( 0.5, 0.5, f'No data found for protein: {prot_id}', ha='center', va='center', transform=_ax.transAxes, fontsize=14, color='gray' ) _ax.set_xlim(0, 1) _ax.set_ylim(0, 1) _ax.set_xticks([]) _ax.set_yticks([]) else: #sub_df = sub_df.sort_values(by=order_by) lineplot_kwargs = dict( data=sub_df, x='obs_index', y='intensity', marker='o', dashes=False, legend='brief', ax=_ax, ) if palette_map: lineplot_kwargs['palette'] = palette_map if order_by: lineplot_kwargs['style'] = order_by if group_by: lineplot_kwargs.update( hue=group_by, ) elif color: lineplot_kwargs.update( hue=color, units='var_index', estimator=None, errorbar=None, ) else: lineplot_kwargs.update(hue='var_index') sns.lineplot(**lineplot_kwargs) # Legend handles, labels = _ax.get_legend_handles_labels() # Determine which labels correspond to the hue only (ignore style) if group_by: hue_values = sub_df[group_by].dropna().unique().astype(str) elif color: hue_values = sub_df[color].unique().astype(str) else: hue_values = sub_df['var_index'].unique().astype(str) # Keep only legend entries whose label matches a hue value new_handles_labels = [(h, l) for h, l in zip(handles, labels) if l in hue_values] if new_handles_labels: handles, labels = zip(*new_handles_labels) # unzip back into separate lists legend_title = ( group_by if group_by else color if color else 'Peptide' ) _ax.legend( handles, labels, bbox_to_anchor=(1.01, 1), loc='upper left', title=legend_title, ) # Add group separator lines obs_idxpos_map = {obs: i for i, obs in enumerate(obs_index_ordered)} if order_by: for cat in cats_ordered[:-1]: last_obs_in_cat = cat_index_map[cat][-1] _ax.axvline( x=obs_idxpos_map[last_obs_in_cat] + 0.5, ymin=0.02, ymax=0.95, color='#D8D8D8', linestyle='--' ) # Add group labels above each group section for cat in cats_ordered: group_obs = cat_index_map[cat] if not group_obs: continue # Determine x-axis group regions start = obs_idxpos_map[group_obs[0]] end = obs_idxpos_map[group_obs[-1]] mid = (start + end) / 2 rot = order_by_label_rotation if order_by_label_rotation else 0 ha_for_rot = 'center' if (rot % 360 == 0) else 'left' # Determine padded y-axis limits ymax = sub_df['intensity'].max() ymin = sub_df['intensity'].min() ypad_top = (ymax - ymin) * 0.15 ypad_bottom = (ymax - ymin) * 0.10 _ax.set_ylim(ymin - ypad_bottom, ymax + ypad_top) _ax.text( x=mid, y=ymax + ypad_top * 0.4, s=cat, ha=ha_for_rot, va='bottom', fontsize=12, fontweight='bold', rotation=rot, rotation_mode='anchor', ) plt.xticks(rotation=xlab_rotation, ha='right') _ax.set_title(prot_id) _ax.set_xlabel('Sample') _ax.set_ylabel('Intensity') plt.tight_layout() if ax: axes.append(_ax) if show: plt.show() elif save: if len(protein_ids) == 1: fig.savefig(save, bbox_inches='tight', dpi=300) else: pdf_pages.savefig(fig, bbox_inches='tight') if show: plt.show() plt.close(fig) elif show: plt.show() plt.close(fig) else: print("Warning: Plot created but not displayed, saved, or returned") plt.close(fig) if save and len(protein_ids) > 1: pdf_pages.close() if ax: return axes[0] if len(axes) == 1 else axes
docstr_header = ( "Plot peptide intensities colored by proteoforms across samples for the " "requested proteins." ) proteoform_intensities = partial_with_docsig( peptide_intensities, color = 'proteoform_id', )
[docs] def intensity_box_per_sample( adata: ad.AnnData, layer: str | None = None, order_by: str | None = None, order: Sequence[str] | None = None, group_by: str | None = None, zero_to_na: bool = False, fill_na: float | int | None = None, log_transform: float | None = None, z_transform: bool = False, ylabel: str = "Intensity", xlabel_rotation: float = 90, order_by_label_rotation: float = 0, show: bool = True, ax: Axes | None = None, save: str | os.PathLike[str] | None = None, figsize: tuple[float, float] = (8, 5), color_scheme: Any | None = None, ) -> Axes: """ Plot intensity distributions as boxplots, either per observation or pooled by a categorical grouping. Parameters ---------- adata : AnnData Proteomics :class:`~anndata.AnnData`. layer : str, optional Key in ``adata.layers`` providing the matrix to plot. When ``None``, use ``adata.X``. order_by : str, optional Column in ``adata.obs`` whose categories annotate observation groups in per-observation mode. Mutually exclusive with ``group_by``. order : Sequence[str], optional Explicit order of ``order_by`` categories. When ``group_by`` is provided, these values order the grouped categories on the x-axis. group_by : str, optional Column in ``adata.obs`` used to pool observations into a single box per category. When ``None``, each observation is shown individually. zero_to_na : bool, optional Convert zero intensities to ``NaN`` before other transforms. fill_na : float, optional Replace missing intensities with this value before transformations. log_transform : float, optional Logarithm base (>0 and !=1). Applies ``log(value + 1, base)``. z_transform : bool, optional Standardize intensities per observation after the log transform. ylabel : str, optional Label for the y-axis. xlabel_rotation : float, optional Rotation angle for observation tick labels. order_by_label_rotation : float, optional Rotation angle for the group labels drawn above the axis. show : bool, optional Display the figure when ``True`` and a new axis is created. ax : Axes, optional Axis to draw on. When ``None``, a new figure and axis are created. save : str | os.PathLike, optional Path to save the figure. ``None`` skips saving. figsize : tuple of float, optional Figure size used when creating a new axis. color_scheme : Any, optional Palette forwarded to :func:`proteopy.utils.matplotlib._resolve_color_scheme` for either grouped categories (``group_by``) or observation-level annotations (``order_by``). Returns ------- Axes Axis containing the rendered boxplot. """ check_proteodata(adata) if order_by is not None and group_by is not None: raise ValueError("`order_by` and `group_by` cannot be combined.") # Validate save target early for clearer error messages if save is not None and not isinstance(save, (str, os.PathLike)): raise TypeError("`save` must be a string, PathLike, or None.") # Select the matrix to plot (layer vs X) while preserving dense/sparse inputs if layer is not None: if layer not in adata.layers: raise KeyError(f"Layer '{layer}' not found in adata.layers.") Xsrc = adata.layers[layer] X = Xsrc.toarray() if sparse.issparse(Xsrc) else np.asarray(Xsrc, dtype=float) df = pd.DataFrame(X, index=adata.obs_names, columns=adata.var_names) else: df = adata.to_df() if zero_to_na and fill_na is not None: raise ValueError("`zero_to_na` and `fill_na` are mutually exclusive.") # Optional NA imputation happens before any log or z transforms fill_value: float | None = None if fill_na is not None: if isinstance(fill_na, bool): raise TypeError("fill_na expects a numeric scalar, not boolean.") if not np.isscalar(fill_na): raise TypeError("fill_na must be a scalar numeric value.") fill_value = float(fill_na) df = df.fillna(fill_value) if zero_to_na: df = df.replace(0, np.nan) # Apply log and z-scaling sequentially if requested if log_transform is not None: if isinstance(log_transform, bool): raise TypeError("log_transform expects a numeric base, not boolean.") if log_transform <= 0: raise ValueError("log_transform must be positive.") if log_transform == 1: raise ValueError("log_transform cannot be 1.") log_base = float(log_transform) df = np.log1p(df) / np.log(log_base) if z_transform: row_means = df.mean(axis=1, skipna=True) row_stds = df.std(axis=1, skipna=True).replace(0, np.nan) df = df.sub(row_means, axis=0).div(row_stds, axis=0) # Long-form table with one row per (obs, var) intensity df_long = ( df.assign(obs=df.index) .melt(id_vars="obs", var_name="var", value_name="intensity") .dropna(subset=["intensity"]) ) if df_long.empty: raise ValueError("No intensities remain after preprocessing; nothing to plot.") if group_by is not None and group_by != "obs" and group_by not in adata.obs.columns: raise KeyError( f"Column '{group_by}' not found in adata.obs; " "pass a valid metadata column or use group_by='obs'." ) obs_index = pd.DataFrame({"obs": adata.obs_names}) if group_by: # --- Grouped plotting branch: pool observations per category --- group_df = obs_index.copy() if group_by == "obs": group_df[group_by] = group_df["obs"] else: group_df[group_by] = adata.obs[group_by].reset_index(drop=True) df_long = df_long.merge(group_df, on="obs", how="left") group_series = group_df[group_by] df_long = df_long.dropna(subset=[group_by]) if df_long.empty: raise ValueError( "No intensities remain after assigning group_by categories; nothing to plot." ) if order: ordered_groups = list(dict.fromkeys(order)) if is_categorical_dtype(group_series): default_order = list(group_series.cat.categories) else: default_order = group_series.dropna().drop_duplicates().tolist() for cat in default_order: if cat not in ordered_groups: ordered_groups.append(cat) else: if is_categorical_dtype(group_series): ordered_groups = list(group_series.cat.categories) else: ordered_groups = group_series.dropna().drop_duplicates().tolist() ordered_groups = [ cat for cat in ordered_groups if cat in df_long[group_by].unique() ] if not ordered_groups: raise ValueError("No group_by categories remain after preprocessing.") df_long[group_by] = pd.Categorical( df_long[group_by], categories=ordered_groups, ordered=True, ) palette_values = None if color_scheme is not None: palette_values = _resolve_color_scheme(color_scheme, ordered_groups) if not palette_values: cmap = mpl.colormaps["Set2"] palette_values = cmap(np.linspace(0, 1, len(ordered_groups))).tolist() palette = dict(zip(ordered_groups, palette_values)) created_fig = False if ax is None: fig, _ax = plt.subplots(figsize=figsize) created_fig = True else: _ax = ax fig = _ax.figure sns.boxplot( data=df_long, x=group_by, y="intensity", hue=group_by, order=ordered_groups, hue_order=ordered_groups, palette=palette, legend=False, flierprops=dict(marker='.', markersize=1), ax=_ax, ) plt.setp(_ax.get_xticklabels(), rotation=xlabel_rotation, ha="right") _ax.set_xlabel(group_by) _ax.set_ylabel(ylabel) if created_fig: fig.tight_layout() if save is not None: fig.savefig(save, dpi=300, bbox_inches="tight") if show and created_fig: plt.show() return _ax # --- Per-observation plotting branch (default) --- order_col = order_by if order_by else "all" order_df = obs_index.copy() if order_by: if order_by not in adata.obs.columns: raise KeyError( f"Column '{order_by}' not found in adata.obs; " "pass a valid metadata column." ) order_df[order_col] = adata.obs[order_by].reset_index(drop=True) else: order_df[order_col] = "all" if order_by: df_long = df_long.merge(order_df, on="obs", how="left") else: df_long[order_col] = "all" if df_long[order_col].isna().any(): missing_obs = df_long.loc[df_long[order_col].isna(), "obs"].unique() preview = ", ".join(map(str, missing_obs[:5])) suffix = "..." if len(missing_obs) > 5 else "" raise ValueError( f"Missing '{order_col}' annotations for observations: {preview}{suffix}" ) group_series = order_df[order_col] if order: ordered_groups = list(dict.fromkeys(order)) if is_categorical_dtype(group_series): default_order = list(group_series.cat.categories) else: default_order = group_series.drop_duplicates().tolist() for cat in default_order: if cat not in ordered_groups: ordered_groups.append(cat) else: if is_categorical_dtype(group_series): ordered_groups = list(group_series.cat.categories) else: ordered_groups = group_series.drop_duplicates().tolist() def _obs_in_category(category): mask = group_series == category return order_df.loc[mask, "obs"].tolist() cat_index_map = {cat: _obs_in_category(cat) for cat in ordered_groups} available_obs = set(df_long["obs"]) filtered_map: dict[Any, list[Any]] = {} for cat, obs_list in cat_index_map.items(): pruned = [obs for obs in obs_list if obs in available_obs] if pruned: filtered_map[cat] = pruned cat_index_map = filtered_map if not cat_index_map: raise ValueError("No observations remain after preprocessing; nothing to plot.") x_ordered = [obs for obs_list in cat_index_map.values() for obs in obs_list] df_long["obs"] = pd.Categorical(df_long["obs"], categories=x_ordered, ordered=True) unique_groups = list(cat_index_map.keys()) if order_col != "all": colors = None if color_scheme is not None: colors = _resolve_color_scheme(color_scheme, unique_groups) if not colors: colors = mpl.colormaps["Set2"](np.linspace(0, 1, len(unique_groups))).tolist() color_map = {grp: colors[i] for i, grp in enumerate(unique_groups)} else: color_map = {"all": "C0"} sample_palette: dict[Any, Any] = {} for obs in x_ordered: group_val = df_long.loc[df_long["obs"] == obs, order_col] if group_val.empty: continue color_key = group_val.iloc[0] sample_palette[obs] = color_map[color_key] created_fig = False if ax is None: fig, _ax = plt.subplots(figsize=figsize) created_fig = True else: _ax = ax fig = _ax.figure sns.boxplot( data=df_long, x="obs", y="intensity", hue="obs", order=x_ordered, hue_order=x_ordered, palette=sample_palette, flierprops=dict(marker='.', markersize=1), ax=_ax, ) if _ax.get_legend() is not None: _ax.get_legend().remove() plt.setp(_ax.get_xticklabels(), rotation=xlabel_rotation, ha="right") _ax.set_xlabel("") _ax.set_ylabel(ylabel) obs_idx_map = {obs: i for i, obs in enumerate(x_ordered)} ymax = df_long["intensity"].max() for cat, obs_list in cat_index_map.items(): start_idx = obs_idx_map[obs_list[0]] end_idx = obs_idx_map[obs_list[-1]] mid_idx = (start_idx + end_idx) / 2 _ax.text( x=mid_idx, y=ymax * 1.05, s=str(cat), ha="center", va="bottom", fontsize=12, fontweight="bold", rotation=order_by_label_rotation, ) if created_fig: fig.tight_layout() if save is not None: fig.savefig(save, dpi=300, bbox_inches="tight") if show and created_fig: plt.show() return _ax
[docs] def intensity_hist( adata, layer: str | None = None, color_imputed: bool = False, bool_layer: str | None = None, log_transform: float | int | None = None, ignore_warning: bool = False, fill_na: float | int | None = None, zero_to_nan: bool = False, bins: int = 60, density: bool = True, kde: bool = False, color_scheme: Any = None, xlim: tuple[float | int, float | int] | None = None, alpha: float = 0.6, title: str | None = None, legend_loc: str = "upper right", figsize=(7, 4), per_obs: bool = False, samples: list | None = None, ncols: int = 4, sharex: bool = True, sharey: bool = True, show: bool = True, ax: bool = False, save: str | os.PathLike[str] | None = None, ) -> Axes | None: """ Plot histogram(s) of var intensities, optionally colored by imputation status. Parameters ---------- adata : AnnData Proteomics :class:`~anndata.AnnData` containing the intensity matrix. layer : str | None, optional Layer to use for intensities; defaults to ``adata.X`` when ``None``. color_imputed : bool, optional When ``True``, overlay imputed values using the layer-specific mask; otherwise plot a single distribution of all intensities. bool_layer : str | None, optional Layer key containing the imputation mask. When ``None`` and ``color_imputed`` is ``True``, use ``imputation_mask_<layer>`` (with ``layer="X"`` for the main matrix). log_transform : float | int | None, optional Logarithm base (>0 and !=1). When ``None`` skip log-transforming; otherwise apply ``log(value + 1, base)`` after validating the input (Defaut = None). ignore_warning : bool, optional When ``True``, force the log transform even if the data already appear log-transformed according to :func:`copro.utils.array.is_log_transformed`. fill_na : float | int | None, optional Replace missing values with this constant before log transformation. zero_to_nan : bool, optional When True, replace exact zeros with ``NaN`` before plotting. bins : int, optional Number of histogram bins passed to ``numpy.histogram_bin_edges``. density : bool, optional Plot density instead of counts. kde : bool, optional Overlay kernel density estimate curves. color_scheme : Any, optional Palette specification forwarded to :func:`proteopy.utils.matplotlib._resolve_color_scheme`. xlim : tuple[float | int, float | int] | None, optional Explicit x-axis limits ``(xmin, xmax)`` applied to all histograms. alpha : float, optional Histogram transparency (0-1 range). title : str | None, optional Custom title for the plot; defaults are auto-generated. legend_loc : str, optional Location argument forwarded to :func:`matplotlib.axes.Axes.legend`. figsize : tuple[float, float], optional Figure size passed to :func:`matplotlib.pyplot.subplots`. per_obs : bool, optional When ``True``, draw per-observation facets; otherwise aggregate all values. samples : list | None, optional Optional ordered subset of observations (by index or name) for ``per_obs`` plots. ncols : int, optional Number of columns in the per-observation facet grid. sharex, sharey : bool, optional Whether subplots share their x- or y-axes in per-observation mode. show : bool, optional If True, call plt.show() at the end. ax : bool, optional When ``True`` and ``per_obs`` is ``False``, return the Axes handle and skip automatic plotting even if ``show`` is ``True``. save : str | Path | None, optional Path where the figure should be written. When ``None``, no file is saved. Returns ------- Axes | None Axes handle when ``ax`` is ``True`` (single histogram mode); otherwise ``None``. """ # Basic parameter validation to fail fast on misconfigured inputs bool_params = { "color_imputed": color_imputed, "ignore_warning": ignore_warning, "zero_to_nan": zero_to_nan, "density": density, "kde": kde, "per_obs": per_obs, "sharex": sharex, "sharey": sharey, "show": show, "ax": ax, } for name, value in bool_params.items(): if not isinstance(value, bool): raise TypeError(f"`{name}` must be a boolean.") if layer is not None and not isinstance(layer, str): raise TypeError("`layer` must be a string or None.") if bool_layer is not None and not isinstance(bool_layer, str): raise TypeError("`bool_layer` must be a string or None.") if not isinstance(bins, int) or bins <= 0: raise ValueError("`bins` must be a positive integer.") if not isinstance(ncols, int) or ncols <= 0: raise ValueError("`ncols` must be a positive integer.") if not isinstance(alpha, Real): raise TypeError("`alpha` must be numeric.") if not 0 <= float(alpha) <= 1: raise ValueError("`alpha` must be between 0 and 1.") if title is not None and not isinstance(title, str): raise TypeError("`title` must be a string or None.") if not isinstance(legend_loc, str): raise TypeError("`legend_loc` must be a string.") if figsize is None or not isinstance(figsize, SequenceABC) or len(figsize) != 2: raise TypeError("`figsize` must be a length-2 sequence of numbers.") try: figsize = (float(figsize[0]), float(figsize[1])) except (TypeError, ValueError): raise TypeError("`figsize` entries must be numeric.") if samples is not None: if ( not isinstance(samples, SequenceABC) or isinstance(samples, (str, bytes)) ): raise TypeError("`samples` must be a sequence of indices or names.") if save is not None and not isinstance(save, (str, os.PathLike)): raise TypeError("`save` must be a path-like object or None.") if xlim is not None: if not isinstance(xlim, SequenceABC) or len(xlim) != 2: raise TypeError("`xlim` must be a tuple (xmin, xmax).") xmin, xmax = xlim if not isinstance(xmin, Real) or not isinstance(xmax, Real): raise TypeError("`xlim` values must be numeric.") xmin = float(xmin) xmax = float(xmax) if not np.isfinite(xmin) or not np.isfinite(xmax): raise ValueError("`xlim` values must be finite.") if xmin >= xmax: raise ValueError("`xlim` must satisfy xmin < xmax.") x_limits = (xmin, xmax) else: x_limits = None if ax and per_obs: raise ValueError("`ax` can only be used when per_obs is False.") check_proteodata(adata) # --- pull data --- Xsrc = adata.layers[layer] if layer is not None else adata.X X = Xsrc.toarray() if sparse.issparse(Xsrc) else np.asarray(Xsrc, dtype=float) # Resolve the imputation mask layer if coloring measured vs imputed mask_layer_key: str | None = None if color_imputed: mask_layer_key = bool_layer if mask_layer_key is None: mask_target = str(layer) if layer is not None else "X" mask_layer_key = f"imputation_mask_{mask_target}" if mask_layer_key not in adata.layers: raise KeyError( f"'{mask_layer_key}' not found in adata.layers. " "Set color_imputed=False or provide `bool_layer` explicitly." ) Bsrc = adata.layers[mask_layer_key] B = Bsrc.toarray() if sparse.issparse(Bsrc) else np.asarray(Bsrc) if B.shape != X.shape: raise ValueError(f"Shape mismatch: data {X.shape} vs {mask_layer_key} {B.shape}") else: B = None # Determine log transform base and guard against double-logging if log_transform is None: log_base = None else: if not isinstance(log_transform, Real): raise TypeError("log_transform must be a numeric value or None.") log_base = float(log_transform) if log_base <= 0: raise ValueError("log_transform must be positive.") if math.isclose(log_base, 1.0): raise ValueError("log_transform cannot be 1.") is_log, stats = is_log_transformed(adata, layer=layer) if is_log and not ignore_warning: raise ValueError( "Input appears already log-transformed. Stats: " f"{stats}. Pass ignore_warning=True to force another log." ) Y = X.copy() Y = Y.astype(float, copy=False) Y[~np.isfinite(Y)] = np.nan if fill_na is not None and zero_to_nan: raise ValueError("fill_na and zero_to_nan cannot be used together.") if fill_na is not None: if not isinstance(fill_na, Real): raise TypeError("fill_na must be a numeric value or None.") fill_value = float(fill_na) if not np.isfinite(fill_value): raise ValueError("fill_na must be a finite numeric value.") np.nan_to_num(Y, copy=False, nan=fill_value) if zero_to_nan: Y[Y == 0] = np.nan if log_base is not None: Y[Y <= -1] = np.nan Y = np.log1p(Y) / math.log(log_base) # palette & order # Resolve colors for the measured/imputed legend default_palette = {"Measured": "#4C78A8", "Imputed": "#F58518"} status_labels = ["Measured", "Imputed"] if color_imputed else ["Measured"] resolved_colors = _resolve_color_scheme(color_scheme, status_labels) if not resolved_colors: resolved_colors = [default_palette[label] for label in status_labels] palette_map = dict(zip(status_labels, resolved_colors)) hue_order = status_labels if color_imputed else None measured_color = palette_map.get("Measured", default_palette["Measured"]) value_col = "intensity_value" if log_base is None: xlabel = "Intensity" descriptor = "" else: base_str = f"{log_base:g}" xlabel = f"Intensity (log{base_str}(x + 1))" descriptor = f"log{base_str}(x + 1)" # ------- Single (combined) histogram ------- if not per_obs: # Flatten matrix for a single overall histogram vals = Y.ravel() m = np.isfinite(vals) vals = vals[m] if vals.size == 0: raise ValueError("No finite values to plot after preprocessing.") bin_edges = np.histogram_bin_edges(vals, bins=bins) fig, _ax = plt.subplots(figsize=figsize) if B is not None: flags = B.astype(bool).ravel()[m] status = np.where(flags, "Imputed", "Measured") df = pd.DataFrame({value_col: vals, "status": status}) present = [h for h in hue_order if (df["status"] == h).any()] sns.histplot( data=df, x=value_col, hue="status", hue_order=present, bins=bin_edges, stat=("density" if density else "count"), multiple="layer", common_norm=False, palette=palette_map, alpha=alpha, edgecolor=None, ax=_ax, legend=False, ) if kde: for k, g in df.groupby("status"): if len(g) > 1: sns.kdeplot(g[value_col], ax=_ax, color=palette_map.get(k), lw=1.5) handles = [ Patch( facecolor=palette_map[h], edgecolor="none", alpha=alpha, label=h, ) for h in present ] _ax.legend(handles=handles, title="Status", loc=legend_loc, frameon=False) else: df = pd.DataFrame({value_col: vals}) sns.histplot( data=df, x=value_col, bins=bin_edges, stat=("density" if density else "count"), color=measured_color, alpha=alpha, edgecolor=None, ax=_ax, ) if kde and len(df) > 1: sns.kdeplot(df[value_col], ax=_ax, color=measured_color, lw=1.5) _ax.set_xlabel(xlabel) _ax.set_ylabel("Density" if density else "Count") default_title = f"Intensity histogram ({descriptor})" _ax.set_title(title or default_title) if x_limits is not None: _ax.set_xlim(x_limits) # save/show if save is not None: fig.savefig(save, dpi=300, bbox_inches='tight') if show and not ax: plt.show() if ax: return _ax return None # ------- Per-observation small multiples ------- # select observations if samples is None: idx = np.arange(adata.n_obs) labels = adata.obs_names.to_numpy() else: idx, labels = [], [] for s in samples: if isinstance(s, (int, np.integer)): idx.append(int(s)); labels.append(adata.obs_names[int(s)]) else: where = np.where(adata.obs_names == str(s))[0] if where.size == 0: raise KeyError(f"Sample '{s}' not in adata.obs_names") idx.append(int(where[0])); labels.append(str(s)) idx = np.asarray(idx, dtype=int) labels = np.asarray(labels, dtype=object) # global bins across all selected samples all_vals = [] for i in idx: vi = Y[i, :] m = np.isfinite(vi) if m.any(): all_vals.append(vi[m]) if len(all_vals) == 0: raise ValueError("No finite values to plot after preprocessing across selected observations.") bin_edges = np.histogram_bin_edges(np.concatenate(all_vals), bins=bins) n = len(idx) ncols = max(1, int(ncols)) nrows = int(math.ceil(n / ncols)) fig, axes = plt.subplots( nrows, ncols, figsize=figsize, squeeze=False, sharex=sharex, sharey=sharey, ) present_any = set() if B is not None else None for k, i in enumerate(idx): r, c = divmod(k, ncols) _ax = axes[r, c] vi = Y[i, :] bi = B[i, :].astype(bool) if B is not None else None m = np.isfinite(vi) vi = vi[m] if bi is not None: bi = bi[m] if vi.size == 0: _ax.set_visible(False) continue if bi is not None: status = np.where(bi, "Imputed", "Measured") df_i = pd.DataFrame({value_col: vi, "status": status}) present = [h for h in hue_order if (df_i["status"] == h).any()] present_any.update(present) sns.histplot( data=df_i, x=value_col, hue="status", hue_order=present, bins=bin_edges, stat=("density" if density else "count"), multiple="layer", common_norm=False, palette=palette_map, alpha=alpha, edgecolor=None, ax=_ax, legend=False, ) if kde: for lab in present: g = df_i[df_i["status"] == lab] if len(g) > 1: sns.kdeplot(g[value_col], ax=_ax, color=palette_map.get(lab), lw=1.2) else: df_i = pd.DataFrame({value_col: vi}) sns.histplot( data=df_i, x=value_col, bins=bin_edges, stat=("density" if density else "count"), color=measured_color, alpha=alpha, edgecolor=None, ax=_ax, legend=False, ) if kde and len(df_i) > 1: sns.kdeplot(df_i[value_col], ax=_ax, color=measured_color, lw=1.2) _ax.set_title(str(labels[k])) if r == nrows - 1: _ax.set_xlabel(xlabel) else: _ax.set_xlabel("") if c == 0: _ax.set_ylabel("Density" if density else "Count") else: _ax.set_ylabel("") if x_limits is not None: _ax.set_xlim(x_limits) # hide any extra axes for k in range(n, nrows * ncols): r, c = divmod(k, ncols) axes[r, c].set_visible(False) # global legend (figure-level unless user asked for 'best') if present_any is not None and present_any: present_any = [h for h in hue_order if h in present_any] handles = [ Patch(facecolor=palette_map[h], edgecolor="none", alpha=alpha, label=h) for h in present_any ] if legend_loc == "best": axes[0, 0].legend(handles=handles, title="Status", loc="best", frameon=False) else: fig.legend(handles=handles, title="Status", loc=legend_loc, frameon=False) per_obs_title = f"Intensity histograms per observation ({descriptor})" plt.suptitle(title or per_obs_title, y=0.995, fontsize=12) plt.tight_layout(rect=[0, 0, 1, 0.98]) # save/show if save is not None: fig.savefig(save, dpi=300, bbox_inches='tight') if show: plt.show() return None
docstr_header = ( "Plot a histogram of var intensities per observation, optionally colored " "by imputation status." ) intensity_hist_per_obs = partial_with_docsig( intensity_hist, per_obs=True, docstr_header=docstr_header, )
[docs] def abundance_rank( adata: ad.AnnData, color: str | None = None, highlight_vars: Sequence[str] | None = None, var_labels_key: str | None = None, var_label_fontsize: float = 8, layer: str | None = None, summary_method: str = "average", log_transform: float | None = 10, input_space: str = "auto", force: bool = False, zero_to_na: bool = False, fill_na: float | int | None = None, figsize: tuple[float, float] = (8, 6), title: str | None = None, xlabel: str | None = None, ylabel: str | None = None, alpha: float = 0.6, s: float = 20, show: bool = True, save: str | os.PathLike[str] | None = None, ax: bool = False, color_scheme: Any = None, ) -> Axes | None: """ Plot variable intensities vs their abundance rank. A typical MS proteomics plot to assess dynamic range and intensity distribution. Each point represents a variable (protein/peptide) with its y-value being the summary statistic (e.g., average, median) across observations (computation ignores NaNs). When ``color`` is specified, summary values and ranks are computed separately for each group. The plot shows one dot per variable per group, with all groups superimposed. When ``color`` is ``None``, a single dot per variable is plotted based on the global summary. Parameters ---------- adata : AnnData Proteomics :class:`~anndata.AnnData`. color : str, optional Column in ``adata.obs`` used to subset observations into groups. Summary values and ranks are computed separately per group, and all groups are plotted superimposed. When ``None``, global summary across all observations is used. highlight_vars : Sequence[str], optional List of variable names to highlight with text labels using adjustText. When ``color`` is specified, each variable is labeled once per group at its group-specific position. var_labels_key : str, optional Column in ``adata.var`` containing alternative labels for highlighted variables. When specified, these labels are displayed instead of the variable names. Useful for displaying gene symbols instead of IDs. var_label_fontsize : float, optional Font size for highlighted variable labels. layer : str, optional Key in ``adata.layers`` providing the intensity matrix. When ``None``, uses ``adata.X``. summary_method : str, optional Method to summarize intensities across observations per variable. Options: ``'sum'``, ``'average'``, ``'median'``, ``'max'``. NAs are ignored; if all values are NA, the result is NA. log_transform : float, optional Base for log transformation of intensities. Defines the target space. When ``None``, no transformation is applied (linear space). input_space : str, optional Specifies the input data space: ``'log'``, ``'linear'``, or ``'auto'``. When ``'auto'``, uses heuristics to infer whether data is already log-transformed. force : bool, optional When ``True``, suppress warnings about input space mismatches. zero_to_na : bool, optional Convert zero intensities to ``NaN`` before transformations. fill_na : float | int, optional Replace missing values with this constant before transformations. figsize : tuple[float, float], optional Figure dimensions (width, height) in inches. title : str, optional Plot title. Defaults to ``"Abundance Rank Plot"``. xlabel : str, optional Label for x-axis. Defaults to ``"Rank"``. ylabel : str, optional Label for y-axis. Auto-generated based on transformation applied. alpha : float, optional Point transparency (0-1 range). s : float, optional Point size for the scatter plot. show : bool, optional Display the figure with ``matplotlib.pyplot.show()``. save : str | os.PathLike, optional Path to save the figure. ``None`` skips saving. ax : bool, optional Return the underlying Axes object instead of ``None``. color_scheme : Any, optional Palette specification forwarded to :func:`proteopy.utils.matplotlib._resolve_color_scheme`. Returns ------- Axes | None The Matplotlib Axes object if ``ax=True``, otherwise ``None``. Raises ------ ValueError If ``input_space`` is ``'log'`` and ``log_transform`` is ``None`` (cannot convert log to linear without knowing the base), if both ``zero_to_na`` and ``fill_na`` are set, or if no valid data remains. KeyError If ``color`` column is not in ``adata.obs``, if ``layer`` is not in ``adata.layers``, or if ``highlight_vars`` contains variables not in ``adata.var_names``. Notes ----- **Input/Output Space Logic:** - ``input_space='auto'``: Heuristically infers whether data is log-transformed. Prints an informational message about the inference. - If input is inferred as log and target is also log (``log_transform`` is set): No transformation; prints a message that the log base is ignored since data is already in log space. - If input is inferred as log and target is linear (``log_transform=None``): Raises an error because the original log base is unknown. - If ``input_space`` is explicitly set (not ``'auto'``): - ``input_space='linear'`` with ``log_transform`` set: Applies log transformation. - ``input_space='log'`` with ``log_transform=None``: Raises a warning (or error if ``force=False``). - Matching spaces: No transformation. - When ``force=False`` and the inferred space doesn't match the declared ``input_space``, a warning is raised. Examples -------- Basic abundance rank plot: >>> proteopy.pl.abundance_rank(adata) Color by sample condition: >>> proteopy.pl.abundance_rank(adata, color="condition") Highlight specific proteins: >>> proteopy.pl.abundance_rank( ... adata, ... highlight_vars=["ProteinA", "ProteinB"], ... ) """ check_proteodata(adata) # --- Parameter validation --- if input_space not in ("auto", "log", "linear"): raise ValueError( "input_space must be 'auto', 'log', or 'linear'." ) if zero_to_na and fill_na is not None: raise ValueError("`zero_to_na` and `fill_na` are mutually exclusive.") if color is not None and color not in adata.obs.columns: raise KeyError(f"Column '{color}' not found in adata.obs.") if layer is not None and layer not in adata.layers: raise KeyError(f"Layer '{layer}' not found in adata.layers.") if highlight_vars is not None: missing_vars = [v for v in highlight_vars if v not in adata.var_names] if missing_vars: raise KeyError( f"Variables not found in adata.var_names: {missing_vars}" ) if var_labels_key is not None and var_labels_key not in adata.var.columns: raise KeyError(f"Column '{var_labels_key}' not found in adata.var.") if log_transform is not None: if not isinstance(log_transform, (int, float)): raise TypeError("log_transform must be a numeric value or None.") if log_transform <= 0: raise ValueError("log_transform must be positive.") if log_transform == 1: raise ValueError("log_transform cannot be 1.") if not isinstance(alpha, (int, float)) or not 0 <= alpha <= 1: raise ValueError("alpha must be a number between 0 and 1.") if not isinstance(s, (int, float)) or s <= 0: raise ValueError("s must be a positive number.") valid_summary_methods = ("sum", "average", "median", "max") if summary_method not in valid_summary_methods: raise ValueError( f"summary_method must be one of {valid_summary_methods}, " f"got '{summary_method}'." ) # --- Get data matrix --- if layer is not None: Xsrc = adata.layers[layer] else: Xsrc = adata.X if sparse.issparse(Xsrc): X = Xsrc.toarray() else: X = np.asarray(Xsrc, dtype=float) X = X.copy() # --- Handle NA and zero values --- if fill_na is not None: if not isinstance(fill_na, (int, float)): raise TypeError("fill_na must be a numeric value.") if not np.isfinite(fill_na): raise ValueError("fill_na must be a finite numeric value.") X[np.isnan(X)] = float(fill_na) if zero_to_na: X[X == 0] = np.nan # --- Determine input space and apply transformations --- inferred_is_log, infer_stats = is_log_transformed(adata, layer=layer) # Determine target space target_is_log = log_transform is not None # Handle input_space='auto' if input_space == "auto": if inferred_is_log: actual_input_is_log = True else: actual_input_is_log = False else: # Explicit input_space actual_input_is_log = (input_space == "log") # Check for mismatch between declared and inferred if actual_input_is_log != inferred_is_log and not force: inferred_str = "log" if inferred_is_log else "linear" warnings.warn( f"Declared input_space='{input_space}' but data appears to be " f"'{inferred_str}' (p95={infer_stats['p95']:.2f}, " f"frac_negative={infer_stats['frac_negative']:.4f}). " f"Set force=True to suppress this warning.", UserWarning, ) # Apply transformation logic transform_applied = None if actual_input_is_log and target_is_log: # Both log: no transformation needed print( "Input is already log-transformed; ignoring log_transform " f"parameter (base={log_transform})." ) transform_applied = "none (already log)" ylabel_default = "Intensity (log)" elif actual_input_is_log and not target_is_log: # Log to linear: cannot do without knowing the original base raise ValueError( "Cannot convert log-transformed data to linear space without " "knowing the original log base. Either set log_transform to keep " "data in log space, or provide data in linear space." ) elif not actual_input_is_log and target_is_log: # Linear to log: apply transformation log_base = float(log_transform) with np.errstate(divide='ignore', invalid='ignore'): X = np.log1p(X) / np.log(log_base) transform_applied = f"log{log_base:g}p1" ylabel_default = f"Intensity ({transform_applied})" else: # Both linear: no transformation transform_applied = "none (linear)" ylabel_default = "Intensity" # --- Define summary function --- def compute_summary(arr, method): """Compute summary statistic ignoring NAs.""" with warnings.catch_warnings(): warnings.filterwarnings("ignore", "All-NaN slice encountered") warnings.filterwarnings("ignore", "Mean of empty slice") if method == "sum": return np.nansum(arr, axis=0) elif method == "average": return np.nanmean(arr, axis=0) elif method == "median": return np.nanmedian(arr, axis=0) elif method == "max": return np.nanmax(arr, axis=0) # --- Compute summary values and ranks --- var_names = adata.var_names.to_numpy() n_vars = len(var_names) # Store summary values and ranks per group (or None for global) group_summary_values = {} group_rank_positions = {} if color is not None: # Get unique groups from the color column groups = adata.obs[color].dropna().unique().tolist() for group in groups: # Get observations belonging to this group group_obs_mask = adata.obs[color] == group group_obs_indices = np.where(group_obs_mask)[0] if len(group_obs_indices) == 0: continue # Compute summary using only this group's observations X_group = X[group_obs_indices, :] summary_vals = compute_summary(X_group, summary_method) # Create ranking for this group (descending by summary value) # Handle NaN values in ranking rank_position = np.full(n_vars, np.nan) valid_mask = np.isfinite(summary_vals) if valid_mask.any(): valid_indices = np.where(valid_mask)[0] valid_summary = summary_vals[valid_mask] rank_order = np.argsort(-valid_summary) for rank, idx in enumerate(rank_order): rank_position[valid_indices[idx]] = rank group_summary_values[group] = summary_vals group_rank_positions[group] = rank_position else: # Global summary across all observations summary_vals = compute_summary(X, summary_method) valid_mask = np.isfinite(summary_vals) if not valid_mask.any(): raise ValueError("No valid intensities remain after preprocessing.") rank_position = np.full(n_vars, np.nan) valid_indices = np.where(valid_mask)[0] valid_summary = summary_vals[valid_mask] rank_order = np.argsort(-valid_summary) for rank, idx in enumerate(rank_order): rank_position[valid_indices[idx]] = rank group_summary_values[None] = summary_vals group_rank_positions[None] = rank_position # --- Build plotting DataFrame --- records = [] if color is not None: for group in groups: if group not in group_summary_values: continue summary_vals = group_summary_values[group] rank_pos = group_rank_positions[group] for j, var_name in enumerate(var_names): summary_val = summary_vals[j] rank_val = rank_pos[j] if not np.isfinite(summary_val) or not np.isfinite(rank_val): continue records.append({ 'var': var_name, 'intensity': summary_val, 'rank': rank_val, 'color_group': group, }) else: summary_vals = group_summary_values[None] rank_pos = group_rank_positions[None] for j, var_name in enumerate(var_names): summary_val = summary_vals[j] rank_val = rank_pos[j] if not np.isfinite(summary_val) or not np.isfinite(rank_val): continue records.append({ 'var': var_name, 'intensity': summary_val, 'rank': rank_val, }) if not records: raise ValueError("No valid data points remain after computing summaries.") plot_df = pd.DataFrame(records) # --- Create plot --- fig, _ax = plt.subplots(figsize=figsize) if color is not None: palette_values = _resolve_color_scheme(color_scheme, groups) if not palette_values: cmap = mpl.colormaps.get_cmap("tab10") palette_values = [cmap(i % 10) for i in range(len(groups))] palette = dict(zip(groups, palette_values)) for group in groups: group_df = plot_df[plot_df['color_group'] == group] _ax.scatter( group_df['rank'], group_df['intensity'], c=[palette[group]], alpha=alpha, s=s, label=str(group), ) _ax.legend( title=color, bbox_to_anchor=(1.02, 1), loc='upper left', ) else: _ax.scatter( plot_df['rank'], plot_df['intensity'], alpha=alpha, s=s, color='steelblue', ) # --- Add highlighted variable labels --- if highlight_vars is not None and len(highlight_vars) > 0: # Build label mapping if var_labels_key is not None: var_to_label = dict(zip(var_names, adata.var[var_labels_key])) else: var_to_label = {v: v for v in var_names} texts = [] highlight_x = [] highlight_y = [] if color is not None: # When coloring by groups, highlight each var in each group for group in groups: if group not in group_summary_values: continue summary_vals = group_summary_values[group] rank_pos = group_rank_positions[group] grp_color = palette.get(group, 'red') for var in highlight_vars: var_idx = np.where(var_names == var)[0] if len(var_idx) == 0: continue var_idx = var_idx[0] var_rank = rank_pos[var_idx] var_summary = summary_vals[var_idx] if not np.isfinite(var_summary) or not np.isfinite(var_rank): continue label = str(var_to_label.get(var, var)) highlight_x.append(var_rank) highlight_y.append(var_summary) texts.append( _ax.text( var_rank, var_summary, label, fontsize=var_label_fontsize, ) ) _ax.scatter( [var_rank], [var_summary], color=grp_color, s=s * 2, zorder=10, marker='o', edgecolors='black', linewidths=0.5, ) else: # Global: single marker per var summary_vals = group_summary_values[None] rank_pos = group_rank_positions[None] for var in highlight_vars: var_idx = np.where(var_names == var)[0] if len(var_idx) == 0: continue var_idx = var_idx[0] var_rank = rank_pos[var_idx] var_summary = summary_vals[var_idx] if not np.isfinite(var_summary) or not np.isfinite(var_rank): continue label = str(var_to_label.get(var, var)) highlight_x.append(var_rank) highlight_y.append(var_summary) texts.append( _ax.text( var_rank, var_summary, label, fontsize=var_label_fontsize, ) ) _ax.scatter( [var_rank], [var_summary], color='red', s=s * 2, zorder=10, marker='o', edgecolors='black', linewidths=0.5, ) if texts: adjust_text( texts, x=highlight_x, y=highlight_y, ax=_ax, arrowprops=dict(arrowstyle="-", color="0.4", lw=0.7), expand=(1.5,1.5), force_text=0.5, force_explode=(4.4, 4.4), avoid_self=True, only_move={'text': 'x+y'}, ) # --- Set labels and title --- _ax.set_xlabel(xlabel or "Rank") _ax.set_ylabel(ylabel or ylabel_default) _ax.set_title(title or "Abundance Rank Plot") plt.tight_layout() # --- Save/Show logic --- if save is not None: fig.savefig(save, dpi=300, bbox_inches="tight") if show: plt.show() if ax: return _ax if not save and not show and not ax: warnings.warn( "Plot created but not displayed, saved, or returned. " "Set show=True, save to a path, or ax=True.", UserWarning, ) plt.close(fig) return None
[docs] def box( adata: ad.AnnData, keys: str | list[str], group_by: str | None = None, order: list[str] | None = None, layer: str | None = None, zero_to_na: bool = False, fill_na: float | int | None = None, log_transform: float | None = None, color_scheme: str | dict | list | None = None, figsize: tuple[float, float] = (8.0, 5.0), title: str | None = None, xlabel_rotation: float = 0, ylabel: str | None = None, show_points: bool = False, point_size: float = 3, point_alpha: float = 0.7, show: bool = True, save: str | os.PathLike[str] | None = None, ax: bool = False, ) -> Axes | list[Axes] | None: """ Boxplot of intensities for one or more variables. Parameters ---------- adata : AnnData :class:`~anndata.AnnData` with intensity data in ``.X`` or a specified layer. keys : str | list[str] Variable name(s) present in ``adata.var_names`` to plot. group_by : str | None, optional Column in ``adata.obs`` whose unique values define the x-axis categories. When ``None``, all observations are pooled into a single box. order : list[str] | None, optional Subset and order groups when ``group_by`` is set. Only groups listed in ``order`` are shown, in the given sequence. Ignored when ``group_by`` is ``None``. layer : str | None, optional Key in ``adata.layers`` providing the intensity matrix. When ``None``, uses ``adata.X``. zero_to_na : bool, optional Convert zero intensities to ``NaN`` before other transforms. fill_na : float | int | None, optional Replace missing intensities with this value before transformations. log_transform : float | None, optional Base for log transformation. When set, applies ``log(value + 1, base)``. color_scheme : str | dict | list | None, optional Color mapping for groups. Accepts a named Matplotlib colormap, a dict mapping group names to colors, or a list of colors. figsize : tuple[float, float], optional Figure dimensions (width, height) in inches. title : str | None, optional Plot title. When ``None``, defaults to the variable name. xlabel_rotation : float, optional Rotation angle (degrees) for x-axis tick labels. ylabel : str | None, optional Label for the y-axis. show_points : bool, optional Overlay individual observations as a strip plot on top of each box. point_size : float, optional Marker size for the individual points when ``show_points`` is ``True``. point_alpha : float, optional Opacity for individual points when ``show_points`` is ``True``. show : bool, optional Call ``matplotlib.pyplot.show()`` to display the plot. save : str | os.PathLike | None, optional File path to save the figure. For a single key, the format is inferred from the file extension (e.g. ``.png``, ``.pdf``, ``.svg``). For multiple keys, saved as a multi-page PDF. ax : bool, optional If ``True``, return the underlying Axes object(s) instead of ``None``. Returns ------- Axes | list[Axes] | None The Matplotlib Axes object(s) if ``ax=True``, otherwise ``None``. Raises ------ KeyError If any element of ``keys`` is not in ``adata.var_names``, if ``group_by`` is not in ``adata.obs``, or if ``layer`` is not in ``adata.layers``. ValueError If ``keys`` is empty, if ``order`` contains values not present in the ``group_by`` column, if ``log_transform`` is not positive or equals 1, or if ``group_by`` is ``None`` and ``order`` is provided. Examples -------- Single variable, grouped by condition: >>> import proteopy as pr >>> pr.pl.box(adata, keys="ProteinA", group_by="condition") Multiple variables saved to PDF: >>> pr.pl.box( ... adata, ... keys=["ProteinA", "ProteinB"], ... group_by="condition", ... save="boxplots.pdf", ... ) """ check_proteodata(adata) if isinstance(keys, str): keys = [keys] if not keys: raise ValueError("keys must contain at least one variable.") # Validate keys exist in var_names missing_keys = [k for k in keys if k not in adata.var_names] if missing_keys: raise KeyError( f"Keys not found in adata.var_names: {missing_keys}" ) # Validate group_by if group_by is not None and group_by not in adata.obs.columns: raise KeyError( f"Column '{group_by}' not found in adata.obs." ) if order is not None and group_by is None: raise ValueError( "order can only be used when group_by is provided." ) # Validate layer if layer is not None and layer not in adata.layers: raise KeyError( f"Layer '{layer}' not found in adata.layers." ) # Validate zero_to_na and fill_na if zero_to_na and fill_na is not None: raise ValueError("zero_to_na and fill_na are mutually exclusive.") # Validate log_transform if log_transform is not None: if log_transform <= 0: raise ValueError("log_transform must be positive.") if log_transform == 1: raise ValueError("log_transform cannot be 1.") # Get intensity data if layer is not None: X = adata[:, keys].layers[layer] else: X = adata[:, keys].X if sparse.issparse(X): X = X.toarray() X = np.asarray(X, dtype=float) # Apply zero_to_na and fill_na if zero_to_na: X[X == 0] = np.nan if fill_na is not None: X[np.isnan(X)] = float(fill_na) # Apply log transform if log_transform is not None: if (X < 0).any(): warnings.warn( "Data contains negative values. Log transform will produce " "NaN for values < -1.", RuntimeWarning, ) log_base = float(log_transform) with np.errstate(invalid='ignore'): X = np.log1p(X) / np.log(log_base) # Build long-format DataFrame df = pd.DataFrame( X, index=adata.obs_names, columns=keys, ) if group_by is not None: df[group_by] = adata.obs[group_by].values id_vars = [group_by] else: df["_group"] = "all" id_vars = ["_group"] df_long = df.melt( id_vars=id_vars, var_name="variable", value_name="intensity", ) # Determine group order and optionally filter groups hue_col = group_by if group_by is not None else "_group" if order is not None: available_groups = set(df_long[hue_col].unique()) missing_in_data = set(order) - available_groups if missing_in_data: raise ValueError( f"Groups in 'order' not found in data: " f"{sorted(missing_in_data)}. " f"Available groups: {sorted(available_groups)}" ) df_long = df_long[df_long[hue_col].isin(order)] group_order = list(order) else: if is_categorical_dtype(df_long[hue_col]): group_order = [ cat for cat in df_long[hue_col].cat.categories if cat in df_long[hue_col].values ] else: group_order = sorted(df_long[hue_col].dropna().unique()) # Filter out rows with NaN groups df_long = df_long[df_long[hue_col].isin(group_order)] # Resolve color scheme palette = None if color_scheme is not None: colors = _resolve_color_scheme(color_scheme, group_order) if colors: palette = dict(zip(group_order, colors)) # Determine ylabel if ylabel is None: if log_transform is not None: base_str = f"{float(log_transform):g}" ylabel = f"Intensity (log{base_str}(x + 1))" else: ylabel = "Intensity" multi = len(keys) > 1 axes_out = [] pdf_pages = None if save and multi: save_path = ( str(save) if str(save).endswith('.pdf') else f'{save}.pdf' ) pdf_pages = PdfPages(save_path) for key in keys: sub = df_long[df_long["variable"] == key] fig, _ax = plt.subplots(figsize=figsize) if group_by is not None: sns.boxplot( data=sub, x=hue_col, y="intensity", hue=hue_col, order=group_order, hue_order=group_order, palette=palette, legend=False, flierprops={'marker': '.', 'markersize': 1}, ax=_ax, ) if show_points: sns.stripplot( data=sub, x=hue_col, y="intensity", order=group_order, color="black", alpha=point_alpha, size=point_size, jitter=0.2, dodge=False, ax=_ax, ) _ax.set_xlabel(group_by) else: sns.boxplot( data=sub, y="intensity", flierprops={'marker': '.', 'markersize': 1}, ax=_ax, ) if show_points: sns.stripplot( data=sub, y="intensity", color="black", alpha=point_alpha, size=point_size, jitter=0.2, ax=_ax, ) _ax.set_xlabel("") _ax.set_xticks([]) _ax.set_ylabel(ylabel) ha = "center" if xlabel_rotation == 0 else "right" plt.setp( _ax.get_xticklabels(), rotation=xlabel_rotation, ha=ha, ) _ax.set_title(title if title is not None else key) plt.tight_layout() if ax: axes_out.append(_ax) if show: plt.show() elif save: if multi: pdf_pages.savefig(fig, bbox_inches='tight') else: fig.savefig(save, dpi=300, bbox_inches='tight') if show: plt.show() plt.close(fig) elif show: plt.show() plt.close(fig) else: plt.close(fig) if pdf_pages is not None: pdf_pages.close() if ax: return axes_out[0] if len(axes_out) == 1 else axes_out return None
[docs] def binary_heatmap( adata: ad.AnnData, layer: str | None = None, threshold: float | int = 0, fill_na: float | int | None = None, order_by: str | None = None, order: Sequence[Any] | None = None, col_cluster: bool = False, row_cluster: bool = False, ylabels: bool = False, var_id_key: str | None = None, cmap: Any | None = "coolwarm", figsize: tuple[float, float] = (10, 8), xtick_rotation: float = 90, ytick_rotation: float = 0, ylabel: str | None = None, title: str | None = None, show: bool = True, save: str | os.PathLike[str] | None = None, ax: bool = False, ) -> Axes | None: """ Plot a binary detection heatmap of intensities across samples and features. Values greater than ``threshold`` are encoded as 1 (present) and values less than or equal to ``threshold`` are encoded as 0 (absent). Missing values are not allowed and trigger a ``ValueError``. Parameters ---------- adata : AnnData Proteomics :class:`~anndata.AnnData`. layer : str, optional Key in ``adata.layers`` to plot. When ``None``, use ``adata.X``. threshold : float | int, optional Values strictly greater than this are considered present (1); values less than or equal become 0. fill_na : float | int | None, optional Replace missing values with this constant before binarization. When ``None`` (default), missing values raise a ``ValueError``. order_by : str, optional Column in ``adata.obs`` used to group and order observations (samples). When ``None``, observations follow ``adata.obs_names`` order. order : Sequence[Any], optional Explicit order of categories when ``order_by`` is set. When ``None`` and ``order_by`` is categorical, uses category order; otherwise uses sorted order. col_cluster : bool, optional Perform hierarchical clustering on samples (columns/x-axis) and display the dendrogram when ``True``. Mutually exclusive with ``order_by``. row_cluster : bool, optional Perform hierarchical clustering on features (rows/y-axis) and display the dendrogram when ``True``. ylabels : bool, optional Display y-axis tick labels and ticks when ``True``. var_id_key : str | None, optional Column in ``adata.var`` used as row labels. When ``None`` (default), the key is derived automatically from the proteodata level: ``"peptide_id"`` for peptide-level data and ``"protein_id"`` for protein-level data. Falls back to ``adata.var_names`` when the derived key is absent. cmap : Any | None, optional Colormap passed directly to :func:`seaborn.clustermap`. Accepts any value recognised by Matplotlib (name string, ``Colormap`` instance, etc.). Defaults to ``\"coolwarm\"``. figsize : tuple of float, optional Figure size passed to Matplotlib. xtick_rotation : float, optional Rotation angle for sample tick labels. ytick_rotation : float, optional Rotation angle for feature tick labels. ylabel : str | None, optional Label for the heatmap y-axis. When ``None`` (default), the label is derived automatically from the proteodata level (``"Peptide"`` or ``"Protein"``). Pass ``""`` to hide the label entirely. title : str | None, optional Optional title placed above the heatmap. show : bool, optional If True, call :func:`matplotlib.pyplot.show` after plotting. save : str | PathLike, optional Path to save the figure. ``None`` skips saving. ax : bool, optional When ``True``, return the axes (heatmap) or ClusterGrid (clustermap) object instead of closing it. Returns ------- Axes | None Handle to the created plot object when ``ax`` is ``True``; otherwise ``None``. Examples -------- Basic binary detection heatmap from ``adata.X``: >>> import proteopy as pr >>> adata = pr.datasets.karayel_2020() >>> pr.pl.binary_heatmap(adata, threshold=0) Order samples by an observation annotation and customize labels: >>> pr.pl.binary_heatmap( ... adata, ... order_by="cell_type", ... order=["Progenitor", "ProE&EBaso", "LBaso", "Poly", "Ortho"], ... row_cluster=True, ... fill_na=0, ... ylabel="Peptides", ... xtick_rotation=45, ... ) """ check_proteodata(adata) _, _level = is_proteodata(adata) # Derive var_id_key from proteodata level when not explicitly provided if var_id_key is None: if _level == "peptide": var_id_key = "peptide_id" elif _level == "protein": var_id_key = "protein_id" # Derive ylabel from proteodata level when not explicitly provided if ylabel is None: ylabel = _level.capitalize() if _level else "" # Validate input and parameters if isinstance(threshold, bool) or not isinstance(threshold, Real): raise TypeError("`threshold` must be a numeric scalar.") if not np.isfinite(threshold): raise ValueError("`threshold` must be finite.") if layer is not None and not isinstance(layer, str): raise TypeError("`layer` must be a string or None.") if fill_na is not None: if isinstance(fill_na, bool): raise TypeError("`fill_na` must be numeric, not boolean.") if not isinstance(fill_na, Real): raise TypeError("`fill_na` must be a numeric scalar.") if not np.isfinite(fill_na): raise ValueError("`fill_na` must be finite.") if title is not None and not isinstance(title, str): raise TypeError("`title` must be a string or None.") if ylabel is not None and not isinstance(ylabel, str): raise TypeError("`ylabel` must be a string or None.") if var_id_key is not None and not isinstance(var_id_key, str): raise TypeError("`var_id_key` must be a string or None.") if not isinstance(show, bool): raise TypeError("`show` must be a boolean.") if not isinstance(ax, bool): raise TypeError("`ax` must be a boolean.") if not isinstance(col_cluster, bool): raise TypeError("`col_cluster` must be a boolean.") if not isinstance(row_cluster, bool): raise TypeError("`row_cluster` must be a boolean.") if not isinstance(ylabels, bool): raise TypeError("`ylabels` must be a boolean.") if order_by is not None and not isinstance(order_by, str): raise TypeError("`order_by` must be a string or None.") if order is not None: if not isinstance(order, SequenceABC): raise TypeError("`order` must be a sequence or None.") if isinstance(order, (str, bytes)): raise TypeError( "`order` must be a sequence of category labels, not a string." ) if order_by is not None and col_cluster: raise ValueError( "`order_by` and `col_cluster` are mutually exclusive; " "choose one ordering method." ) if not isinstance(xtick_rotation, Real) or isinstance( xtick_rotation, bool, ): raise TypeError("`xtick_rotation` must be numeric.") if not isinstance(ytick_rotation, Real) or isinstance( ytick_rotation, bool, ): raise TypeError("`ytick_rotation` must be numeric.") if figsize is None or not isinstance(figsize, SequenceABC): raise TypeError("`figsize` must be a length-2 sequence of numbers.") if len(figsize) != 2: raise TypeError("`figsize` must be a length-2 sequence of numbers.") try: figsize = (float(figsize[0]), float(figsize[1])) except (TypeError, ValueError) as exc: raise TypeError("`figsize` entries must be numeric.") from exc if save is not None and not isinstance(save, (str, os.PathLike)): raise TypeError("`save` must be a path-like string or None.") # Get data if layer is None: X_src = adata.X else: if layer not in adata.layers: raise KeyError(f"Layer '{layer}' not found in adata.layers.") X_src = adata.layers[layer] if sparse.issparse(X_src): X = X_src.toarray() else: X = np.asarray(X_src, dtype=float) # Handle order_by and order for observation ordering if order_by is not None: if order_by not in adata.obs.columns: raise KeyError(f"'{order_by}' not found in adata.obs columns.") obs_col = adata.obs[order_by] # Determine the ordering of categories if pd.api.types.is_categorical_dtype(obs_col): if order is not None: # Validate that all order items are in the categories available_cats = set(obs_col.cat.categories) missing = set(order) - available_cats if missing: raise ValueError( f"Items in 'order' not found in '{order_by}' " f"categories: {sorted(missing)}" ) cats_ordered = list(order) else: # Use categorical order cats_ordered = list(obs_col.cat.categories) else: # Non-categorical column unique_vals = pd.Series(obs_col.unique()).dropna() try: default_order = sorted(unique_vals) except TypeError: default_order = unique_vals.tolist() if order is not None: # Validate that all order items are in the unique values available_vals = set(unique_vals) missing = set(order) - available_vals if missing: raise ValueError( f"Items in 'order' not found in '{order_by}' " f"values: {sorted(missing)}" ) cats_ordered = list(order) else: # Use sorted order cats_ordered = default_order # Build observation index order based on categories obs_index_ordered = [] for cat in cats_ordered: indices = adata.obs.index[obs_col == cat].tolist() obs_index_ordered.extend(indices) # Reorder the matrix and obs_names idx_map = {name: i for i, name in enumerate(adata.obs_names)} reorder_indices = [idx_map[name] for name in obs_index_ordered] X = X[reorder_indices, :] obs_names_ordered = obs_index_ordered else: obs_names_ordered = adata.obs_names # Binary heatmap expects complete data unless fill_na if fill_na is not None: X = np.where(np.isnan(X), float(fill_na), X) elif np.isnan(X).any(): raise ValueError( "Missing values detected; binary_heatmap requires complete data." ) # Convert numbers to binary mask binary = (X > threshold).astype(int) # pick row labels from var_id_key if available, otherwise use var_names if var_id_key is not None and var_id_key in adata.var.columns: row_index = adata.var[var_id_key] else: row_index = adata.var_names # create DataFrame oriented for seaborn (features as rows) plot_df = pd.DataFrame( binary.T, index=row_index, columns=obs_names_ordered, ) # colorbar configuration (binary heatmap uses only 0/1 ticks) cbar_kws = { "ticks": [0, 1], "orientation": "horizontal", } # resolve colormap, falling back to default cmap_resolved = cmap if cmap is not None else "coolwarm" if row_cluster or col_cluster: # estimate recursion limit required for computing dendrograms cluster_sizes = [] if row_cluster: cluster_sizes.append(plot_df.shape[0]) if col_cluster: cluster_sizes.append(plot_df.shape[1]) current_limit = sys.getrecursionlimit() target_limit = max( current_limit, int(4 * max(cluster_sizes) + 1000), ) else: current_limit = None target_limit = None try: if target_limit is not None and target_limit > current_limit: sys.setrecursionlimit(target_limit) # draw the clustermap / heatmap grid = sns.clustermap( plot_df, row_cluster=row_cluster, col_cluster=col_cluster, cmap=cmap_resolved, vmin=0, vmax=1, figsize=figsize, cbar_kws=cbar_kws, tree_kws={'linewidths': 1}, ) grid.fig.subplots_adjust(right=0.70, bottom=0.12) grid.ax_cbar.set_position([0.75, 0.04, 0.18, 0.03]) finally: if target_limit is not None and current_limit is not None: if sys.getrecursionlimit() != current_limit: sys.setrecursionlimit(current_limit) grid.ax_heatmap.set_ylabel(ylabel) if title: grid.ax_heatmap.set_title(title) plt.setp( grid.ax_heatmap.get_xticklabels(), rotation=xtick_rotation, ha="right" ) if ylabels: n_rows = plot_df.shape[0] if row_cluster and grid.dendrogram_row is not None: ordered_labels = [ plot_df.index[i] for i in grid.dendrogram_row.reordered_ind ] else: ordered_labels = list(plot_df.index) # Force a tick for every row; seaborn places centres at i + 0.5 grid.ax_heatmap.set_yticks([i + 0.5 for i in range(n_rows)]) grid.ax_heatmap.set_yticklabels( ordered_labels, rotation=ytick_rotation, ha="left", ) else: grid.ax_heatmap.set_yticks([]) grid.ax_heatmap.set_yticklabels([]) if save is not None: grid.fig.savefig(save, dpi=300, bbox_inches="tight") if show: plt.show() elif not ax: plt.close(grid.fig) if ax: return grid.ax_heatmap return None