Source code for proteopy.tl.copf

import itertools
import copy as copym
import numpy as np
import pandas as pd
import anndata as ad
from scipy import stats
from sklearn.cluster import AgglomerativeClustering
from scipy.stats import norm
from statsmodels.stats.multitest import multipletests

from proteopy.utils.copf import reconstruct_corrs_df_symmetric_from_long_df
from proteopy.utils.data_structures import BinaryClusterTree

NOISE = 1e6


def pairwise_peptide_correlations_(
    df,
    sample_column="filename",
    peptide_column="peptide_id",
    value_column="intensity",
    ):
    '''
    Calculate pairwise peptide correlations.
    Only outputs unique (non-symmetrical) correlations.

    Parameters:
    - df (pandas.DataFrame): The input DataFrame containing the data.
    - sample_column (str): The name of the column in `df` representing the samples.
    - peptide_column (str): The name of the column in `df` representing the peptides.
    - value_column (str): The name of the column in `df` representing the values.

    Returns:
    - result (pandas.DataFrame): A DataFrame containing the pairwise peptide
        correlations. Columns: 'pepA', 'pepB', 'PCC' (Pearson correlation coefficient).
        Only outputs unique (non-symmetrical) correlations (AB, not AB, B-A, AA, BB).
    '''

    # TODO: modify df input to be obs x vars. Here we have redundant steps with
    # AnnDataTrces pairwise_peptide_correlations()
    df = df[[sample_column, peptide_column, value_column]]

    pivot_df = df.pivot_table(index=sample_column, columns=peptide_column, values=value_column)
    columns = pivot_df.columns.tolist()

    corr_dict = {}

    for col_a, col_b in itertools.combinations(columns, 2):

        pivot_col_a = pivot_df.loc[:, col_a]
        pivot_col_b = pivot_df.loc[:, col_b]
        corr_dict[col_a + '_' + col_b] = stats.pearsonr(pivot_col_a, pivot_col_b)

    corr_df = pd.DataFrame.from_dict(corr_dict, orient='index')
    corr_df.columns = ['PCC', 'p-value']
    corr_df['peptide_pair'] = corr_df.index
    corr_df[['pepA', 'pepB']] = corr_df['peptide_pair'].str.split('_', expand=True)
    corr_df = corr_df[["pepA","pepB","PCC"]]
    corr_df = corr_df.reset_index(drop=True)

    return corr_df


[docs] def pairwise_peptide_correlations( adata, protein_id='protein_id', inplace=True, copy=False, batch_key: str | None = None, # per-batch if provided → always pooled min_contrib_batches: int = 1, # pooling threshold min_wsum: float = 0.0, # pooling threshold on sum(n_b-3) ): if inplace and copy: raise ValueError('Arguments raise and copy are mutually exclusive') if protein_id not in adata.var.columns: raise ValueError(f'protein_id: {protein_id} not in .var.columns') STORE_KEY = "pairwise_peptide_correlations" PER_BATCH_STORE_KEY = "pairwise_peptide_correlations_by_batch" def _finalize(out, per_batch=None): if copy: adata_new = adata.copy() adata_new.uns[STORE_KEY] = out if per_batch is not None: adata_new.uns[PER_BATCH_STORE_KEY] = per_batch return adata_new if inplace: adata.uns[STORE_KEY] = out if per_batch is not None: adata.uns[PER_BATCH_STORE_KEY] = per_batch return return out def compute_corrs(df): corrs = pairwise_peptide_correlations_( df, sample_column='obs_id', peptide_column='var_id', value_column='intensity') return corrs anns = adata.var[['protein_id']].reset_index() traces_df = adata.to_df().T.reset_index() traces_df = traces_df.merge(anns, on='index') traces_df = traces_df.rename(columns={'index': 'var_id'}) # TODO: remove unnecessary step of melting which gets unmelted # in protein-level function traces_df = pd.melt( traces_df, id_vars=['protein_id', 'var_id'], var_name='obs_id', value_name='intensity') if batch_key is None: corrs = traces_df.groupby('protein_id', observed=True).apply(compute_corrs, include_groups=False) corrs = corrs.droplevel(1, axis=0) corrs = corrs.sort_values(['pepA', 'pepB']).sort_index() return _finalize(corrs) if batch_key not in adata.obs.columns: raise ValueError(f"batch_key '{batch_key}' not found in .obs.columns") batches = ( adata.obs[[batch_key]] .reset_index() .rename(columns={'index': 'obs_id', batch_key: 'batch_id'}) ) long = traces_df.merge(batches, on='obs_id', how='left') batch_sizes = adata.obs[batch_key].value_counts().to_dict() batch_weights = {b: max(n - 3.0, 0.0) for b, n in batch_sizes.items()} per_batch = ( long .groupby([protein_id, 'batch_id'], observed=True) .apply(compute_corrs, include_groups=False) ) if per_batch.empty: per_batch_df = pd.DataFrame(columns=['pepA', 'pepB', 'PCC']) per_batch_df.index = pd.MultiIndex.from_tuples([], names=[protein_id, 'batch_id']) else: per_batch_df = ( per_batch .reset_index(level=2, drop=True) .sort_values(['pepA', 'pepB']) .sort_index() ) # Fisher pooling across batches rows = [] for prot, gprot in per_batch_df.reset_index().groupby(protein_id, observed=True, sort=False): for (pa, pb), gp in gprot.groupby(['pepA', 'pepB'], observed=True, sort=False): r = gp['PCC'].to_numpy(dtype=float) bids = gp['batch_id'].to_numpy() r = np.clip(r, -0.999999, 0.999999) z = np.arctanh(r) w = np.array([batch_weights.get(b, 0.0) for b in bids], dtype=float) mask = w > 0 if not np.any(mask): continue w = w[mask]; z = z[mask] wsum = float(w.sum()) if (mask.sum() >= min_contrib_batches) and (wsum >= min_wsum): # Fixed-effects mean (zbar_fe) and weighted between-batch variance (var_z_between) zbar_fe = float((w * z).sum() / wsum) Q = float((w * (z - zbar_fe) ** 2).sum()) var_z_between = Q / wsum # Conservative PCC from fixed-effects mean (no DL): shift by var_z_between rhat = float(np.tanh(zbar_fe - var_z_between)) rows.append((prot, pa, pb, rhat, var_z_between)) if rows: pooled_df = ( pd.DataFrame(rows, columns=[protein_id, 'pepA', 'pepB', 'PCC', 'var_z_between']) .set_index(protein_id) .sort_values(['pepA', 'pepB']) .sort_index() ) else: pooled_df = pd.DataFrame(columns=['pepA', 'pepB', 'PCC', 'var_z_between']) pooled_df.index.name = protein_id return _finalize(pooled_df, per_batch=per_batch_df)
def peptide_dendograms_by_correlation_( df, method: str = 'agglomerative-hierarchical-clustering', ): ''' Perform peptide clustering grouped by protein annotation. Parameters: ---------- df : pandas.DataFrame Data frame with pairwise correlations annotated with the protein they belong to.] method : str Which clustering method to apply. Returns: ------- dict Dictionary with clustering method output. - 'agglomerative-hierarchical-clustering' => {protein_id: {'labels': list, 'height': list, 'merge': list(list)}} - labels: list of peptides - merge: steps in which different peptides are merged. n_steps == n_samples - 1 The two ids included for every step represent the index of the peptide in 'labels'. - heights: The height of each merging step in 'merge'. The idx of the height corresponds to the index of the step in 'merge'. ''' assert all(df.index == df.columns) model = AgglomerativeClustering(n_clusters=None, metric='precomputed', linkage='average', distance_threshold=0, compute_distances=True) model.fit(df) # pylint: disable=no-member dendogram = { 'type': 'sklearn_agglomerative_clustering', 'labels': model.feature_names_in_.tolist(), 'heights': model.distances_.tolist(), 'merge': model.children_.tolist() } # pylint: enable=no-member return dendogram
[docs] def peptide_dendograms_by_correlation( adata, method='agglomerative-hierarchical-clustering', inplace=True, copy=False, ): if inplace and copy: raise ValueError('Arguments raise and copy are mutually exclusive') if 'pairwise_peptide_correlations' not in adata.uns: raise ValueError(f'pairwise_peptide_correlations not in .uns') corrs = adata.uns['pairwise_peptide_correlations'].copy() dends = {} for protein_id, df in corrs.groupby('protein_id', observed=True): corr_sym = reconstruct_corrs_df_symmetric_from_long_df( df, var_a_col='pepA', var_b_col='pepB', corr_col='PCC') corr_dists = 1 - corr_sym dends[protein_id] = peptide_dendograms_by_correlation_( corr_dists, method= 'agglomerative-hierarchical-clustering') if inplace: adata.uns['dendograms'] = dends elif copy: adata_new = adata.copy() adata_new.uns['dendograms'] = dends return adata_new else: return dends
def peptide_clusters_from_dendograms_( dendogram, n_clusters=2, min_peptides_per_cluster=2, noise=1e6, ): ''' Cut clusters from cluster_peptides into N clusters with more than 1 peptide. ''' n_peptides = len(dendogram['labels']) n_real_clusters = 0 k = n_clusters cluster_tree = BinaryClusterTree(constructor=dendogram) while n_real_clusters < n_clusters: clusters = cluster_tree.cut(k, use_labels=True) n_per_cluster = clusters.value_counts() is_multipep = n_per_cluster >= min_peptides_per_cluster n_real_clusters = is_multipep.sum() k += 1 single_pep_clusters = n_per_cluster[~is_multipep].index clusters[clusters.isin(single_pep_clusters)] = noise if k >= n_peptides: clusters[:] = noise break # Rename cluster_ids to systematic format max_cluster = clusters.max() cats = clusters.astype('category').cat.categories n_clusters = len(cats) if max_cluster != n_clusters: for i in range(n_clusters): clusters[clusters == cats[i]] = i if noise in cats: clusters[clusters == max(clusters)] = noise return clusters
[docs] def peptide_clusters_from_dendograms( adata, n_clusters=2, min_peptides_per_cluster=2, noise=NOISE, inplace=True, copy=False, ): if inplace and copy: raise ValueError('Arguments raise and copy are mutually exclusive') if 'dendograms' not in adata.uns: raise ValueError(f'dendograms not in .uns') var = adata.var.copy() var['cluster_id'] = np.nan clusters_ann = {} dends = adata.uns['dendograms'] for prot, dend in dends.items(): dend_upd = copym.deepcopy(dend) dend_upd['type'] = 'sklearn_agglomerative_clustering' clusters = peptide_clusters_from_dendograms_( dend_upd, n_clusters=2, min_peptides_per_cluster=2, noise=noise) mask = (var['protein_id'] == prot) & (var.index.isin(clusters.index)) var.loc[mask, 'cluster_id'] = clusters.reindex(var.index[mask]) clusters_ann[prot] = clusters assert not any((var['cluster_id'] == -1).tolist()) var['proteoform_id'] = ( var['protein_id'].astype(str) + '_' + var['cluster_id'].astype(int).astype(str) ) if inplace: adata.uns['clusters'] = clusters_ann adata.var = var elif copy: adata_new = adata.copy() adata_new.uns['clusters'] = clusters_ann return adata_new else: return clusters_ann
def proteoform_scores_( corrs, clusters, n_fractions, summary_func=np.mean, noise=NOISE ): ''' Calculates a score for proteoforms based on the difference of within cluster distances and between cluster distances. IMPORTANT: currently only implemented properly for n_clusters = 2 Args: corrs (pd.DataFrame): correlation between peptides. In symmetrical matrix form (index == columns) clusters (pd.Series | pd.DataFrame): vector of cluster_ids with indexes corresponding to the peptides for a specific protein. n_fractions (int): Number of samples. summary_func (Callable): Summary function to apply to intra- and inter- cluster correlation coefficients. ''' def replace_upper_triangle(df, replacement, k=0): arr = df.to_numpy().astype(float) rows, cols = np.triu_indices_from(arr, k=k) arr[rows, cols] = replacement new_df = pd.DataFrame(arr, columns=df.columns, index=df.index) return new_df if isinstance(clusters, pd.DataFrame): clusters = clusters['cluster'] if np.issubdtype(clusters.dtype, np.floating): clusters = clusters.astype(int) assert any(corrs.index == corrs.columns) assert all([i in clusters.index for i in corrs.index]), \ f'clusters.index = {clusters.index}' \ f'\ncorrs_index = {corrs.index}' if (clusters == noise).all().all(): return np.array([0, np.nan, np.nan, np.nan]) cluster_ids = clusters.unique() cluster_ids = cluster_ids[cluster_ids != noise].tolist() if len(cluster_ids) > 2: raise ValueError('Functionality with n_clusters > 2 not implemented yet.') mat = corrs.copy(deep=True) stat_v = [] for c in cluster_ids: cluster_ids_inv = cluster_ids[cluster_ids != c] clust1_ids = clusters[clusters == cluster_ids_inv[0]] clust2_ids = clusters[clusters == cluster_ids_inv[1]] clust_ids_ord = clust1_ids + clust2_ids mat_inv = corrs.loc[clust_ids_ord, clust_ids_ord] cross = mat_inv.loc[clust1_ids, clust2_ids] # QUESTION: why no diagonal removal as below? values = cross.to_numpy().flatten() values = values[~np.isnan(values)] stat_across = np.apply_along_axis(summary_func, 0, cross) rows, cols = np.triu_indices_from(mat_inv, k=0) # k=1 excludes diagonal mat_inv.to_numpy()[rows, cols] = np.nan within_c1 = mat_inv.loc[clust1_ids, clust1_ids] values = within_c1.to_numpy().flatten() values = values[~np.isnan(values)] stat_within_c1 = np.apply_along_axis(summary_func, 0, values) within_c2 = mat_inv.loc[clust2_ids, clust2_ids] values = within_c2.to_numpy().flatten() values = values[~np.isnan(values)] stat_within_c2 = np.apply_along_axis(summary_func, 0, values) stat_within = min([stat_within_c1, stat_within_c2]) diff_stat = stat_within - stat_across z_stat_within = np.atanh(stat_within) z_stat_across = np.atanh(stat_across) z_diff_stat = z_stat_within - z_stat_across dz = z_diff_stat / (np.sqrt((1 / (n_fractions-3)) + (1 / (n_fractions-3)))) pval = 2 * (1 - norm.cdf(np.abs(dz))) stat_v.append([diff_stat, z_diff_stat, dz, pval]) diff_stats = [i[0] for i in stat_v] sel_min_diff = np.which(diff_stats == diff_stats.min(skip_na=True))[0] return stat_v[sel_min_diff] else: clust1_ids = clusters[clusters == cluster_ids[0]].index.to_list() clust2_ids = clusters[clusters == cluster_ids[1]].index.to_list() clust_ids_ord = clust1_ids + clust2_ids mat = corrs.loc[clust_ids_ord, clust_ids_ord] # Cross-cluster statistic cross = corrs.loc[clust1_ids, clust2_ids] values = cross.to_numpy().flatten() stat_across = np.apply_along_axis(summary_func, 0, values) mat = replace_upper_triangle(mat, np.nan, k=0) # Within cluster statistic within_c1 = mat.loc[clust1_ids, clust1_ids] wc1_values = within_c1.to_numpy().flatten() wc1_values = wc1_values[~np.isnan(wc1_values)] stat_within_c1 = np.apply_along_axis(summary_func, 0, wc1_values) within_c2 = mat.loc[clust2_ids, clust2_ids] wc2_values = within_c2.to_numpy().flatten() wc2_values = wc2_values[~np.isnan(wc2_values)] stat_within_c2 = np.apply_along_axis(summary_func, 0, wc2_values) stat_within = min([stat_within_c1, stat_within_c2]) diff_stat = stat_within - stat_across # Fisher's z-transformation to norm distr. and rationally scaled values z_stat_within = np.atanh(stat_within) z_stat_across = np.atanh(stat_across) z_diff_stat = z_stat_within - z_stat_across # T-test: intra-cluster peptide correlations are significantly different # from cross-cluster peptide correlations dz = z_diff_stat / np.sqrt((1 / (n_fractions-3)) + (1 / (n_fractions-3))) pval = 2 * (1 - norm.cdf(np.abs(dz))) return np.array([diff_stat, z_diff_stat, dz, pval])
[docs] def proteoform_scores( adata, alpha=None, summary_func=np.mean, noise=NOISE, inplace=True, copy=False, ): if inplace and copy: raise ValueError('Arguments raise and copy are mutually exclusive') if 'pairwise_peptide_correlations' not in adata.uns: raise ValueError(f'pairwise_peptide_correlations not in .uns') if 'dendograms' not in adata.uns: raise ValueError(f'dendograms not in .uns') columns = [ 'protein_id', 'proteoform_score', 'proteoform_score_z', 'proteoform_score_dz', 'proteoform_score_pval', ] corrs = adata.uns['pairwise_peptide_correlations'].copy().reset_index() # pylint: disable=access-member-before-definition var = adata.var # pylint: enable=access-member-before-definition n_fractions = adata.n_obs proteoform_scores_list = [] for prot, corrs_prot in corrs.groupby('protein_id', observed=True): corrs_mat = reconstruct_corrs_df_symmetric_from_long_df( corrs_prot, var_a_col='pepA', var_b_col='pepB', corr_col='PCC') clusters = var.loc[var['protein_id'] == prot, 'cluster_id'] scores = proteoform_scores_( corrs_mat, clusters, n_fractions, summary_func=np.mean) scores_entry = {column:value for column, value in zip(columns[1:5], scores)} scores_entry['protein_id'] = prot scores_entry = pd.DataFrame([scores_entry]) proteoform_scores_list.append(scores_entry) proteoform_scores = pd.concat(proteoform_scores_list, ignore_index=True) proteoform_scores = proteoform_scores[columns] # Perform multiple-testing correction if not alpha: alpha = 0.05 # Just placeholder mask_nonan = proteoform_scores['proteoform_score_pval'].notna() pvals = proteoform_scores.loc[mask_nonan, 'proteoform_score_pval'] rejected, corrected_pvals, _, _ = multipletests(pvals, alpha=alpha, method='fdr_bh') proteoform_scores['proteoform_score_pval_adj'] = np.nan proteoform_scores['is_proteoform'] = np.nan proteoform_scores.loc[pvals.index, 'proteoform_score_pval_adj'] = corrected_pvals if alpha: proteoform_scores.loc[pvals.index, 'is_proteoform'] = rejected.astype(int) # --- drop existing score columns before merge (safe for re-runs) --- score_cols = [ 'proteoform_score', 'proteoform_score_z', 'proteoform_score_dz', 'proteoform_score_pval', 'proteoform_score_pval_adj', 'is_proteoform', ] var = var.drop(columns=[c for c in score_cols if c in var.columns]) # Add all new scores to .var var_upd = pd.merge( var, proteoform_scores, on='protein_id', how='left', validate='many_to_one') var_upd = var_upd.set_index('peptide_id', drop=False) var_upd.index.name = None assert (var.index == var_upd.index).all() if inplace: adata.var = var_upd elif copy: adata_new = adata.copy() adata_new.var = var_upd return adata_new else: return proteoform_scores