Source code for proteopy.read.diann

import re
import warnings
import gc
import pandas as pd
import anndata as ad
import matplotlib.pyplot as plt
import seaborn as sns



[docs]
def diann(
    diann_output_path,
    aggr_level,
    precursor_pval_max,
    gene_pval_max,
    global_precursor_pval_max,
    show_input_stats = False,
    run_parser = None,
    fill_na = None,
    ):
    '''dia-NN input reader

    Load dia-nn output to anndata format.
    Notes:
        Filters for only proteotypic precursors - peptide maps to a single
        Protein.Ids.
    '''

    # Check args
    aggr_level_options = [
        'Stripped.Sequence',
        'Modified.Sequence',
        'Precursor.Id',
        ]

    if aggr_level not in aggr_level_options:
        raise ValueError((
            f'Wrong option passsed to aggr_level argument: {aggr_level}.'
            ))

    if run_parser is not None and not callable(run_parser):
        raise ValueError((
            'run_parser arg must either be a function or None.'
            ))

    base_required_cols = {
        'Run',
        'Proteotypic',
        'Protein.Ids',
        'Precursor.Quantity',
        'Protein.Q.Value',
        'Global.Q.Value',
        'Q.Value',
        'Protein.Group',
        'Genes',
        'Protein.Names',
        'Stripped.Sequence',
        }

    required_cols = set(base_required_cols)
    required_cols.add(aggr_level)

    if aggr_level == 'Precursor.Id':
        required_cols.update({'Modified.Sequence', 'Precursor.Charge'})
    if aggr_level == 'Modified.Sequence':
        required_cols.add('Modified.Sequence')

    header = pd.read_csv(diann_output_path, sep='\t', nrows=0)
    missing_cols = sorted(required_cols - set(header.columns))

    if missing_cols:
        missing_str = ', '.join(missing_cols)
        raise ValueError(
            f'Missing required columns in DIA-NN output: {missing_str}.'
        )

    data = pd.read_csv(
        diann_output_path,
        sep='\t',
        header=0,
        usecols=sorted(required_cols),
    )

    if run_parser:
        data['Run'] = data['Run'].apply(run_parser)

    if show_input_stats:
        print('Before Q-value and proteotypicity filtering\n------')
        proteotypic_fraction = (data['Proteotypic'] == 1).sum() / len(data)
        print(f'Proteotypic peptide fraction: {proteotypic_fraction:.2f}')

        multimapper_fraction = (
            (data['Protein.Ids'].str.split(';').apply(len) == 1)
            .sum() / len(data)
            )
        print(f'Multimapper peptide fraction: {multimapper_fraction:.2f}')

        # Q value distr. plots 
        fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(16,4))
        plt.subplots_adjust(wspace=0.3)

        sns.histplot(data['Q.Value'], bins=100, ax=axes[0])
        axes[0].set_title('Q.Value distr.')

        if precursor_pval_max:
            axes[0].axvline(x=precursor_pval_max, color='red', linestyle='--', linewidth=2)

        sns.histplot(data['Global.Q.Value'], bins=100, ax=axes[1])
        axes[1].set_title('Gobal.Q.Value distr.')
        
        if global_precursor_pval_max:
            axes[1].axvline(x=global_precursor_pval_max, color='red', linestyle='--', linewidth=2)

        sns.histplot(data['Protein.Q.Value'], bins=100, ax=axes[2])
        axes[2].set_title('Protein.Q.Value distr.')

        if gene_pval_max:
            axes[2].axvline(x=gene_pval_max, color='red', linestyle='--', linewidth=2)
        plt.show()

        # Q values stats
        q_stats = data[['Q.Value', 'Protein.Q.Value', 'Global.Q.Value']].describe()
        print(q_stats)


    # Filter ds
    data_sub = data[
        (data['Proteotypic'] == 1)
        & (data['Protein.Ids'].str.split(';').apply(len).eq(1))
        ].copy()
    del data
    gc.collect()

    # ToDo: change to < instead of <=
    if precursor_pval_max:
        data_sub = data_sub[data_sub['Q.Value'] <= precursor_pval_max]
    if global_precursor_pval_max:
        data_sub = data_sub[data_sub['Global.Q.Value'] <= global_precursor_pval_max]
    if gene_pval_max:
        data_sub = data_sub[data_sub['Protein.Q.Value'] <= gene_pval_max]

    if len(data_sub) == 0:
        raise ValueError('Dataframe after filtering empty')

    if show_input_stats:
        # Q values stats
        q_stats = data_sub[['Q.Value', 'Protein.Q.Value', 'Global.Q.Value']].describe()
        print('\nAfter Q-value and proteotypicity filtering\n------')
        print(q_stats)

    # Check: how peptides map to proteins
    is_pep_multiprots = (
            data_sub.groupby([aggr_level, 'Run'], observed=True)
            ['Protein.Ids'].nunique() > 1
            )

    if is_pep_multiprots.any():
        raise ValueError(
            f'Peptides at aggregation level {aggr_level} map to multiple proteins. '
            'Not implemented yet.'
        )

    # Aggregate precursors
    data_cols = [
        'Run',
        aggr_level,
        'Protein.Ids',
        'Precursor.Quantity',
        ]

    precursor_data = data_sub[data_cols].copy()


    precursor_data_summed = (
        precursor_data.groupby([aggr_level,'Protein.Ids','Run'], observed=True) # In theory grouping by protein.ids not necessary
        ['Precursor.Quantity']
        .sum()
        .reset_index()
        )

    # Check: proteotypicity
    assert ((
        precursor_data_summed
        .groupby('Stripped.Sequence', observed=True)['Protein.Ids']
        .nunique().le(1).all()
        )), "Error: Some peptides map to multiple proteins!"

    X = pd.pivot(
        precursor_data_summed,
        index='Run',
        columns=aggr_level,
        values='Precursor.Quantity',
        )

    X = X.sort_index(axis=0).sort_index(axis=1)

    if fill_na is not None:
        X.fillna(fill_na, inplace=True)

    X.columns.name = None
    X.index.name = None

    del precursor_data
    gc.collect()

    # obs
    obs = pd.DataFrame({'run_id': X.index}, index = X.index)
    obs.index.name = None

    meta_cols = [
        aggr_level,
        'Protein.Ids',
        'Protein.Group',
        'Genes',
        'Protein.Names',
        ]

    if aggr_level == 'Modified.Sequence':
        meta_cols.append('Stripped.Sequence')

    if aggr_level == 'Precursor.Id':
        meta_cols.extend(['Stripped.Sequence', 'Modified.Sequence', 'Precursor.Charge'])

    # todo: add column for if aggr_level is stripped sequence or modified id, which
    # collects the combinations of precursor_ids and modified sequences that were
    # combined in aggregation

    precursor_meta = data_sub[meta_cols].copy()

    # Groups contain identical rows
    assert (
        precursor_meta.groupby(aggr_level, observed=True)
        .apply(lambda x: x.nunique().eq(1).all(), include_groups=False)
        .all()
        )

    var = precursor_meta.groupby(aggr_level, observed=True).first()
    var = var.loc[X.columns]
    var[aggr_level] = var.index
    var['peptide_id'] = var.index
    var.index.name = None

    del precursor_meta
    del data_sub
    gc.collect()

    adata = ad.AnnData(
        X = X,
        var = var,
        obs = obs,
        )

    adata.strings_to_categoricals()

    if len(adata.obs_names.unique()) < adata.n_obs:
        adata.obs_names_make_unique()
        warnings.warn(
            'Repeated obs names were present in the data. '
            'They were made unique by numbered suffixes.'
            )
    
    if len(adata.var_names.unique()) < adata.n_vars:
        adata.var_names_make_unique()
        warnings.warn(
            'Repeated var names were present in the data. '
            'They were made unique by numbered suffixes.'
            )

    return adata