Source code for proteopy.read.diann

import re
import warnings
import gc
import pandas as pd
import anndata as ad
import matplotlib.pyplot as plt
import seaborn as sns


[docs] def diann( diann_output_path, aggr_level, precursor_pval_max, gene_pval_max, global_precursor_pval_max, show_input_stats = False, run_parser = None, fill_na = None, ): '''dia-NN input reader Load dia-nn output to anndata format. Notes: Filters for only proteotypic precursors - peptide maps to a single Protein.Ids. ''' # Check args aggr_level_options = [ 'Stripped.Sequence', 'Modified.Sequence', 'Precursor.Id', ] if aggr_level not in aggr_level_options: raise ValueError(( f'Wrong option passsed to aggr_level argument: {aggr_level}.' )) if run_parser is not None and not callable(run_parser): raise ValueError(( 'run_parser arg must either be a function or None.' )) base_required_cols = { 'Run', 'Proteotypic', 'Protein.Ids', 'Precursor.Quantity', 'Protein.Q.Value', 'Global.Q.Value', 'Q.Value', 'Protein.Group', 'Genes', 'Protein.Names', 'Stripped.Sequence', } required_cols = set(base_required_cols) required_cols.add(aggr_level) if aggr_level == 'Precursor.Id': required_cols.update({'Modified.Sequence', 'Precursor.Charge'}) if aggr_level == 'Modified.Sequence': required_cols.add('Modified.Sequence') header = pd.read_csv(diann_output_path, sep='\t', nrows=0) missing_cols = sorted(required_cols - set(header.columns)) if missing_cols: missing_str = ', '.join(missing_cols) raise ValueError( f'Missing required columns in DIA-NN output: {missing_str}.' ) data = pd.read_csv( diann_output_path, sep='\t', header=0, usecols=sorted(required_cols), ) if run_parser: data['Run'] = data['Run'].apply(run_parser) if show_input_stats: print('Before Q-value and proteotypicity filtering\n------') proteotypic_fraction = (data['Proteotypic'] == 1).sum() / len(data) print(f'Proteotypic peptide fraction: {proteotypic_fraction:.2f}') multimapper_fraction = ( (data['Protein.Ids'].str.split(';').apply(len) == 1) .sum() / len(data) ) print(f'Multimapper peptide fraction: {multimapper_fraction:.2f}') # Q value distr. plots fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(16,4)) plt.subplots_adjust(wspace=0.3) sns.histplot(data['Q.Value'], bins=100, ax=axes[0]) axes[0].set_title('Q.Value distr.') if precursor_pval_max: axes[0].axvline(x=precursor_pval_max, color='red', linestyle='--', linewidth=2) sns.histplot(data['Global.Q.Value'], bins=100, ax=axes[1]) axes[1].set_title('Gobal.Q.Value distr.') if global_precursor_pval_max: axes[1].axvline(x=global_precursor_pval_max, color='red', linestyle='--', linewidth=2) sns.histplot(data['Protein.Q.Value'], bins=100, ax=axes[2]) axes[2].set_title('Protein.Q.Value distr.') if gene_pval_max: axes[2].axvline(x=gene_pval_max, color='red', linestyle='--', linewidth=2) plt.show() # Q values stats q_stats = data[['Q.Value', 'Protein.Q.Value', 'Global.Q.Value']].describe() print(q_stats) # Filter ds data_sub = data[ (data['Proteotypic'] == 1) & (data['Protein.Ids'].str.split(';').apply(len).eq(1)) ].copy() del data gc.collect() # ToDo: change to < instead of <= if precursor_pval_max: data_sub = data_sub[data_sub['Q.Value'] <= precursor_pval_max] if global_precursor_pval_max: data_sub = data_sub[data_sub['Global.Q.Value'] <= global_precursor_pval_max] if gene_pval_max: data_sub = data_sub[data_sub['Protein.Q.Value'] <= gene_pval_max] if len(data_sub) == 0: raise ValueError('Dataframe after filtering empty') if show_input_stats: # Q values stats q_stats = data_sub[['Q.Value', 'Protein.Q.Value', 'Global.Q.Value']].describe() print('\nAfter Q-value and proteotypicity filtering\n------') print(q_stats) # Check: how peptides map to proteins is_pep_multiprots = ( data_sub.groupby([aggr_level, 'Run'], observed=True) ['Protein.Ids'].nunique() > 1 ) if is_pep_multiprots.any(): raise ValueError( f'Peptides at aggregation level {aggr_level} map to multiple proteins. ' 'Not implemented yet.' ) # Aggregate precursors data_cols = [ 'Run', aggr_level, 'Protein.Ids', 'Precursor.Quantity', ] precursor_data = data_sub[data_cols].copy() precursor_data_summed = ( precursor_data.groupby([aggr_level,'Protein.Ids','Run'], observed=True) # In theory grouping by protein.ids not necessary ['Precursor.Quantity'] .sum() .reset_index() ) # Check: proteotypicity assert (( precursor_data_summed .groupby('Stripped.Sequence', observed=True)['Protein.Ids'] .nunique().le(1).all() )), "Error: Some peptides map to multiple proteins!" X = pd.pivot( precursor_data_summed, index='Run', columns=aggr_level, values='Precursor.Quantity', ) X = X.sort_index(axis=0).sort_index(axis=1) if fill_na is not None: X.fillna(fill_na, inplace=True) X.columns.name = None X.index.name = None del precursor_data gc.collect() # obs obs = pd.DataFrame({'run_id': X.index}, index = X.index) obs.index.name = None meta_cols = [ aggr_level, 'Protein.Ids', 'Protein.Group', 'Genes', 'Protein.Names', ] if aggr_level == 'Modified.Sequence': meta_cols.append('Stripped.Sequence') if aggr_level == 'Precursor.Id': meta_cols.extend(['Stripped.Sequence', 'Modified.Sequence', 'Precursor.Charge']) # todo: add column for if aggr_level is stripped sequence or modified id, which # collects the combinations of precursor_ids and modified sequences that were # combined in aggregation precursor_meta = data_sub[meta_cols].copy() # Groups contain identical rows assert ( precursor_meta.groupby(aggr_level, observed=True) .apply(lambda x: x.nunique().eq(1).all(), include_groups=False) .all() ) var = precursor_meta.groupby(aggr_level, observed=True).first() var = var.loc[X.columns] var[aggr_level] = var.index var['peptide_id'] = var.index var.index.name = None del precursor_meta del data_sub gc.collect() adata = ad.AnnData( X = X, var = var, obs = obs, ) adata.strings_to_categoricals() if len(adata.obs_names.unique()) < adata.n_obs: adata.obs_names_make_unique() warnings.warn( 'Repeated obs names were present in the data. ' 'They were made unique by numbered suffixes.' ) if len(adata.var_names.unique()) < adata.n_vars: adata.var_names_make_unique() warnings.warn( 'Repeated var names were present in the data. ' 'They were made unique by numbered suffixes.' ) return adata