Source code for proteopy.datasets.karayel_2020

"""Karayel 2020 human erythropoiesis proteomics dataset.

This module provides access to the protein-level DIA-MS proteomics dataset
from Karayel et al. (2020) studying dynamic phosphosignaling networks
during human erythropoiesis. The study quantified ~7,400 proteins from
CD34+ hematopoietic stem/progenitor cells (HSPCs) isolated from healthy
donors, across five differentiation stages of erythroid development.

Cells were FACS-sorted using CD235a, CD49d, and Band 3 surface markers.
The data is sourced from the PRIDE archive (`PXD017276
<https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD017276>`_)
and includes measurements from the following erythroid differentiation
stages:

- Progenitor: CFU-E progenitor cells (CD34+ HSPCs, negative fraction)
- ProE&EBaso: Proerythroblasts and early basophilic erythroblasts
- LBaso: Late basophilic erythroblasts
- Poly: Polychromatic erythroblasts
- Ortho: Orthochromatic erythroblasts

Reference
---------
Karayel et al. (2020) Integrative proteomics reveals principles of
dynamic phosphosignaling networks in human erythropoiesis.
Molecular Systems Biology 16: e9813.
DOI: 10.15252/msb.20209813
"""
import re
from pathlib import Path

import numpy as np
import pandas as pd
import pooch

import proteopy as pp

def _parse_sample_id(col: str) -> str:
    """Parse and clean sample identifiers from raw column names.

    Remove technical prefixes, suffixes, and file extensions from column
    names in the downloaded PRIDE data to extract meaningful sample
    identifiers.

    Parameters
    ----------
    col : str
        Raw column name from the PRIDE CSV file containing sample
        identifier and technical annotations.

    Returns
    -------
    str
        Cleaned sample identifier with technical metadata removed.

    Examples
    --------
    >>> col = "[1] 20181222_QX0_OzKa_SA_CD34pos_DIA_P1.raw.PG.Quantity"
    >>> _parse_sample_id(col)
    'P1'
    """
    col = re.sub(r"^\[\d+\]\s*", "", col)
    col = col.replace(".PG.Quantity", "")
    col = re.sub(r"\.raw$", "", col)
    col = Path(col).stem
    col = col.replace("20181222_QX0_OzKa_SA_CD34pos_", "")
    col = col.replace("DIA_", "")
    col = col.replace("_181226121547", "")
    return col

def karayel_2020():
    """Load Karayel 2020 erythropoiesis proteomics dataset.

    Download and process the protein-level DIA-MS dataset from Karayel
    et al. (2020) studying CD34+ hematopoietic stem cell differentiation
    during erythropoiesis. The dataset contains quantitative proteomics
    measurements across five cell types representing sequential stages
    of erythroid development.

    The function downloads data from the PRIDE archive (`PXD017276
    <https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD017276>`_),
    processes sample identifiers, maps technical names to biological
    cell types, and excludes day 7 samples. Protein quantities marked as
    'Filtered' in the original data are converted to ``np.nan``.

    Sample annotation (``.obs``) includes:

    - ``sample_id``: Unique sample identifier (cell_type_replicate)
    - ``cell_type``: Differentiation stage (Progenitor, ProE&EBaso,
      LBaso, Poly, Ortho); taken as the first ``_``-separated token of
      ``sample_id``
    - ``replicate``: Replicate identifier; taken as the last
      ``_``-separated token of ``sample_id``

    Variable annotation (``.var``) includes:

    - ``protein_id``: Protein group identifier (matches ``.var_names``)
    - ``gene_name``: Associated gene name(s)

    Returns
    -------
    ad.AnnData
        AnnData object containing protein-level quantification data.
        ``.X`` contains protein intensities (samples × proteins) with
        missing values as ``np.nan``. Day 7 samples are excluded from
        the dataset.

    Raises
    ------
    urllib.error.URLError
        Documented by the original author for a failed download from
        the PRIDE archive. NOTE(review): the download is delegated to
        ``pooch.retrieve``; confirm the exception type its downloader
        actually raises.

    Examples
    --------
    >>> import proteopy as pp
    >>> adata = pp.datasets.karayel_2020()  # doctest: +SKIP
    >>> adata  # doctest: +SKIP
    AnnData object with n_obs × n_vars
        obs: 'sample_id', 'cell_type', 'replicate'
        var: 'protein_id', 'gene_name'
    >>> adata.obs['cell_type'].unique()  # doctest: +SKIP
    ['Progenitor', 'ProE&EBaso', 'LBaso', 'Poly', 'Ortho']

    Notes
    -----
    The dataset represents five stages of erythroid differentiation:

    1. Progenitor: CD34+ hematopoietic stem cells
    2. ProE&EBaso: Proerythroblasts and early basophilic erythroblasts
    3. LBaso: Late basophilic erythroblasts
    4. Poly: Polychromatic erythroblasts
    5. Ortho: Orthochromatic erythroblasts

    Samples collected at day 7 (_D7) are filtered out during processing.

    Reference
    ---------
    Karayel Ö, Xu P, Bludau I, Velan Bhoopalan S, Yao Y, Ana Rita FC,
    Santos A, Schulman BA, Alpi AF, Weiss MJ, and Mann M. Integrative
    proteomics reveals principles of dynamic phosphosignaling networks
    in human erythropoiesis. Molecular Systems Biology, 2020.
    URL: https://doi.org/10.15252/msb.20209813,
    doi:10.15252/msb.20209813.
    """
    url = (
        "https://ftp.pride.ebi.ac.uk/pride/data/archive/2020/10/"
        "PXD017276/20190213_CD34_Phospho_study_DIA_proteome_Report.csv"
    )
    file_path = pooch.retrieve(
        url=url,
        # TODO: pin a checksum here; with ``None`` the downloaded file is
        # not verified against a known hash.
        known_hash=None,
        fname="karayel_2020_proteome_report.csv",
        path=pooch.os_cache("proteopy"),
    )

    df = pd.read_csv(file_path)
    quant_cols = [c for c in df.columns if c.endswith(".PG.Quantity")]

    # Replace 'Filtered' with np.nan before melting so the quantity
    # columns can be cast to float.
    df[quant_cols] = df[quant_cols].replace("Filtered", np.nan).astype(float)

    # Wide (one column per raw file) -> long (one row per protein/sample).
    long_df = df[["PG.ProteinGroups"] + quant_cols].melt(
        id_vars="PG.ProteinGroups",
        var_name="raw_col",
        value_name="intensity",
    )
    long_df["sample_id"] = long_df["raw_col"].map(_parse_sample_id)
    long_df = long_df.drop(columns=["raw_col"])
    long_df = long_df.rename(columns={"PG.ProteinGroups": "protein_id"})

    # Technical gate names -> biological stage labels. Applied in this
    # order as plain (non-regex) substring replacements.
    stage_labels = {
        "Negativefrac": "Progenitor",
        "P1andP2": "ProE&EBaso",
        "P3": "LBaso",
        "P4": "Poly",
        "P5": "Ortho",
    }
    sample_ids = long_df["sample_id"]
    for technical, biological in stage_labels.items():
        sample_ids = sample_ids.str.replace(technical, biological, regex=False)
    long_df["sample_id"] = sample_ids

    # Exclude day 7 samples; "_D7" is a literal token, not a pattern.
    is_day7 = long_df["sample_id"].str.contains("_D7", regex=False)
    quant = long_df[~is_day7]

    # One row per remaining sample; cell type and replicate are the first
    # and last underscore-separated tokens of the sample identifier.
    meta_obs = quant[["sample_id"]].drop_duplicates().reset_index(drop=True)
    id_tokens = meta_obs["sample_id"].str.split("_")
    meta_obs["cell_type"] = id_tokens.str[0]
    meta_obs["replicate"] = id_tokens.str[-1]

    # One row per (protein group, gene name) pair from the wide table.
    meta_var = (
        df[["PG.ProteinGroups", "PG.Genes"]]
        .drop_duplicates()
        .reset_index(drop=True)
        .rename(
            columns={
                "PG.ProteinGroups": "protein_id",
                "PG.Genes": "gene_name",
            }
        )
    )

    adata = pp.read.long(
        intensities=quant,
        level="protein",
        sample_annotation=meta_obs,
        var_annotation=meta_var,
    )
    return adata