Single cell reference (PBMC3K)#

Preprocess and annotate PBMC3K.

[1]:

import numpy as np
import pandas as pd
import scanpy as sc
from anndata import AnnData
import matplotlib.pyplot as plt
import seaborn as sns

from dtangle import deconvolut

[2]:

adata = sc.datasets.pbmc3k()
adata.layers["counts"] = adata.X.copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=2000, subset=False)

[3]:

sc.tl.pca(adata)
sc.pp.neighbors(adata)
sc.tl.leiden(adata, resolution=0.2, key_added="leiden")
sc.tl.umap(adata)

/tmp/ipykernel_10289/3177813798.py:3: FutureWarning: In the future, the default backend for leiden will be igraph instead of leidenalg.

 To achieve the future defaults please pass: flavor="igraph" and n_iterations=2.  directed must also be False to work with igraph's implementation.
  sc.tl.leiden(adata, resolution=0.2, key_added="leiden")

[4]:

sc.pl.umap(
    adata,
    color=[
        "leiden",
        "CD3E",
        "MS4A1",
        "NKG7",
        "CST3",
    ],
    ncols=3,
)

../_images/notebooks_vignette_pbmc3k_4_0.png

[5]:

adata.obs["celltype"] = adata.obs["leiden"].map(
    {
        "0": "T cells",
        "1": "Monocytes",
        "2": "NK cells",
        "3": "B cells",
    }
)

[6]:

fig, ax = plt.subplots(figsize=(4, 4))
sc.pl.umap(adata, color=["celltype"], legend_loc="on data", ax=ax, show=False)

[6]:

<Axes: title={'center': 'celltype'}, xlabel='UMAP1', ylabel='UMAP2'>

../_images/notebooks_vignette_pbmc3k_6_1.png

Build pseudobulk AnnData

[7]:

rng = np.random.default_rng(42)
donors = [f"donor_{i}" for i in range(8)]
adata.obs["donor"] = rng.choice(donors, size=adata.n_obs, replace=True)
truth = pd.crosstab(adata.obs["donor"], adata.obs["celltype"], normalize="index")
truth = truth.reindex(donors, fill_value=0.0)
counts = adata.layers["counts"]
pb_rows = []
for d in donors:
    idx = np.where(adata.obs["donor"].to_numpy() == d)[0]
    pb_rows.append(np.asarray(counts[idx].sum(axis=0)).ravel())

# Create pseudo-bulk dataset
pb = AnnData(X=np.vstack(pb_rows))
pb.obs_names = donors
pb.var_names = adata.var_names.copy()
pb.obsm["ground_truth"] = truth.copy()
sc.pp.log1p(pb)

# Create reference dataset
reference = AnnData(X=adata.layers["counts"].copy(), obs=adata.obs[["celltype"]].copy())
reference.var_names = adata.var_names.copy()
sc.pp.log1p(reference)

Deconvolution and comparison

[8]:

deconvolut(
    pb,
    reference,
    "celltype",
    n_markers=30,
    marker_method="ratio",
    data_type="rna-seq",
    key_added="dtangle",
)

[9]:

est = pb.obsm["dtangle"].copy()
truth_aligned = pb.obsm["ground_truth"].reindex(columns=est.columns, fill_value=0.0)
mae_by_type = (est - truth_aligned).abs().mean(axis=0).sort_values()
overall_mae = float((est - truth_aligned).abs().to_numpy().mean())
comparison = est.add_prefix("est_").join(truth_aligned.add_prefix("truth_"))
print(f"overall_mae: {overall_mae:.4f}")

overall_mae: 0.0226

[10]:

fig, ax = plt.subplots(1, 1, figsize=(6, 6))
for ct in est.columns:
    sns.scatterplot(
        x=comparison[f"truth_{ct}"],
        y=comparison[f"est_{ct}"],
        ax=ax,
        label=ct,
    )
ax.plot([0, 1], [0, 1], "r--")
ax.set_xlabel("True Proportion")
ax.set_ylabel("Estimated Proportion")
_ = ax.legend(frameon=False)

../_images/notebooks_vignette_pbmc3k_12_0.png