Label transfer / projection on simulated cluster dataset¶
In this notebook, we demonstrate how to use CONCORD in classification mode to transfer labels from one dataset to another. As an example, we'll work with a simulated single-batch dataset containing 3 clusters. You can apply the same pipeline to multi-batch datasets simply by providing the domain_key
parameter when running CONCORD.
Package loading¶
%load_ext autoreload
%autoreload 2
import numpy as np
import scanpy as sc
import time
from pathlib import Path
import torch
import concord as ccd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import matplotlib as mpl
from matplotlib import font_manager, rcParams
custom_rc = {
'font.family': 'Arial', # Set the desired font for this plot
}
mpl.rcParams['svg.fonttype'] = 'none'
mpl.rcParams['pdf.fonttype'] = 42
proj_name = "simulation_clustereasy_classification"
save_dir = f"../save/dev_{proj_name}-{time.strftime('%b%d')}/"
save_dir = Path(save_dir)
save_dir.mkdir(parents=True, exist_ok=True)
data_dir = f"../data/{proj_name}/"
data_dir = Path(data_dir)
data_dir.mkdir(parents=True, exist_ok=True)
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
print(device)
seed = 0
ccd.ul.set_seed(seed)
file_suffix = f"{time.strftime('%b%d-%H%M')}"
file_suffix
cpu
'Sep10-2122'
Simulate simple 3-cluster dataset¶
state_key = 'cluster'
batch_key = 'batch'
state_type = 'cluster'
leiden_key='leiden'
batch_type = 'batch_specific_features'
distribution = 'normal'
from concord.simulation import (
SimConfig, ClusterConfig, BatchConfig, Simulation
)
# ──────────────── Inputs ────────────────
seed = 42
n_cells = [100,100,50]
n_genes = [1000,1000,500]
state_dispersion = [4.0, 2.0, 2.0]
# ─────────────── SimConfig ───────────────
# Global simulation settings: dataset size, RNG seed, and count post-processing
# (non_neg/to_int presumably clip negatives and round to integer counts —
# confirm against concord.simulation docs).
sim_cfg = SimConfig(
n_cells=n_cells,
n_genes=n_genes,
seed=seed,
non_neg=True,
to_int=True,
)
# ─────────────── ClusterConfig ───────────────
# Three clusters drawn from a normal distribution with per-cluster dispersion
# set above; a fraction of genes is shared (non-specific) across clusters.
state_cfg = ClusterConfig(
n_states=3,
distribution="normal",
level=5.0,
min_level=0.0,
dispersion=state_dispersion,
program_structure="uniform",
program_on_time_fraction=0.3,
global_non_specific_gene_fraction=0.1,
pairwise_non_specific_gene_fraction=None,
)
# ─────────────── BatchConfig ───────────────
# Single batch; list-valued level/dispersion carry one entry per batch.
batch_cfg = BatchConfig(
n_batches=1,
effect_type="batch_specific_features",
distribution="normal",
level=[5.0], # One value for 1 batch
dispersion=[3.0],
feature_frac=0.15,
)
# ─────────────── Simulation ───────────────
# Returns the simulated dataset plus a noiseless "state" reference AnnData.
sim = Simulation(sim_cfg, state_cfg, batch_cfg)
adata, adata_state = sim.simulate_data()
# Preserve raw counts before any normalization downstream.
adata.layers["counts"] = adata.X.copy()
# Visualize the simulated data as annotated heatmaps: true state, noisy state,
# and the final expression matrix with batch effects.
# (Removed a duplicate `import matplotlib.pyplot as plt`; plt is imported in
# the setup cell at the top of the notebook.)
figsize = (2.3, 1.8)
ncols = 3
title_fontsize = 9
dpi = 600
# Consistent color palettes for the cluster and batch annotations.
_, _, state_pal = ccd.pl.get_color_mapping(adata, state_key, pal='Paired', seed=seed)
_, _, batch_pal = ccd.pl.get_color_mapping(adata, batch_key, pal='Set1', seed=seed)
pal = {state_key: state_pal, batch_key: batch_pal}
fig, axes = plt.subplots(1, ncols, figsize=(figsize[0] * ncols, figsize[1]), dpi=dpi)
for ax, (val, title) in zip(
        axes,
        [('no_noise', 'True state'),
         ('wt_noise', 'True state with noise'),
         ('X', 'Simulated data with batch')]):
    ccd.pl.heatmap_with_annotations(
        adata,
        val=val,
        # The raw matrix additionally shows the batch annotation.
        obs_keys=[state_key] if val != 'X' else [state_key, batch_key],
        pal=pal,
        use_clustermap=False,  # plain heat-map, no dendrogram
        cluster_rows=False,
        cluster_cols=False,
        yticklabels=False,
        value_annot=False,
        cmap='viridis',
        title=title,
        title_fontsize=title_fontsize,  # fix: use the variable defined above (was a hard-coded 9)
        ax=ax,
        show=False  # don't flush the figure yet
    )
plt.tight_layout(w_pad=0.0, h_pad=0.1)
plt.savefig(save_dir / f"data_heatmaps_{file_suffix}.svg",
            dpi=dpi, bbox_inches='tight')  # fix: reuse the dpi variable (was a hard-coded 600)
plt.show()
Run dimension reduction¶
Let's take a look at the data with standard PCA and UMAP.
# Baseline visualization: PCA and UMAP on the noiseless and noisy layers.
n_components = 15
# Expose the layers through obsm so the reduction helpers can consume them.
adata.obsm['no_noise'] = adata.layers['no_noise']
adata.obsm['wt_noise'] = adata.layers['wt_noise']
ccd.ul.run_pca(adata, source_key='no_noise', result_key='PCA_no_noise', n_pc=n_components, random_state=seed)
sc.pp.neighbors(adata, use_rep='PCA_no_noise', n_neighbors=30, random_state=seed)
ccd.ul.run_umap(adata, source_key='no_noise', result_key='UMAP_no_noise', random_state=seed)
ccd.ul.run_umap(adata, source_key='wt_noise', result_key='UMAP_wt_noise', random_state=seed)
concord - INFO - PCA performed on source data with 15 components concord - INFO - PCA embedding stored in adata.obsm['PCA_no_noise']
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
concord - INFO - UMAP embedding stored in adata.obsm['UMAP_no_noise'] concord - INFO - UMAP embedding stored in adata.obsm['UMAP_wt_noise']
# Randomly hide 30% of the class labels (set to 'unlabeled') for prediction.
adata.obs['input_state'] = adata.obs[state_key].astype(str)
# Fix: assign through .loc instead of chained indexing
# (adata.obs['input_state'][idx] = ...), which raises SettingWithCopyWarning
# and silently stops writing under pandas copy-on-write (pandas >= 3.0).
unlabeled_idx = adata.obs.sample(frac=0.3, random_state=seed).index
adata.obs.loc[unlabeled_idx, 'input_state'] = 'unlabeled'
adata.obs['input_state'].value_counts()
# Fixed palette: hidden cells shown in gray, the three clusters in bold colors.
clus_pal = {
    'unlabeled': 'lightgray',
    'cluster_1': 'red',
    'cluster_2': 'blue',
    'cluster_3': 'green',
}
# Every label-like column we plot shares the same cluster palette.
pal = {key: clus_pal for key in ('input_state', state_key, 'Concord_class_pred')}
# Choose which variables you want to visualize
color_by = [state_key, 'input_state']
ccd.pl.plot_embedding(
    adata,
    basis='UMAP_wt_noise',
    color_by=color_by,
    figsize=(8, 2),
    dpi=600,
    ncols=3,
    font_size=6,
    point_size=10,
    legend_loc='on data',
    pal=pal,
    save_path=save_dir / 'UMAP_wt_noise.png'
)
Brief intro to CONCORD classification mode¶
Enable classification by setting use_classifier=True.
To reduce overfitting, we also recommend holding out a validation set (e.g., train_frac=0.9).
CONCORD supports three labeling workflows:
Semi-supervised joint training
Train on all cells; only labeled cells contribute to the classification loss, while all cells participate in learning the manifold (via contrastive objective).
Pros: leverages unlabeled data for representation learning; labels refine decision boundaries.
Cons: slightly more tuning may be needed if labels are sparse or imbalanced.

Supervised on labeled-only cells
Train using only labeled cells and then predict the rest.
Pros: simple and often strong when labels are abundant and clean.
Cons: unlabeled cells do not influence the manifold, which can reduce structure in heterogeneous datasets.

Unsupervised + external classifier
Learn the embedding without a classifier (use_classifier=False
), then fit a third-party classifier (e.g., KNeighborsClassifier) on labeled cells to annotate the unlabeled set.
Pros: decouples embedding from the classifier; easy to swap models.
Cons: ignores labels during representation learning and can yield lower classification performance.
Strategy 1: Semi-supervised joint training¶
# If integrating data across batch, simply add the domain_key argument to indicate the batch key in adata.obs
cur_ccd = ccd.Concord(adata=adata,
class_key = 'input_state',
use_classifier = True,
unlabeled_class = 'unlabeled',
clr_beta=1.0,
p_intra_knn=0,
n_epochs=10,
preload_dense=True,
verbose=True,
train_frac=0.9, # Use 90% of data for training, and 10% for validation
device=device)
# Encode data, saving the latent embedding in adata.obsm['Concord']
cur_ccd.fit_transform(output_key='Concord')
concord - INFO - Operating directly on the provided AnnData object. Object may be modified. concord - WARNING - Input data in adata.X appears to be raw counts. CONCORD performs best on normalized and log-transformed data. Consider setting normalize_total=True and log1p=True. concord - WARNING - No input feature list provided. It is recommended to first select features using the command `concord.ul.select_features()`. concord - INFO - Proceeding with all 2875 features in the dataset. concord - INFO - Using NT-Xent loss with beta=1.0. This will apply hard-negative weighting to the contrastive loss. concord - INFO - HCL (Contrastive learning with hard negative samples) mode is enabled. concord - WARNING - domain/batch information not found, all samples will be treated as from single domain/batch. concord - WARNING - Only one domain found in the data. Setting p_intra_domain to 1.0. concord - INFO - Column 'input_state' is already of type: category concord - INFO - Unused levels dropped for column 'input_state'. concord - INFO - Encoder input dim: 2875 concord - INFO - Classifier input dim: 100 concord - INFO - Model loaded to device: cpu concord - INFO - Total number of parameters: 2988911 concord.model.dataloader - INFO - Using 0 DataLoader workers. concord.model.anndataset - INFO - Initialized lightweight dataset with 250 samples. concord.model.dataloader - INFO - Loading all data into memory for fast access. This may consume a lot of RAM. If you run out of memory, please set `preload_dense=False`. concord.model.anndataset - INFO - Initialized lightweight dataset with 225 samples. concord.model.anndataset - INFO - Initialized lightweight dataset with 25 samples. 
concord - INFO - Augmentation probabilities: concord - INFO - - Element mask probability: 0.4 concord - INFO - - Feature mask probability: 0.3 concord - INFO - Starting epoch 1/10 concord - INFO - Processing chunk 1/1 for epoch 1 concord - INFO - Number of samples in train_dataloader: 225 concord - INFO - Number of samples in val_dataloader: 25
Epoch 0 Training: 100%|██████████| 1/1 [00:00<00:00, 76.75it/s, loss=6.819]
concord - INFO - Epoch 0 | Train Loss: 6.82, MSE: 0.00, CLASS: 1.18, CONTRAST: 5.63, IMPORTANCE: 0.00 concord - INFO - Epoch: 0 | Train accuracy: 0.41 | precision: 0: 0.42, 1: 0.22, 2: 0.50 | recall: 0: 0.84, 1: 0.05, 2: 0.11 | f1: 0: 0.56, 1: 0.08, 2: 0.17
Epoch 0 Validation: 100%|██████████| 1/1 [00:00<00:00, 336.81it/s, loss=5.341]
concord - INFO - Epoch 0 | Val Loss: 5.34, MSE: 0.00, CLASS: 1.59, CONTRAST: 3.75, IMPORTANCE: 0.00 concord - INFO - Epoch: 0 | Val accuracy: 0.37 | precision: 0: 0.00, 1: 0.37, 2: 0.00 | recall: 0: 0.00, 1: 1.00, 2: 0.00 | f1: 0: 0.00, 1: 0.54, 2: 0.00 concord - INFO - New best model found at epoch 1 with validation loss: 5.3413 concord - INFO - Starting epoch 2/10 concord - INFO - Processing chunk 1/1 for epoch 2 concord - INFO - Number of samples in train_dataloader: 225 concord - INFO - Number of samples in val_dataloader: 25
Epoch 1 Training: 100%|██████████| 1/1 [00:00<00:00, 50.14it/s, loss=7.011]
concord - INFO - Epoch 1 | Train Loss: 7.01, MSE: 0.00, CLASS: 1.47, CONTRAST: 5.54, IMPORTANCE: 0.00 concord - INFO - Epoch: 1 | Train accuracy: 0.38 | precision: 0: 0.00, 1: 0.38, 2: 0.00 | recall: 0: 0.00, 1: 1.00, 2: 0.00 | f1: 0: 0.00, 1: 0.55, 2: 0.00
Epoch 1 Validation: 100%|██████████| 1/1 [00:00<00:00, 330.49it/s, loss=4.161]
concord - INFO - Epoch 1 | Val Loss: 4.16, MSE: 0.00, CLASS: 0.47, CONTRAST: 3.69, IMPORTANCE: 0.00 concord - INFO - Epoch: 1 | Val accuracy: 0.63 | precision: 0: 1.00, 1: 0.00, 2: 0.30 | recall: 0: 1.00, 1: 0.00, 2: 1.00 | f1: 0: 1.00, 1: 0.00, 2: 0.46 concord - INFO - New best model found at epoch 2 with validation loss: 4.1609 concord - INFO - Starting epoch 3/10 concord - INFO - Processing chunk 1/1 for epoch 3 concord - INFO - Number of samples in train_dataloader: 225 concord - INFO - Number of samples in val_dataloader: 25
Epoch 2 Training: 100%|██████████| 1/1 [00:00<00:00, 65.44it/s, loss=6.055]
concord - INFO - Epoch 2 | Train Loss: 6.05, MSE: 0.00, CLASS: 0.50, CONTRAST: 5.55, IMPORTANCE: 0.00 concord - INFO - Epoch: 2 | Train accuracy: 0.69 | precision: 0: 1.00, 1: 1.00, 2: 0.34 | recall: 0: 1.00, 1: 0.23, 2: 1.00 | f1: 0: 1.00, 1: 0.38, 2: 0.51
Epoch 2 Validation: 100%|██████████| 1/1 [00:00<00:00, 287.62it/s, loss=3.924]
concord - INFO - Epoch 2 | Val Loss: 3.92, MSE: 0.00, CLASS: 0.28, CONTRAST: 3.64, IMPORTANCE: 0.00 concord - INFO - Epoch: 2 | Val accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00 concord - INFO - New best model found at epoch 3 with validation loss: 3.9238 concord - INFO - Starting epoch 4/10 concord - INFO - Processing chunk 1/1 for epoch 4 concord - INFO - Number of samples in train_dataloader: 225 concord - INFO - Number of samples in val_dataloader: 25
Epoch 3 Training: 100%|██████████| 1/1 [00:00<00:00, 40.17it/s, loss=5.803]
concord - INFO - Epoch 3 | Train Loss: 5.80, MSE: 0.00, CLASS: 0.30, CONTRAST: 5.50, IMPORTANCE: 0.00 concord - INFO - Epoch: 3 | Train accuracy: 0.98 | precision: 0: 0.96, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 0.90 | f1: 0: 0.98, 1: 1.00, 2: 0.95
Epoch 3 Validation: 100%|██████████| 1/1 [00:00<00:00, 296.56it/s, loss=3.686]
concord - INFO - Epoch 3 | Val Loss: 3.69, MSE: 0.00, CLASS: 0.08, CONTRAST: 3.61, IMPORTANCE: 0.00 concord - INFO - Epoch: 3 | Val accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00 concord - INFO - New best model found at epoch 4 with validation loss: 3.6859 concord - INFO - Starting epoch 5/10 concord - INFO - Processing chunk 1/1 for epoch 5 concord - INFO - Number of samples in train_dataloader: 225 concord - INFO - Number of samples in val_dataloader: 25
Epoch 4 Training: 100%|██████████| 1/1 [00:00<00:00, 48.02it/s, loss=5.551]
concord - INFO - Epoch 4 | Train Loss: 5.55, MSE: 0.00, CLASS: 0.09, CONTRAST: 5.46, IMPORTANCE: 0.00 concord - INFO - Epoch: 4 | Train accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00
Epoch 4 Validation: 100%|██████████| 1/1 [00:00<00:00, 283.26it/s, loss=3.640]
concord - INFO - Epoch 4 | Val Loss: 3.64, MSE: 0.00, CLASS: 0.03, CONTRAST: 3.61, IMPORTANCE: 0.00 concord - INFO - Epoch: 4 | Val accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00 concord - INFO - New best model found at epoch 5 with validation loss: 3.6401 concord - INFO - Starting epoch 6/10 concord - INFO - Processing chunk 1/1 for epoch 6 concord - INFO - Number of samples in train_dataloader: 225 concord - INFO - Number of samples in val_dataloader: 25
Epoch 5 Training: 100%|██████████| 1/1 [00:00<00:00, 54.84it/s, loss=5.547]
concord - INFO - Epoch 5 | Train Loss: 5.55, MSE: 0.00, CLASS: 0.03, CONTRAST: 5.51, IMPORTANCE: 0.00
concord - INFO - Epoch: 5 | Train accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00
Epoch 5 Validation: 100%|██████████| 1/1 [00:00<00:00, 267.44it/s, loss=3.633]
concord - INFO - Epoch 5 | Val Loss: 3.63, MSE: 0.00, CLASS: 0.02, CONTRAST: 3.61, IMPORTANCE: 0.00 concord - INFO - Epoch: 5 | Val accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00 concord - INFO - New best model found at epoch 6 with validation loss: 3.6333
concord - INFO - Starting epoch 7/10 concord - INFO - Processing chunk 1/1 for epoch 7 concord - INFO - Number of samples in train_dataloader: 225 concord - INFO - Number of samples in val_dataloader: 25
Epoch 6 Training: 100%|██████████| 1/1 [00:00<00:00, 63.10it/s, loss=5.484]
concord - INFO - Epoch 6 | Train Loss: 5.48, MSE: 0.00, CLASS: 0.02, CONTRAST: 5.46, IMPORTANCE: 0.00 concord - INFO - Epoch: 6 | Train accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00
Epoch 6 Validation: 100%|██████████| 1/1 [00:00<00:00, 325.39it/s, loss=3.630]
concord - INFO - Epoch 6 | Val Loss: 3.63, MSE: 0.00, CLASS: 0.02, CONTRAST: 3.61, IMPORTANCE: 0.00 concord - INFO - Epoch: 6 | Val accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00 concord - INFO - New best model found at epoch 7 with validation loss: 3.6304 concord - INFO - Starting epoch 8/10 concord - INFO - Processing chunk 1/1 for epoch 8 concord - INFO - Number of samples in train_dataloader: 225 concord - INFO - Number of samples in val_dataloader: 25
Epoch 7 Training: 100%|██████████| 1/1 [00:00<00:00, 62.30it/s, loss=5.466]
concord - INFO - Epoch 7 | Train Loss: 5.47, MSE: 0.00, CLASS: 0.02, CONTRAST: 5.45, IMPORTANCE: 0.00 concord - INFO - Epoch: 7 | Train accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00
Epoch 7 Validation: 100%|██████████| 1/1 [00:00<00:00, 411.97it/s, loss=3.627]
concord - INFO - Epoch 7 | Val Loss: 3.63, MSE: 0.00, CLASS: 0.01, CONTRAST: 3.62, IMPORTANCE: 0.00 concord - INFO - Epoch: 7 | Val accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00 concord - INFO - New best model found at epoch 8 with validation loss: 3.6270 concord - INFO - Starting epoch 9/10 concord - INFO - Processing chunk 1/1 for epoch 9 concord - INFO - Number of samples in train_dataloader: 225 concord - INFO - Number of samples in val_dataloader: 25
Epoch 8 Training: 100%|██████████| 1/1 [00:00<00:00, 41.35it/s, loss=5.471]
concord - INFO - Epoch 8 | Train Loss: 5.47, MSE: 0.00, CLASS: 0.01, CONTRAST: 5.46, IMPORTANCE: 0.00 concord - INFO - Epoch: 8 | Train accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00
Epoch 8 Validation: 100%|██████████| 1/1 [00:00<00:00, 267.46it/s, loss=3.623]
concord - INFO - Epoch 8 | Val Loss: 3.62, MSE: 0.00, CLASS: 0.01, CONTRAST: 3.61, IMPORTANCE: 0.00 concord - INFO - Epoch: 8 | Val accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00 concord - INFO - New best model found at epoch 9 with validation loss: 3.6232 concord - INFO - Starting epoch 10/10 concord - INFO - Processing chunk 1/1 for epoch 10 concord - INFO - Number of samples in train_dataloader: 225 concord - INFO - Number of samples in val_dataloader: 25
Epoch 9 Training: 100%|██████████| 1/1 [00:00<00:00, 69.30it/s, loss=5.393]
concord - INFO - Epoch 9 | Train Loss: 5.39, MSE: 0.00, CLASS: 0.01, CONTRAST: 5.38, IMPORTANCE: 0.00 concord - INFO - Epoch: 9 | Train accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00
Epoch 9 Validation: 100%|██████████| 1/1 [00:00<00:00, 347.10it/s, loss=3.619]
concord - INFO - Epoch 9 | Val Loss: 3.62, MSE: 0.00, CLASS: 0.01, CONTRAST: 3.61, IMPORTANCE: 0.00 concord - INFO - Epoch: 9 | Val accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00
concord - INFO - New best model found at epoch 10 with validation loss: 3.6190 concord - INFO - Best model state loaded into the model before final save. concord - INFO - Model saved to save/final_model_Sep10-2134.pt concord - INFO - Final model saved at: save/final_model_Sep10-2134.pt; Configuration saved at: save/config_Sep10-2134.json. concord.model.dataloader - INFO - Using 0 DataLoader workers. concord.model.anndataset - INFO - Initialized lightweight dataset with 250 samples. concord.model.dataloader - INFO - Loading all data into memory for fast access. This may consume a lot of RAM. If you run out of memory, please set `preload_dense=False`. concord - INFO - Predicting for chunk 1/1 concord - INFO - Predictions added to AnnData object with base key 'Concord'.
# Embed the CONCORD latent space with UMAP and visualize it colored by the
# true labels, the masked input labels, and the classifier's predictions.
ccd.ul.run_umap(adata, source_key='Concord', result_key='Concord_UMAP', n_components=2, n_neighbors=15, min_dist=0.1, metric='euclidean')
color_by = [state_key, 'input_state', 'Concord_class_pred']
ccd.pl.plot_embedding(
adata, basis='Concord_UMAP', color_by=color_by, figsize=(8, 2), dpi=600, ncols=3, font_size=6, point_size=10, legend_loc='on data',
pal=pal,
save_path=save_dir / 'Concord_UMAP.png'
)
concord - INFO - UMAP embedding stored in adata.obsm['Concord_UMAP']
Strategy 2: Supervised training on labeled-only cells¶
# Split into labeled cells (for training) and unlabeled cells (to predict).
# Fix: take explicit copies — adata[mask] returns an AnnData *view*, and the
# CONCORD log above warns the provided object "may be modified"; mutating a
# view triggers an implicit copy-with-warning and can be surprising.
adata_train = adata[adata.obs['input_state'] != 'unlabeled'].copy()
adata_pred = adata[adata.obs['input_state'] == 'unlabeled'].copy()
The model only sees the labeled dataset during training:
# Strategy 2: supervised training on the labeled subset only; the held-out
# cells are annotated afterwards with predict_adata. No unlabeled_class is
# passed because adata_train contains no 'unlabeled' entries.
# NOTE(review): unlike Strategy 1, `device` is not passed here (the log shows
# the model on CPU) — confirm this is intended.
cur_ccd = ccd.Concord(adata=adata_train,
class_key = 'input_state',  # obs column holding the labels
use_classifier = True,      # enable the classification head
clr_beta=1.0,    # hard-negative weighting for the contrastive loss
p_intra_knn=0,
n_epochs=10,
preload_dense=True,  # load all data into memory for speed (RAM permitting)
verbose=True,
train_frac=0.9, # Use 90% of data for training, and 10% for validation
save_dir=save_dir / 'model/', # directory to save model checkpoints
)
# Encode data, saving the latent embedding in adata.obsm['Concord']
cur_ccd.fit_transform(output_key='Concord')
concord - INFO - Operating directly on the provided AnnData object. Object may be modified. concord - WARNING - Input data in adata.X appears to be raw counts. CONCORD performs best on normalized and log-transformed data. Consider setting normalize_total=True and log1p=True. concord - WARNING - No input feature list provided. It is recommended to first select features using the command `concord.ul.select_features()`. concord - INFO - Proceeding with all 2875 features in the dataset. concord - INFO - Using NT-Xent loss with beta=1.0. This will apply hard-negative weighting to the contrastive loss. concord - INFO - HCL (Contrastive learning with hard negative samples) mode is enabled. concord - WARNING - domain/batch information not found, all samples will be treated as from single domain/batch. concord - WARNING - Only one domain found in the data. Setting p_intra_domain to 1.0. concord - INFO - Column 'input_state' is already of type: category concord - INFO - Unused levels dropped for column 'input_state'. concord - INFO - Encoder input dim: 2875 concord - INFO - Classifier input dim: 100 concord - INFO - Model loaded to device: cpu concord - INFO - Total number of parameters: 2988911 concord.model.dataloader - INFO - Using 0 DataLoader workers. concord.model.anndataset - INFO - Initialized lightweight dataset with 175 samples. concord.model.dataloader - INFO - Loading all data into memory for fast access. This may consume a lot of RAM. If you run out of memory, please set `preload_dense=False`. concord.model.anndataset - INFO - Initialized lightweight dataset with 157 samples. concord.model.anndataset - INFO - Initialized lightweight dataset with 18 samples. 
concord - INFO - Augmentation probabilities: concord - INFO - - Element mask probability: 0.4 concord - INFO - - Feature mask probability: 0.3 concord - INFO - Starting epoch 1/10 concord - INFO - Processing chunk 1/1 for epoch 1 concord - INFO - Number of samples in train_dataloader: 157 concord - INFO - Number of samples in val_dataloader: 18
Epoch 0 Training: 100%|██████████| 1/1 [00:00<00:00, 88.13it/s, loss=6.761]
concord - INFO - Epoch 0 | Train Loss: 6.76, MSE: 0.00, CLASS: 1.34, CONTRAST: 5.42, IMPORTANCE: 0.00 concord - INFO - Epoch: 0 | Train accuracy: 0.43 | precision: 0: 0.45, 1: 0.33, 2: 0.00 | recall: 0: 0.91, 1: 0.06, 2: 0.00 | f1: 0: 0.60, 1: 0.11, 2: 0.00
Epoch 0 Validation: 100%|██████████| 1/1 [00:00<00:00, 379.95it/s, loss=5.390]
concord - INFO - Epoch 0 | Val Loss: 5.39, MSE: 0.00, CLASS: 2.02, CONTRAST: 3.37, IMPORTANCE: 0.00 concord - INFO - Epoch: 0 | Val accuracy: 0.50 | precision: 0: 0.00, 1: 0.40, 2: 1.00 | recall: 0: 0.00, 1: 1.00, 2: 1.00 | f1: 0: 0.00, 1: 0.57, 2: 1.00 concord - INFO - New best model found at epoch 1 with validation loss: 5.3897 concord - INFO - Starting epoch 2/10 concord - INFO - Processing chunk 1/1 for epoch 2 concord - INFO - Number of samples in train_dataloader: 157 concord - INFO - Number of samples in val_dataloader: 18
Epoch 1 Training: 100%|██████████| 1/1 [00:00<00:00, 39.66it/s, loss=7.444]
concord - INFO - Epoch 1 | Train Loss: 7.44, MSE: 0.00, CLASS: 2.04, CONTRAST: 5.40, IMPORTANCE: 0.00 concord - INFO - Epoch: 1 | Train accuracy: 0.52 | precision: 0: 0.00, 1: 0.42, 2: 1.00 | recall: 0: 0.00, 1: 1.00, 2: 0.88 | f1: 0: 0.00, 1: 0.59, 2: 0.94
Epoch 1 Validation: 100%|██████████| 1/1 [00:00<00:00, 353.86it/s, loss=3.744]
concord - INFO - Epoch 1 | Val Loss: 3.74, MSE: 0.00, CLASS: 0.42, CONTRAST: 3.32, IMPORTANCE: 0.00 concord - INFO - Epoch: 1 | Val accuracy: 0.72 | precision: 0: 1.00, 1: 1.00, 2: 0.38 | recall: 0: 1.00, 1: 0.17, 2: 1.00 | f1: 0: 1.00, 1: 0.29, 2: 0.55 concord - INFO - New best model found at epoch 2 with validation loss: 3.7438 concord - INFO - Starting epoch 3/10 concord - INFO - Processing chunk 1/1 for epoch 3 concord - INFO - Number of samples in train_dataloader: 157 concord - INFO - Number of samples in val_dataloader: 18
Epoch 2 Training: 100%|██████████| 1/1 [00:00<00:00, 68.48it/s, loss=5.747]
concord - INFO - Epoch 2 | Train Loss: 5.75, MSE: 0.00, CLASS: 0.45, CONTRAST: 5.29, IMPORTANCE: 0.00 concord - INFO - Epoch: 2 | Train accuracy: 0.83 | precision: 0: 1.00, 1: 1.00, 2: 0.55 | recall: 0: 1.00, 1: 0.57, 2: 1.00 | f1: 0: 1.00, 1: 0.73, 2: 0.71
Epoch 2 Validation: 100%|██████████| 1/1 [00:00<00:00, 312.91it/s, loss=3.635]
concord - INFO - Epoch 2 | Val Loss: 3.63, MSE: 0.00, CLASS: 0.33, CONTRAST: 3.30, IMPORTANCE: 0.00 concord - INFO - Epoch: 2 | Val accuracy: 0.83 | precision: 0: 0.75, 1: 1.00, 2: 0.00 | recall: 0: 1.00, 1: 1.00, 2: 0.00 | f1: 0: 0.86, 1: 1.00, 2: 0.00 concord - INFO - New best model found at epoch 3 with validation loss: 3.6349 concord - INFO - Starting epoch 4/10 concord - INFO - Processing chunk 1/1 for epoch 4 concord - INFO - Number of samples in train_dataloader: 157 concord - INFO - Number of samples in val_dataloader: 18
Epoch 3 Training: 100%|██████████| 1/1 [00:00<00:00, 97.39it/s, loss=5.701]
concord - INFO - Epoch 3 | Train Loss: 5.70, MSE: 0.00, CLASS: 0.38, CONTRAST: 5.32, IMPORTANCE: 0.00 concord - INFO - Epoch: 3 | Train accuracy: 0.86 | precision: 0: 0.75, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 0.22 | f1: 0: 0.86, 1: 1.00, 2: 0.36
Epoch 3 Validation: 100%|██████████| 1/1 [00:00<00:00, 300.54it/s, loss=3.362]
concord - INFO - Epoch 3 | Val Loss: 3.36, MSE: 0.00, CLASS: 0.10, CONTRAST: 3.26, IMPORTANCE: 0.00 concord - INFO - Epoch: 3 | Val accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00 concord - INFO - New best model found at epoch 4 with validation loss: 3.3622 concord - INFO - Starting epoch 5/10 concord - INFO - Processing chunk 1/1 for epoch 5 concord - INFO - Number of samples in train_dataloader: 157 concord - INFO - Number of samples in val_dataloader: 18
Epoch 4 Training: 100%|██████████| 1/1 [00:00<00:00, 44.88it/s, loss=5.404]
concord - INFO - Epoch 4 | Train Loss: 5.40, MSE: 0.00, CLASS: 0.11, CONTRAST: 5.29, IMPORTANCE: 0.00 concord - INFO - Epoch: 4 | Train accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00
Epoch 4 Validation: 100%|██████████| 1/1 [00:00<00:00, 337.16it/s, loss=3.304]
concord - INFO - Epoch 4 | Val Loss: 3.30, MSE: 0.00, CLASS: 0.05, CONTRAST: 3.26, IMPORTANCE: 0.00 concord - INFO - Epoch: 4 | Val accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00 concord - INFO - New best model found at epoch 5 with validation loss: 3.3041 concord - INFO - Starting epoch 6/10 concord - INFO - Processing chunk 1/1 for epoch 6 concord - INFO - Number of samples in train_dataloader: 157 concord - INFO - Number of samples in val_dataloader: 18
Epoch 5 Training: 100%|██████████| 1/1 [00:00<00:00, 79.26it/s, loss=5.243]
concord - INFO - Epoch 5 | Train Loss: 5.24, MSE: 0.00, CLASS: 0.04, CONTRAST: 5.20, IMPORTANCE: 0.00 concord - INFO - Epoch: 5 | Train accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00
Epoch 5 Validation: 100%|██████████| 1/1 [00:00<00:00, 341.42it/s, loss=3.298]
concord - INFO - Epoch 5 | Val Loss: 3.30, MSE: 0.00, CLASS: 0.03, CONTRAST: 3.27, IMPORTANCE: 0.00 concord - INFO - Epoch: 5 | Val accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00 concord - INFO - New best model found at epoch 6 with validation loss: 3.2978 concord - INFO - Starting epoch 7/10 concord - INFO - Processing chunk 1/1 for epoch 7 concord - INFO - Number of samples in train_dataloader: 157 concord - INFO - Number of samples in val_dataloader: 18
Epoch 6 Training: 100%|██████████| 1/1 [00:00<00:00, 48.02it/s, loss=5.224]
concord - INFO - Epoch 6 | Train Loss: 5.22, MSE: 0.00, CLASS: 0.03, CONTRAST: 5.19, IMPORTANCE: 0.00
concord - INFO - Epoch: 6 | Train accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00
Epoch 6 Validation: 100%|██████████| 1/1 [00:00<00:00, 459.15it/s, loss=3.295]
concord - INFO - Epoch 6 | Val Loss: 3.29, MSE: 0.00, CLASS: 0.02, CONTRAST: 3.27, IMPORTANCE: 0.00
concord - INFO - Epoch: 6 | Val accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00 concord - INFO - New best model found at epoch 7 with validation loss: 3.2946 concord - INFO - Starting epoch 8/10 concord - INFO - Processing chunk 1/1 for epoch 8 concord - INFO - Number of samples in train_dataloader: 157 concord - INFO - Number of samples in val_dataloader: 18
Epoch 7 Training: 100%|██████████| 1/1 [00:00<00:00, 41.52it/s, loss=5.242]
concord - INFO - Epoch 7 | Train Loss: 5.24, MSE: 0.00, CLASS: 0.02, CONTRAST: 5.22, IMPORTANCE: 0.00 concord - INFO - Epoch: 7 | Train accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00
Epoch 7 Validation: 100%|██████████| 1/1 [00:00<00:00, 275.63it/s, loss=3.292]
concord - INFO - Epoch 7 | Val Loss: 3.29, MSE: 0.00, CLASS: 0.02, CONTRAST: 3.28, IMPORTANCE: 0.00
concord - INFO - Epoch: 7 | Val accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00 concord - INFO - New best model found at epoch 8 with validation loss: 3.2922 concord - INFO - Starting epoch 9/10 concord - INFO - Processing chunk 1/1 for epoch 9 concord - INFO - Number of samples in train_dataloader: 157 concord - INFO - Number of samples in val_dataloader: 18
Epoch 8 Training: 100%|██████████| 1/1 [00:00<00:00, 48.07it/s, loss=5.311]
concord - INFO - Epoch 8 | Train Loss: 5.31, MSE: 0.00, CLASS: 0.02, CONTRAST: 5.30, IMPORTANCE: 0.00 concord - INFO - Epoch: 8 | Train accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00
Epoch 8 Validation: 100%|██████████| 1/1 [00:00<00:00, 286.22it/s, loss=3.290]
concord - INFO - Epoch 8 | Val Loss: 3.29, MSE: 0.00, CLASS: 0.01, CONTRAST: 3.28, IMPORTANCE: 0.00
concord - INFO - Epoch: 8 | Val accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00 concord - INFO - New best model found at epoch 9 with validation loss: 3.2902 concord - INFO - Starting epoch 10/10 concord - INFO - Processing chunk 1/1 for epoch 10 concord - INFO - Number of samples in train_dataloader: 157 concord - INFO - Number of samples in val_dataloader: 18
Epoch 9 Training: 100%|██████████| 1/1 [00:00<00:00, 58.89it/s, loss=5.286]
concord - INFO - Epoch 9 | Train Loss: 5.29, MSE: 0.00, CLASS: 0.01, CONTRAST: 5.27, IMPORTANCE: 0.00
concord - INFO - Epoch: 9 | Train accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00
Epoch 9 Validation: 100%|██████████| 1/1 [00:00<00:00, 182.75it/s, loss=3.288]
concord - INFO - Epoch 9 | Val Loss: 3.29, MSE: 0.00, CLASS: 0.01, CONTRAST: 3.28, IMPORTANCE: 0.00 concord - INFO - Epoch: 9 | Val accuracy: 1.00 | precision: 0: 1.00, 1: 1.00, 2: 1.00 | recall: 0: 1.00, 1: 1.00, 2: 1.00 | f1: 0: 1.00, 1: 1.00, 2: 1.00 concord - INFO - New best model found at epoch 10 with validation loss: 3.2882 concord - INFO - Best model state loaded into the model before final save.
concord - INFO - Model saved to ../save/dev_simulation_clustereasy_classification-Sep10/model/final_model_Sep10-2136.pt concord - INFO - Final model saved at: ../save/dev_simulation_clustereasy_classification-Sep10/model/final_model_Sep10-2136.pt; Configuration saved at: ../save/dev_simulation_clustereasy_classification-Sep10/model/config_Sep10-2136.json. concord.model.dataloader - INFO - Using 0 DataLoader workers. concord.model.anndataset - INFO - Initialized lightweight dataset with 175 samples. concord.model.dataloader - INFO - Loading all data into memory for fast access. This may consume a lot of RAM. If you run out of memory, please set `preload_dense=False`. concord - INFO - Predicting for chunk 1/1 concord - INFO - Predictions added to AnnData object with base key 'Concord'.
# Embed and plot the training (labeled) cells in the Strategy-2 latent space.
ccd.ul.run_umap(adata_train, source_key='Concord', result_key='Concord_UMAP', n_components=2, n_neighbors=15, min_dist=0.1, metric='euclidean')
ccd.pl.plot_embedding(
    adata_train, basis='Concord_UMAP', color_by=color_by, figsize=(8, 2), dpi=600, ncols=3, font_size=6, point_size=10, legend_loc='on data',
    pal=pal,
    # Fix: save under a distinct name — reusing 'Concord_UMAP.png' silently
    # overwrote the Strategy-1 figure saved earlier in the notebook.
    save_path=save_dir / 'Concord_UMAP_train.png'
)
concord - INFO - UMAP embedding stored in adata.obsm['Concord_UMAP']
After training, use the snippet below to (optionally) reload the saved model and run inference on held-out or new data. Inference writes predicted labels and per-class probabilities to adata.obs
, and projects the new cells into the learned latent space via the trained encoder, saving the embedding in adata.obsm
(keys depend on your chosen output_key
).
# Optionally reload the trained model from disk before running inference:
#cur_ccd = ccd.Concord.load(model_dir=save_dir / 'model/',)
# Project held-out cells into the learned latent space and predict labels;
# predictions and the embedding are written into adata_pred under 'Concord'.
cur_ccd.predict_adata(adata_pred, output_key='Concord')
concord - WARNING - domain/batch information not found, all samples will be treated as from single domain/batch.
concord - WARNING - Only one domain found in the data. Setting p_intra_domain to 1.0. concord.model.dataloader - INFO - Using 0 DataLoader workers. concord.model.anndataset - INFO - Initialized lightweight dataset with 75 samples. concord.model.dataloader - INFO - Loading all data into memory for fast access. This may consume a lot of RAM. If you run out of memory, please set `preload_dense=False`. concord - INFO - Predicting for chunk 1/1 concord - INFO - Predictions added to AnnData object with base key 'Concord'.
# Merge the training and held-out cells for joint visualization;
# 'train_test' records which split each cell came from.
# NOTE(review): AnnData.concatenate is deprecated in recent anndata releases
# in favor of anndata.concat — confirm against the pinned anndata version.
adata_merge = adata_train.concatenate(adata_pred, batch_key='train_test', index_unique=None)
# Re-embed the merged latent space so both splits share one UMAP.
ccd.ul.run_umap(adata_merge, source_key='Concord', result_key='Concord_UMAP_2D', n_components=2, n_neighbors=30, min_dist=0.5, metric='euclidean', random_state=seed)
color_by = [state_key, 'input_state', 'Concord_class_pred', 'train_test']
show_basis = 'Concord_UMAP_2D'
ccd.pl.plot_embedding(
adata_merge, show_basis, color_by, figsize=(9,2), dpi=300, ncols=4, font_size=5, point_size=10, legend_loc='on data',
pal=pal,
save_path=save_dir / f"{show_basis}_{file_suffix}.png"
)
concord - INFO - UMAP embedding stored in adata.obsm['Concord_UMAP_2D']