C. elegans Embryogenesis by Packer & Zhu et al.
This tutorial processes the C. elegans embryogenesis single-cell dataset published by Packer & Zhu et al. (2019). The data can be downloaded from https://zenodo.org/records/15236812.
Load package and data
In [1]:
# get current working directory
import os
cwd = os.getcwd()
print("Current working directory:", cwd)
Current working directory: /Users/QZhu/Documents/CONCORD/QZ_documentation/Concord_documentation/docs/notebooks
In [ ]:
import concord as ccd
import scanpy as sc
import torch
import warnings
from pathlib import Path
import time
warnings.filterwarnings('ignore')
data_dir = Path('../../data/celegans_data/') # Replace with your data directory
data_path = data_dir / "celegans_global_adata.h5ad"
adata = sc.read(data_path)
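If celegans_global_adata.h5ad is not yet in your data directory, a minimal download sketch like the one below can fetch it from the Zenodo record linked above. The direct-download URL follows Zenodo's standard files pattern and is an assumption here; verify the exact file name on the record page.

# Optional: fetch the dataset from Zenodo if it is not present locally.
# The URL pattern below is an assumption based on Zenodo's standard
# files endpoint; check the record page for the exact file name.
import urllib.request
from pathlib import Path

data_dir = Path('../../data/celegans_data/')
data_dir.mkdir(parents=True, exist_ok=True)

url = "https://zenodo.org/records/15236812/files/celegans_global_adata.h5ad?download=1"
dest = data_dir / "celegans_global_adata.h5ad"
if not dest.exists():
    urllib.request.urlretrieve(url, str(dest))  # large file; this may take a while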
In [ ]:
proj_name = "concord_celegans"
save_dir = f"../../save/{proj_name}-{time.strftime('%b%d')}/" # Replace with your save directory if needed
save_dir = Path(save_dir)
save_dir.mkdir(parents=True, exist_ok=True)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') # Replace with your device, for example, 'mps' for Apple silicon
file_suffix = f"{time.strftime('%b%d-%H%M')}" # Replace with your file suffix if needed
seed = 0
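On Apple silicon, the 'mps' backend can be used instead of CUDA, as noted in the comment above. A small sketch of an automatic fallback, assuming a recent PyTorch build with MPS support:

# Pick the best available device automatically (a sketch; adjust to your setup).
if torch.cuda.is_available():
    device = torch.device('cuda:0')
elif torch.backends.mps.is_available():  # Apple silicon GPU (requires a recent PyTorch)
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print("Using device:", device)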
In [4]:
adata.layers["counts"] = adata.X.copy() # Store raw counts in a layer to keep a backup
feature_list = ccd.ul.select_features(adata, n_top_features=6000, flavor='seurat_v3') # Loosely select features based on Seurat v3 method (so that enough information is preserved)
sc.pp.normalize_total(adata) # Normalize counts per cell
sc.pp.log1p(adata) # Log-transform data
adata = adata[adata.obs['to.filter'] == 'FALSE'] # Only keep cells that pass quality filter
concord.utils.feature_selector - INFO - Selecting highly variable features with flavor seurat_v3...
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
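Before training, it can be worth sanity-checking the preprocessing: that raw counts were preserved in the layer, how many features were selected, and how many cells survive the quality filter. A quick optional check, using only the objects created above:

# Optional sanity checks on the preprocessed object.
print(adata)                                         # summary: n_obs x n_vars, layers, obs columns
print("Selected features:", len(feature_list))       # should be 6000
print("Counts layer present:", "counts" in adata.layers)
print("Cells after filtering:", adata.n_obs)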
Run Concord
By default, the HCL mode is used, which is specified through 'clr_beta': 1.0. To use kNN mode, set 'clr_beta': 0.0 and 'p_intra_knn': 0.3 (or any value between 0.1 and 0.5). A hybrid mode is also supported by setting both clr_beta and p_intra_knn to non-zero values, but it has not been benchmarked. A sketch of the kNN-mode override is shown below.
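For example, to run in kNN mode instead of the default HCL mode, you could override the two sampling-related keys of the concord_args dictionary defined in the next cell (a sketch; all other arguments stay the same):

# Sketch: switch from the default HCL mode to kNN mode by overriding
# the two sampling-related keys (concord_args is defined in the next cell).
knn_args = {
    **concord_args,
    'clr_beta': 0.0,     # disable hard-negative (HCL) weighting
    'p_intra_knn': 0.3,  # enable kNN-based sampling (recommended 0.1-0.5)
}
# cur_ccd = ccd.Concord(**knn_args)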
In [ ]:
concord_args = {
    'adata': adata,
    'input_feature': feature_list,
    'batch_size': 128,            # Batch size for training, adjust as needed
    'latent_dim': 300,            # Latent dimension size, adjust as needed
    'encoder_dims': [1000],       # Encoder dimensions, recommended to be larger than latent_dim
    'element_mask_prob': 0.4,     # Probability of masking elements, recommended to be between 0.2 and 0.6
    'feature_mask_prob': 0.2,     # Probability of masking features, recommended to be between 0.0 and 0.5
    'clr_temperature': 0.4,       # Temperature for contrastive loss, recommended to be between 0.1 and 0.5
    'clr_beta': 1.0,              # Weight for hcl sampling, recommended to be between 0.5 and 2.0
    'p_intra_domain': 1.0,        # Enrichment probability for intra-domain sampling, recommended to be between 0.9 and 1.0; note the lower the value, the more dataset-specific information (which may contain batch effects) is preserved
    'n_epochs': 15,               # Number of epochs for training, adjust as needed
    'domain_key': 'batch',        # Key in adata.obs for batch labels
    'verbose': True,              # Verbosity level, set to True for more detailed output
    'preload_dense': True,        # Whether to preload dense matrix, set to False for very large datasets or if using multi-worker data loading
    'seed': seed,                 # Random seed for reproducibility
    'device': device,             # Device for training, can be 'cpu', 'cuda', or 'mps'
    'save_dir': save_dir          # Directory to save the model and results
}
Be sure to specify domain_key to integrate across batches.
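Before training, you can quickly verify that the domain_key column exists and check how cells are distributed across batches:

# Optional: inspect the batch labels used for integration.
assert 'batch' in adata.obs.columns, "domain_key column missing from adata.obs"
print(adata.obs['batch'].value_counts())  # number of cells per batch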
In [6]:
output_key = 'Concord'
cur_ccd = ccd.Concord(**concord_args)
cur_ccd.fit_transform(output_key=output_key)
concord - INFO - Operating directly on the provided AnnData object. Object may be modified.
concord - INFO - Using NT-Xent loss with beta=1.0. This will apply hard-negative weighting to the contrastive loss.
concord - INFO - HCL (Contrastive learning with hard negative samples) mode is enabled.
concord - INFO - Column 'batch' is already of type: category
concord - INFO - Unused levels dropped for column 'batch'.
concord - INFO - Encoder input dim: 6000
concord - INFO - Model loaded to device: cpu
concord - INFO - Total number of parameters: 6303956
concord.model.dataloader - INFO - Using 0 DataLoader workers.
concord.model.dataloader - INFO - Filtering features with provided list (6000 features)...
concord.model.anndataset - INFO - Initialized lightweight dataset with 89701 samples.
concord.model.dataloader - INFO - Loading all data into memory for fast access. This may consume a lot of RAM. If you run out of memory, please set `preload_dense=False`.
concord - INFO - Augmentation probabilities:
concord - INFO - - Element mask probability: 0.4
concord - INFO - - Feature mask probability: 0.2
concord - INFO - Starting epoch 1/15
concord - INFO - Processing chunk 1/1 for epoch 1
concord - INFO - Number of samples in train_dataloader: 89701
Epoch 0 Training: 100%|██████████| 698/698 [00:11<00:00, 58.22it/s, loss=3.992]
concord - INFO - Epoch 0 | Train Loss: 4.25, MSE: 0.00, CLASS: 0.00, CONTRAST: 4.25, IMPORTANCE: 0.00
concord - INFO - Starting epoch 2/15
concord - INFO - Processing chunk 1/1 for epoch 2
concord - INFO - Number of samples in train_dataloader: 89701
Epoch 1 Training: 100%|██████████| 698/698 [00:11<00:00, 59.74it/s, loss=3.871]
concord - INFO - Epoch 1 | Train Loss: 3.95, MSE: 0.00, CLASS: 0.00, CONTRAST: 3.95, IMPORTANCE: 0.00
concord - INFO - Starting epoch 3/15
concord - INFO - Processing chunk 1/1 for epoch 3
concord - INFO - Number of samples in train_dataloader: 89701
Epoch 2 Training: 100%|██████████| 698/698 [00:12<00:00, 58.01it/s, loss=3.970]
concord - INFO - Epoch 2 | Train Loss: 3.87, MSE: 0.00, CLASS: 0.00, CONTRAST: 3.87, IMPORTANCE: 0.00
concord - INFO - Starting epoch 4/15
concord - INFO - Processing chunk 1/1 for epoch 4
concord - INFO - Number of samples in train_dataloader: 89701
Epoch 3 Training: 100%|██████████| 698/698 [00:12<00:00, 57.37it/s, loss=3.790]
concord - INFO - Epoch 3 | Train Loss: 3.81, MSE: 0.00, CLASS: 0.00, CONTRAST: 3.81, IMPORTANCE: 0.00
concord - INFO - Starting epoch 5/15
concord - INFO - Processing chunk 1/1 for epoch 5
concord - INFO - Number of samples in train_dataloader: 89701
Epoch 4 Training: 100%|██████████| 698/698 [00:12<00:00, 57.79it/s, loss=3.723]
concord - INFO - Epoch 4 | Train Loss: 3.78, MSE: 0.00, CLASS: 0.00, CONTRAST: 3.78, IMPORTANCE: 0.00
concord - INFO - Starting epoch 6/15
concord - INFO - Processing chunk 1/1 for epoch 6
concord - INFO - Number of samples in train_dataloader: 89701
Epoch 5 Training: 100%|██████████| 698/698 [00:11<00:00, 58.50it/s, loss=3.746]
concord - INFO - Epoch 5 | Train Loss: 3.75, MSE: 0.00, CLASS: 0.00, CONTRAST: 3.75, IMPORTANCE: 0.00
concord - INFO - Starting epoch 7/15
concord - INFO - Processing chunk 1/1 for epoch 7
concord - INFO - Number of samples in train_dataloader: 89701
Epoch 6 Training: 100%|██████████| 698/698 [00:11<00:00, 58.93it/s, loss=3.702]
concord - INFO - Epoch 6 | Train Loss: 3.73, MSE: 0.00, CLASS: 0.00, CONTRAST: 3.73, IMPORTANCE: 0.00
concord - INFO - Starting epoch 8/15
concord - INFO - Processing chunk 1/1 for epoch 8
concord - INFO - Number of samples in train_dataloader: 89701
Epoch 7 Training: 100%|██████████| 698/698 [00:12<00:00, 57.38it/s, loss=3.639]
concord - INFO - Epoch 7 | Train Loss: 3.72, MSE: 0.00, CLASS: 0.00, CONTRAST: 3.72, IMPORTANCE: 0.00
concord - INFO - Starting epoch 9/15
concord - INFO - Processing chunk 1/1 for epoch 9
concord - INFO - Number of samples in train_dataloader: 89701
Epoch 8 Training: 100%|██████████| 698/698 [00:11<00:00, 58.66it/s, loss=3.640]
concord - INFO - Epoch 8 | Train Loss: 3.70, MSE: 0.00, CLASS: 0.00, CONTRAST: 3.70, IMPORTANCE: 0.00
concord - INFO - Starting epoch 10/15
concord - INFO - Processing chunk 1/1 for epoch 10
concord - INFO - Number of samples in train_dataloader: 89701
Epoch 9 Training: 100%|██████████| 698/698 [00:11<00:00, 59.27it/s, loss=3.643]
concord - INFO - Epoch 9 | Train Loss: 3.68, MSE: 0.00, CLASS: 0.00, CONTRAST: 3.68, IMPORTANCE: 0.00
concord - INFO - Starting epoch 11/15
concord - INFO - Processing chunk 1/1 for epoch 11
concord - INFO - Number of samples in train_dataloader: 89701
Epoch 10 Training: 100%|██████████| 698/698 [00:12<00:00, 57.69it/s, loss=3.623]
concord - INFO - Epoch 10 | Train Loss: 3.67, MSE: 0.00, CLASS: 0.00, CONTRAST: 3.67, IMPORTANCE: 0.00
concord - INFO - Starting epoch 12/15
concord - INFO - Processing chunk 1/1 for epoch 12
concord - INFO - Number of samples in train_dataloader: 89701
Epoch 11 Training: 100%|██████████| 698/698 [00:12<00:00, 57.17it/s, loss=3.638]
concord - INFO - Epoch 11 | Train Loss: 3.66, MSE: 0.00, CLASS: 0.00, CONTRAST: 3.66, IMPORTANCE: 0.00
concord - INFO - Starting epoch 13/15
concord - INFO - Processing chunk 1/1 for epoch 13
concord - INFO - Number of samples in train_dataloader: 89701
Epoch 12 Training: 100%|██████████| 698/698 [00:12<00:00, 57.24it/s, loss=3.684]
concord - INFO - Epoch 12 | Train Loss: 3.65, MSE: 0.00, CLASS: 0.00, CONTRAST: 3.65, IMPORTANCE: 0.00
concord - INFO - Starting epoch 14/15
concord - INFO - Processing chunk 1/1 for epoch 14
concord - INFO - Number of samples in train_dataloader: 89701
Epoch 13 Training: 100%|██████████| 698/698 [00:12<00:00, 55.77it/s, loss=3.660]
concord - INFO - Epoch 13 | Train Loss: 3.64, MSE: 0.00, CLASS: 0.00, CONTRAST: 3.64, IMPORTANCE: 0.00
concord - INFO - Starting epoch 15/15
concord - INFO - Processing chunk 1/1 for epoch 15
concord - INFO - Number of samples in train_dataloader: 89701
Epoch 14 Training: 100%|██████████| 698/698 [00:12<00:00, 57.25it/s, loss=3.648]
concord - INFO - Epoch 14 | Train Loss: 3.63, MSE: 0.00, CLASS: 0.00, CONTRAST: 3.63, IMPORTANCE: 0.00
concord - INFO - Model saved to ../save/concord_celegans-Sep08/final_model_Sep08-1842.pt
concord - INFO - Final model saved at: ../save/concord_celegans-Sep08/final_model_Sep08-1842.pt; Configuration saved at: ../save/concord_celegans-Sep08/config_Sep08-1842.json.
concord.model.dataloader - INFO - Using 0 DataLoader workers.
concord.model.dataloader - INFO - Filtering features with provided list (6000 features)...
concord.model.anndataset - INFO - Initialized lightweight dataset with 89701 samples.
concord.model.dataloader - INFO - Loading all data into memory for fast access. This may consume a lot of RAM. If you run out of memory, please set `preload_dense=False`.
concord - INFO - Predicting for chunk 1/1
concord - INFO - Predictions added to AnnData object with base key 'Concord'.
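As the log indicates, the latent embedding is written to adata.obsm under the output key. You can confirm its shape before moving on:

# The CONCORD latent embedding is stored in adata.obsm under output_key.
print(adata.obsm[output_key].shape)  # (n_cells, latent_dim), here (89701, 300)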
Visualization
We recommend using UMAP to visualize the CONCORD latent space:
In [7]:
basis = 'Concord'
ccd.ul.run_umap(adata, source_key=basis, result_key=f'{basis}_UMAP', n_components=2, n_neighbors=30, min_dist=0.1, metric='euclidean', random_state=seed)
show_basis = basis + '_UMAP'
show_cols = ['plot.cell.type', 'raw.embryo.time', "batch"]
pal = {'plot.cell.type': 'tab20', 'raw.embryo.time': 'BlueGreenRed', "batch": 'tab20'}
ccd.pl.plot_embedding(
    adata, show_basis, show_cols, figsize=(13,5), dpi=600, ncols=3, font_size=5, point_size=2, legend_loc='on data',
    pal=pal,
    save_path=save_dir / f"{show_basis}_{file_suffix}.png"
)
concord - INFO - UMAP embedding stored in adata.obsm['Concord_UMAP']
In [8]:
ccd.ul.run_umap(adata, source_key=basis, result_key=f'{basis}_UMAP_3D', n_components=3, n_neighbors=30, min_dist=0.1, metric='euclidean', random_state=seed)
import plotly.io as pio
pio.renderers.default = 'notebook'
for col in show_cols:
    show_basis = f'{basis}_UMAP_3D'
    ccd.pl.plot_embedding_3d(
        adata, basis=show_basis, color_by=col,
        pal=pal[col],
        save_path=save_dir / f'{show_basis}_{col}_{file_suffix}.html',
        point_size=1, opacity=0.8, width=1500, height=1000
    )
concord - INFO - UMAP embedding stored in adata.obsm['Concord_UMAP_3D']
concord.plotting.pl_embedding - INFO - 3D plot saved to ../save/concord_celegans-Sep08/Concord_UMAP_3D_plot.cell.type_Sep08-1836_plot.cell.type.html
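Finally, you may want to persist the AnnData object with the CONCORD embedding and UMAP coordinates so the results can be reloaded later without retraining. A minimal sketch using the standard AnnData writer (the output file name is arbitrary):

# Save the annotated object, including the obsm embeddings, for later reuse.
adata.write_h5ad(save_dir / f"adata_concord_{file_suffix}.h5ad")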