Intestine development by Huycke et al.
Basic setup¶
In [1]:
Copied!
%load_ext autoreload
%autoreload 2
%load_ext autoreload
%autoreload 2
In [ ]:
Copied!
import concord as ccd
import scanpy as sc
import torch
import warnings
warnings.filterwarnings('ignore')
data_path = "../data/intestine_dev/intestine_adata.h5ad"
adata = sc.read(
data_path
)
adata.layers["counts"] = adata.X.copy()
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
ccd.ul.score_cell_cycle(adata, organism='Mm')
import concord as ccd
import scanpy as sc
import torch
import warnings
warnings.filterwarnings('ignore')
data_path = "../data/intestine_dev/intestine_adata.h5ad"
adata = sc.read(
data_path
)
adata.layers["counts"] = adata.X.copy()
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
ccd.ul.score_cell_cycle(adata, organism='Mm')
Concord - INFO - Processed 43 human genes to mouse orthologs. Concord - INFO - Processed 54 human genes to mouse orthologs.
In [3]:
Copied!
import time
from pathlib import Path
proj_name = "concord_Huycke_intestine"
save_dir = f"../save/dev_{proj_name}-{time.strftime('%b%d')}/"
save_dir = Path(save_dir)
save_dir.mkdir(parents=True, exist_ok=True)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
file_suffix = f"{proj_name}_{time.strftime('%b%d')}"
seed = 0
import time
from pathlib import Path
proj_name = "concord_Huycke_intestine"
save_dir = f"../save/dev_{proj_name}-{time.strftime('%b%d')}/"
save_dir = Path(save_dir)
save_dir.mkdir(parents=True, exist_ok=True)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
file_suffix = f"{proj_name}_{time.strftime('%b%d')}"
seed = 0
Run Concord¶
In [ ]:
Copied!
output_key = 'Concord'
feature_list = ccd.ul.select_features(adata, n_top_features=10000, flavor='seurat_v3')
cur_ccd = ccd.Concord(adata=adata, input_feature=feature_list, domain_key='LaneID',
latent_dim=32,
clr_temperature = .3, # Check out advanced usage to learn what this parameter controls
p_intra_domain=1.0, # Check out advanced usage to learn what this parameter controls
seed=seed,
inplace=False,
verbose=True,
device=device)
# Encode data, saving the latent embedding in adata.obsm['Concord']
cur_ccd.encode_adata(input_layer_key='X_log1p', output_key=output_key)
# Save the latent embedding to a filem, so that it can be loaded later
ccd.ul.save_obsm_to_hdf5(cur_ccd.adata, save_dir / f"obsm_{file_suffix}.h5")
adata.obsm = cur_ccd.adata.obsm # If not inplace
output_key = 'Concord'
feature_list = ccd.ul.select_features(adata, n_top_features=10000, flavor='seurat_v3')
cur_ccd = ccd.Concord(adata=adata, input_feature=feature_list, domain_key='LaneID',
latent_dim=32,
clr_temperature = .3, # Check out advanced usage to learn what this parameter controls
p_intra_domain=1.0, # Check out advanced usage to learn what this parameter controls
seed=seed,
inplace=False,
verbose=True,
device=device)
# Encode data, saving the latent embedding in adata.obsm['Concord']
cur_ccd.encode_adata(input_layer_key='X_log1p', output_key=output_key)
# Save the latent embedding to a filem, so that it can be loaded later
ccd.ul.save_obsm_to_hdf5(cur_ccd.adata, save_dir / f"obsm_{file_suffix}.h5")
adata.obsm = cur_ccd.adata.obsm # If not inplace
Concord - INFO - Setting sampler_knn to 1309 to be 1/50 the number of cells in the dataset. You can change this value by setting sampler_knn in the configuration. Concord - INFO - Column 'LaneID' is already of type: category Concord - INFO - Unused levels dropped for column 'LaneID'. Concord - INFO - Encoder input dim: 10000 Concord - INFO - Decoder input dim: 40 Concord - INFO - Model loaded to device: cuda:0 Concord - INFO - Total number of parameters: 2590128 Concord.model.dataloader - INFO - Preprocessing adata... Concord.utils.preprocessor - INFO - Data is already log1p transformed. Skip normalization. Concord.utils.preprocessor - INFO - Data is already log1p transformed. Storing in the specified layer. Concord.utils.preprocessor - INFO - Filtering features with provided list (10000 features)... Concord.model.anndataset - INFO - Initialized dataset with 65468 samples. Data structure: ['input', 'domain', 'idx'] Concord.model.dataloader - INFO - Using existing embedding 'X_pca' from adata.obsm Concord.model.knn - INFO - Using FAISS CPU index. Concord.model.knn - INFO - Building Faiss IVF index. nprobe=10 Concord.model.knn - INFO - Using FAISS CPU index. Concord.model.dataloader - INFO - Number of unique_domains: 6 Concord.model.dataloader - INFO - Calculating each domain's coverage of the global manifold using X_pca. Concord.model.dataloader - INFO - Converting coverage to p_intra_domain... Concord.model.dataloader - INFO - Final p_intra_domain values: L1: 1.00, L2: 1.00, R: 1.00, Live_1: 1.00, Live_2: 1.00, Rare: 1.00 Concord - INFO - Starting epoch 1/5 Concord - INFO - Processing chunk 1/1 for epoch 1 Concord - INFO - Number of samples in train_dataloader: 65468
Epoch 0 Training: 1020it [00:32, 31.28it/s, loss=3.05]
Concord - INFO - Epoch 0 | Train Loss: 3.33, MSE: 0.09, CLASS: 0.00, CONTRAST: 3.24, IMPORTANCE: 0.00 Concord - INFO - Starting epoch 2/5 Concord - INFO - Processing chunk 1/1 for epoch 2 Concord - INFO - Number of samples in train_dataloader: 65468
Epoch 1 Training: 100%|██████████| 1020/1020 [00:30<00:00, 33.47it/s, loss=3.06]
Concord - INFO - Epoch 1 | Train Loss: 2.99, MSE: 0.06, CLASS: 0.00, CONTRAST: 2.93, IMPORTANCE: 0.00 Concord - INFO - Starting epoch 3/5 Concord - INFO - Processing chunk 1/1 for epoch 3 Concord - INFO - Number of samples in train_dataloader: 65468
Epoch 2 Training: 100%|██████████| 1020/1020 [00:27<00:00, 36.61it/s, loss=2.94]
Concord - INFO - Epoch 2 | Train Loss: 2.94, MSE: 0.06, CLASS: 0.00, CONTRAST: 2.88, IMPORTANCE: 0.00 Concord - INFO - Starting epoch 4/5 Concord - INFO - Processing chunk 1/1 for epoch 4 Concord - INFO - Number of samples in train_dataloader: 65468
Epoch 3 Training: 100%|██████████| 1020/1020 [00:28<00:00, 36.01it/s, loss=3.09]
Concord - INFO - Epoch 3 | Train Loss: 2.91, MSE: 0.06, CLASS: 0.00, CONTRAST: 2.85, IMPORTANCE: 0.00 Concord - INFO - Starting epoch 5/5 Concord - INFO - Processing chunk 1/1 for epoch 5 Concord - INFO - Number of samples in train_dataloader: 65468
Epoch 4 Training: 100%|██████████| 1020/1020 [00:29<00:00, 34.81it/s, loss=3.05]
Concord - INFO - Epoch 4 | Train Loss: 2.89, MSE: 0.06, CLASS: 0.00, CONTRAST: 2.83, IMPORTANCE: 0.00
Concord - INFO - Model saved to save/final_model.pth Concord - INFO - Final model saved at: save/final_model.pth; Configuration saved at: save/config.json. Concord.model.dataloader - INFO - Preprocessing adata... Concord.utils.preprocessor - INFO - Data is already log1p transformed. Skip normalization. Concord.utils.preprocessor - INFO - Data is already log1p transformed. Storing in the specified layer. Concord.utils.preprocessor - INFO - Filtering features with provided list (10000 features)... Concord.model.anndataset - INFO - Initialized dataset with 65468 samples. Data structure: ['input', 'domain', 'idx'] Concord - INFO - Predicting for chunk 1/1
Visualize Concord latent with UMAP¶
2D UMAP¶
In [ ]:
Copied!
#ccd.ul.run_umap(adata, source_key=output_key, result_key=f'{output_key}_UMAP', n_components=2, n_neighbors=30, min_dist=0.1, metric='euclidean', random_state=seed)
show_basis = f'{output_key}_UMAP'
show_cols = ['MouseAge_combined', 'seg_classify', 'phase', 'batch', 'cell_type', "mes_subtype", "dropout_est"]
ccd.pl.plot_embedding(
adata, show_basis, show_cols, figsize=(13,12), dpi=600, ncols=3, font_size=5, point_size=1, legend_loc='on data',
save_path=save_dir / f"{show_basis}_{file_suffix}.png"
)
#ccd.ul.run_umap(adata, source_key=output_key, result_key=f'{output_key}_UMAP', n_components=2, n_neighbors=30, min_dist=0.1, metric='euclidean', random_state=seed)
show_basis = f'{output_key}_UMAP'
show_cols = ['MouseAge_combined', 'seg_classify', 'phase', 'batch', 'cell_type', "mes_subtype", "dropout_est"]
ccd.pl.plot_embedding(
adata, show_basis, show_cols, figsize=(13,12), dpi=600, ncols=3, font_size=5, point_size=1, legend_loc='on data',
save_path=save_dir / f"{show_basis}_{file_suffix}.png"
)