Trajectory simulationĀ¶
InĀ [1]:
Copied!
%load_ext autoreload
%autoreload 2
%load_ext autoreload
%autoreload 2
InĀ [Ā ]:
Copied!
import numpy as np
import scanpy as sc
import time
from pathlib import Path
import torch
import concord as ccd
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import matplotlib as mpl
from matplotlib import font_manager, rcParams
custom_rc = {
'font.family': 'Arial', # Set the desired font for this plot
}
mpl.rcParams['svg.fonttype'] = 'none'
mpl.rcParams['pdf.fonttype'] = 42
import numpy as np
import scanpy as sc
import time
from pathlib import Path
import torch
import concord as ccd
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import matplotlib as mpl
from matplotlib import font_manager, rcParams
custom_rc = {
'font.family': 'Arial', # Set the desired font for this plot
}
mpl.rcParams['svg.fonttype'] = 'none'
mpl.rcParams['pdf.fonttype'] = 42
InĀ [3]:
Copied!
proj_name = "simulation_trajectory"
save_dir = f"../save/dev_{proj_name}-{time.strftime('%b%d')}/"
save_dir = Path(save_dir)
save_dir.mkdir(parents=True, exist_ok=True)
data_dir = f"../data/{proj_name}/"
data_dir = Path(data_dir)
data_dir.mkdir(parents=True, exist_ok=True)
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
print(device)
seed = 0
ccd.ul.set_seed(seed)
file_suffix = f"{time.strftime('%b%d-%H%M')}"
file_suffix
proj_name = "simulation_trajectory"
save_dir = f"../save/dev_{proj_name}-{time.strftime('%b%d')}/"
save_dir = Path(save_dir)
save_dir.mkdir(parents=True, exist_ok=True)
data_dir = f"../data/{proj_name}/"
data_dir = Path(data_dir)
data_dir.mkdir(parents=True, exist_ok=True)
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
print(device)
seed = 0
ccd.ul.set_seed(seed)
file_suffix = f"{time.strftime('%b%d-%H%M')}"
file_suffix
cpu
Out[3]:
'Feb16-1614'
SimulationĀ¶
InĀ [4]:
Copied!
state_key = 'time'
batch_key = 'batch'
state_type = 'trajectory'
batch_type = 'batch_specific_features'
distribution = 'normal'
leiden_key = 'leiden_no_noise'
state_key = 'time'
batch_key = 'batch'
state_type = 'trajectory'
batch_type = 'batch_specific_features'
distribution = 'normal'
leiden_key = 'leiden_no_noise'
InĀ [46]:
Copied!
from Concord.utils.simulation import Simulation
# Create an instance of the Simulation class
sim = Simulation(n_cells=1000, n_genes=100, n_batches=2, n_states=3,
state_type=state_type,
state_distribution = distribution,
state_level=10,
state_min_level=0,
state_dispersion=2.0,
program_structure='linear_bidirectional',
program_on_time_fraction=0.1,
trajectory_program_num=5,
trajectory_cell_block_size_ratio=0.6,
trajectory_loop_to=None,
batch_distribution = distribution,
batch_type=batch_type,
batch_level=[10,10],
batch_dispersion=[2.0, 2.0],
non_neg=True, to_int=True,
seed=42)
# Generate the simulated data
adata, adata_state = sim.simulate_data()
from Concord.utils.simulation import Simulation
# Create an instance of the Simulation class
sim = Simulation(n_cells=1000, n_genes=100, n_batches=2, n_states=3,
state_type=state_type,
state_distribution = distribution,
state_level=10,
state_min_level=0,
state_dispersion=2.0,
program_structure='linear_bidirectional',
program_on_time_fraction=0.1,
trajectory_program_num=5,
trajectory_cell_block_size_ratio=0.6,
trajectory_loop_to=None,
batch_distribution = distribution,
batch_type=batch_type,
batch_level=[10,10],
batch_dispersion=[2.0, 2.0],
non_neg=True, to_int=True,
seed=42)
# Generate the simulated data
adata, adata_state = sim.simulate_data()
Concord.utils.simulation - INFO - Simulating trajectory with 3 states, distribution: normal with mean expression 10 and dispersion 2.0. Concord.utils.simulation - INFO - Simulating batch-specific features effect on batch_1 by appending a set of batch-specific genes with normal distributed value with level 10 and dispersion 2.0. Concord.utils.simulation - INFO - Simulating batch-specific features effect on batch_2 by appending a set of batch-specific genes with normal distributed value with level 10 and dispersion 2.0.
InĀ [47]:
Copied!
ccd.pl.heatmap_with_annotations(adata_state, val='no_noise', obs_keys=[state_key], yticklabels=False, cluster_cols=False, cluster_rows=False, value_annot=False, cmap='viridis', title='True state', save_path=save_dir/f'true_state_heatmap_{file_suffix}.png', figsize=(6, 4), dpi=300)
ccd.pl.heatmap_with_annotations(adata_state, val='wt_noise', obs_keys=[state_key], yticklabels=False, cluster_cols=False, cluster_rows=False, value_annot=False, cmap='viridis', title='True state with noise', save_path=save_dir/f'true_state_with_noise_heatmap_{file_suffix}.png', figsize=(6, 4), dpi=300)
ccd.pl.heatmap_with_annotations(adata, val='X', obs_keys=[state_key, batch_key], yticklabels=False, cluster_cols=False, cluster_rows=False, value_annot=False, cmap='viridis', title='Simulated data with batch signal', save_path=save_dir/f'simulated_data_heatmap_{file_suffix}.png', figsize=(6, 4), dpi=300)
ccd.pl.heatmap_with_annotations(adata_state, val='no_noise', obs_keys=[state_key], yticklabels=False, cluster_cols=False, cluster_rows=False, value_annot=False, cmap='viridis', title='True state', save_path=save_dir/f'true_state_heatmap_{file_suffix}.png', figsize=(6, 4), dpi=300)
ccd.pl.heatmap_with_annotations(adata_state, val='wt_noise', obs_keys=[state_key], yticklabels=False, cluster_cols=False, cluster_rows=False, value_annot=False, cmap='viridis', title='True state with noise', save_path=save_dir/f'true_state_with_noise_heatmap_{file_suffix}.png', figsize=(6, 4), dpi=300)
ccd.pl.heatmap_with_annotations(adata, val='X', obs_keys=[state_key, batch_key], yticklabels=False, cluster_cols=False, cluster_rows=False, value_annot=False, cmap='viridis', title='Simulated data with batch signal', save_path=save_dir/f'simulated_data_heatmap_{file_suffix}.png', figsize=(6, 4), dpi=300)
Out[47]:
<seaborn.matrix.ClusterGrid at 0x7f094c765ca0>
VisualizationĀ¶
No batch effect, no noiseĀ¶
InĀ [48]:
Copied!
ccd.ul.run_pca(adata_state, source_key='no_noise', result_key='PCA_no_noise', n_pc=30, random_state=seed)
ccd.ul.run_umap(adata_state, source_key='PCA_no_noise', result_key='UMAP_no_noise', random_state=seed)
ccd.ul.run_pca(adata_state, source_key='no_noise', result_key='PCA_no_noise', n_pc=30, random_state=seed)
ccd.ul.run_umap(adata_state, source_key='PCA_no_noise', result_key='UMAP_no_noise', random_state=seed)
Concord - INFO - PCA performed on source data with 30 components Concord - INFO - PCA embedding stored in adata.obsm['PCA_no_noise'] Concord - INFO - UMAP embedding stored in adata.obsm['UMAP_no_noise']
InĀ [49]:
Copied!
sc.pp.neighbors(adata_state, use_rep='PCA_no_noise', n_neighbors=30, random_state=seed)
sc.tl.leiden(adata_state, resolution=1.0, key_added=leiden_key, random_state=seed)
adata.obs[leiden_key] = adata_state.obs[leiden_key]
sc.pp.neighbors(adata_state, use_rep='PCA_no_noise', n_neighbors=30, random_state=seed)
sc.tl.leiden(adata_state, resolution=1.0, key_added=leiden_key, random_state=seed)
adata.obs[leiden_key] = adata_state.obs[leiden_key]
InĀ [26]:
Copied!
show_basis = 'PCA_no_noise'
show_cols = [state_key, leiden_key, batch_key]
ccd.pl.plot_embedding(
adata_state, show_basis, show_cols, figsize=(7.5,2.5), dpi=300, ncols=3, font_size=5, point_size=20, legend_loc='on data', rasterized=False,
save_path=save_dir / f"nobatch_{show_basis}_{file_suffix}.pdf"
)
show_basis = 'PCA_no_noise'
show_cols = [state_key, leiden_key, batch_key]
ccd.pl.plot_embedding(
adata_state, show_basis, show_cols, figsize=(7.5,2.5), dpi=300, ncols=3, font_size=5, point_size=20, legend_loc='on data', rasterized=False,
save_path=save_dir / f"nobatch_{show_basis}_{file_suffix}.pdf"
)
InĀ [51]:
Copied!
show_basis = 'UMAP_no_noise'
show_cols = [state_key, leiden_key]
ccd.pl.plot_embedding(
adata_state, show_basis, show_cols, figsize=(8,3), dpi=300, ncols=3, font_size=5, point_size=20, legend_loc='on data',
save_path=save_dir / f"nobatch_{show_basis}_{file_suffix}.png"
)
show_basis = 'UMAP_no_noise'
show_cols = [state_key, leiden_key]
ccd.pl.plot_embedding(
adata_state, show_basis, show_cols, figsize=(8,3), dpi=300, ncols=3, font_size=5, point_size=20, legend_loc='on data',
save_path=save_dir / f"nobatch_{show_basis}_{file_suffix}.png"
)
No batch effect, noise added, PCA and UMAPĀ¶
InĀ [52]:
Copied!
adata_state.X = adata_state.layers['wt_noise'].copy()
ccd.ul.run_pca(adata_state, source_key='wt_noise', result_key='PCA_wt_noise', n_pc=30, random_state=seed)
ccd.ul.run_umap(adata_state, source_key='PCA_wt_noise', result_key='UMAP_wt_noise', random_state=seed)
adata_state.X = adata_state.layers['wt_noise'].copy()
ccd.ul.run_pca(adata_state, source_key='wt_noise', result_key='PCA_wt_noise', n_pc=30, random_state=seed)
ccd.ul.run_umap(adata_state, source_key='PCA_wt_noise', result_key='UMAP_wt_noise', random_state=seed)
Concord - INFO - PCA performed on source data with 30 components Concord - INFO - PCA embedding stored in adata.obsm['PCA_wt_noise'] Concord - INFO - UMAP embedding stored in adata.obsm['UMAP_wt_noise']
InĀ [25]:
Copied!
show_basis = 'PCA_wt_noise'
show_cols = [state_key, leiden_key, batch_key]
ccd.pl.plot_embedding(
adata_state, show_basis, show_cols, figsize=(7.5,2.5), dpi=300, ncols=3, font_size=5, point_size=20, legend_loc='on data', rasterized=False,
save_path=save_dir / f"nobatch_{show_basis}_{file_suffix}.pdf"
)
show_basis = 'PCA_wt_noise'
show_cols = [state_key, leiden_key, batch_key]
ccd.pl.plot_embedding(
adata_state, show_basis, show_cols, figsize=(7.5,2.5), dpi=300, ncols=3, font_size=5, point_size=20, legend_loc='on data', rasterized=False,
save_path=save_dir / f"nobatch_{show_basis}_{file_suffix}.pdf"
)
InĀ [54]:
Copied!
show_basis = 'UMAP_wt_noise'
show_cols = [state_key, leiden_key]
ccd.pl.plot_embedding(
adata_state, show_basis, show_cols, figsize=(8,3), dpi=300, ncols=3, font_size=5, point_size=20, legend_loc='on data',
save_path=save_dir / f"nobatch_{show_basis}_{file_suffix}.png"
)
show_basis = 'UMAP_wt_noise'
show_cols = [state_key, leiden_key]
ccd.pl.plot_embedding(
adata_state, show_basis, show_cols, figsize=(8,3), dpi=300, ncols=3, font_size=5, point_size=20, legend_loc='on data',
save_path=save_dir / f"nobatch_{show_basis}_{file_suffix}.png"
)
No batch correction, PCA and UMAPĀ¶
InĀ [55]:
Copied!
n_pcs = min(adata.n_obs, adata.n_vars)-1
sc.pp.pca(adata, n_comps=n_pcs)
sc.pp.neighbors(adata, n_neighbors=30, n_pcs=n_pcs)
sc.tl.umap(adata, min_dist=0.5)
adata.obsm["Unintegrated"] = adata.obsm["X_pca"]
n_pcs = min(adata.n_obs, adata.n_vars)-1
sc.pp.pca(adata, n_comps=n_pcs)
sc.pp.neighbors(adata, n_neighbors=30, n_pcs=n_pcs)
sc.tl.umap(adata, min_dist=0.5)
adata.obsm["Unintegrated"] = adata.obsm["X_pca"]
InĀ [56]:
Copied!
show_basis = 'X_pca'
show_cols = [state_key, batch_key, leiden_key]
ccd.pl.plot_embedding(
adata, show_basis, show_cols, figsize=(8,3), dpi=300, ncols=3, font_size=5, point_size=20, legend_loc='on data',
save_path=save_dir / f"wtbatch_{show_basis}_{file_suffix}.png"
)
show_basis = 'X_pca'
show_cols = [state_key, batch_key, leiden_key]
ccd.pl.plot_embedding(
adata, show_basis, show_cols, figsize=(8,3), dpi=300, ncols=3, font_size=5, point_size=20, legend_loc='on data',
save_path=save_dir / f"wtbatch_{show_basis}_{file_suffix}.png"
)
InĀ [57]:
Copied!
show_basis = 'X_umap'
ccd.pl.plot_embedding(
adata, show_basis, show_cols, figsize=(8,3), dpi=300, ncols=3, font_size=5, point_size=20, legend_loc='on data',
save_path=save_dir / f"wtbatch_{show_basis}_{file_suffix}.png"
)
show_basis = 'X_umap'
ccd.pl.plot_embedding(
adata, show_basis, show_cols, figsize=(8,3), dpi=300, ncols=3, font_size=5, point_size=20, legend_loc='on data',
save_path=save_dir / f"wtbatch_{show_basis}_{file_suffix}.png"
)
ConcordĀ¶
InĀ [100]:
Copied!
cur_ccd = ccd.Concord(adata=adata,
input_feature=None,
batch_size=64,
n_epochs=10,
domain_key=batch_key, # key indicating batch
seed=seed, # random seed
verbose=False, # print training progress
device=device, # device to run on
save_dir=save_dir # directory to save model checkpoints
)
# Encode data, saving the latent embedding in adata.obsm['Concord']
output_key = 'Concord'
cur_ccd.encode_adata(input_layer_key='X', output_key=output_key, preprocess=False)
# Save the latent embedding to a file, so that it can be loaded later
ccd.ul.save_obsm_to_hdf5(cur_ccd.adata, save_dir / f"obsm_{file_suffix}.h5")
cur_ccd = ccd.Concord(adata=adata,
input_feature=None,
batch_size=64,
n_epochs=10,
domain_key=batch_key, # key indicating batch
seed=seed, # random seed
verbose=False, # print training progress
device=device, # device to run on
save_dir=save_dir # directory to save model checkpoints
)
# Encode data, saving the latent embedding in adata.obsm['Concord']
output_key = 'Concord'
cur_ccd.encode_adata(input_layer_key='X', output_key=output_key, preprocess=False)
# Save the latent embedding to a file, so that it can be loaded later
ccd.ul.save_obsm_to_hdf5(cur_ccd.adata, save_dir / f"obsm_{file_suffix}.h5")
Concord - WARNING - No input feature list provided. It is recommended to first select features using the command `concord.ul.select_features()`.
WARNING clustering 1000 points to 31 centroids: please provide at least 1209 training points
p_intra_knn: 0.3
Epoch 0 Training: 14it [00:00, 64.20it/s, loss=4.07] Epoch 1 Training: 100%|āāāāāāāāāā| 14/14 [00:00<00:00, 70.18it/s, loss=3.96] Epoch 2 Training: 100%|āāāāāāāāāā| 14/14 [00:00<00:00, 74.94it/s, loss=3.89] Epoch 3 Training: 100%|āāāāāāāāāā| 14/14 [00:00<00:00, 80.20it/s, loss=3.78] Epoch 4 Training: 100%|āāāāāāāāāā| 14/14 [00:00<00:00, 79.21it/s, loss=3.78] Epoch 5 Training: 100%|āāāāāāāāāā| 14/14 [00:00<00:00, 71.98it/s, loss=3.75] Epoch 6 Training: 100%|āāāāāāāāāā| 14/14 [00:00<00:00, 68.10it/s, loss=3.8] Epoch 7 Training: 100%|āāāāāāāāāā| 14/14 [00:00<00:00, 42.68it/s, loss=3.77] Epoch 8 Training: 100%|āāāāāāāāāā| 14/14 [00:00<00:00, 42.76it/s, loss=3.82] Epoch 9 Training: 100%|āāāāāāāāāā| 14/14 [00:00<00:00, 36.47it/s, loss=3.78]
InĀ [101]:
Copied!
ccd.pl.heatmap_with_annotations(adata, val='Concord', obs_keys=[state_key, batch_key],
cluster_cols=True, cluster_rows=True, cmap='viridis',
pal = {'cluster': 'Set1', 'batch':'Set2'}, add_color_legend=True,
save_path=save_dir/f'Concord_latent_heatmap_{file_suffix}.png')
ccd.pl.heatmap_with_annotations(adata, val='Concord', obs_keys=[state_key, batch_key],
cluster_cols=True, cluster_rows=True, cmap='viridis',
pal = {'cluster': 'Set1', 'batch':'Set2'}, add_color_legend=True,
save_path=save_dir/f'Concord_latent_heatmap_{file_suffix}.png')
Out[101]:
<seaborn.matrix.ClusterGrid at 0x7f081402c070>
InĀ [Ā ]:
Copied!
sc.pp.neighbors(adata, n_neighbors=30, use_rep = 'Concord')
ccd.ul.run_umap(adata, source_key='Concord', result_key='Concord_UMAP_2D', n_components=2, n_neighbors=30, min_dist=0.5, metric='euclidean', random_state=seed)
show_cols = [state_key, batch_key, leiden_key]
show_basis = 'Concord_UMAP_2D'
ccd.pl.plot_embedding(
adata, show_basis, show_cols, figsize=(8,3), dpi=300, ncols=3, font_size=5, point_size=10, legend_loc='on data',
#pal = {'cluster':'Set1', 'batch':'Set2', 'leiden':'tab20'},
save_path=save_dir / f"{show_basis}_{file_suffix}.png"
)
sc.pp.neighbors(adata, n_neighbors=30, use_rep = 'Concord')
ccd.ul.run_umap(adata, source_key='Concord', result_key='Concord_UMAP_2D', n_components=2, n_neighbors=30, min_dist=0.5, metric='euclidean', random_state=seed)
show_cols = [state_key, batch_key, leiden_key]
show_basis = 'Concord_UMAP_2D'
ccd.pl.plot_embedding(
adata, show_basis, show_cols, figsize=(8,3), dpi=300, ncols=3, font_size=5, point_size=10, legend_loc='on data',
#pal = {'cluster':'Set1', 'batch':'Set2', 'leiden':'tab20'},
save_path=save_dir / f"{show_basis}_{file_suffix}.png"
)
InĀ [105]:
Copied!
adata.write_h5ad(data_dir / f"adata_{file_suffix}.h5ad")
adata_state.write_h5ad(data_dir / f"adata_state_{file_suffix}.h5ad")
adata.write_h5ad(data_dir / f"adata_{file_suffix}.h5ad")
adata_state.write_h5ad(data_dir / f"adata_state_{file_suffix}.h5ad")