KS VIP
如感兴趣/需要详情请联系作者:
1-创建环境-安装库
#最好单独创建环境,有些依赖包邮版本冲突#创建环境conda create -y -n cell2loc_env python=3.10conda activate cell2loc_env#安装库pip install cell2location -i https://pypi.tuna./simple
import scanpy as scimport numpy as npimport matplotlib.pyplot as pltimport matplotlib as mplimport cell2locationfrom matplotlib import rcParams
第一点: scRNA/snRNA参考数据集需要的是counts表达矩阵。
第二点:spatial数据可以是单个sample,也可以是多个batchs的整合数据。
第三点:spatial数据的线粒体基因不保留,需要删除。
第四点:cell2location调用GPU才会分析加速,CPU分析速度很慢很慢。
第五点:这是所有反卷积方法的共通点,参考数据集最好是sample一一对应的,效果肯定最好,如果没有只能从公共数据库来了,也是可以的!
2-load scRNA and Spatial data
scRNA参考数据集处理:如果您的数据是seurat,先将其转化为anndata。
endometrium_scRNA#AnnData object with n_obs × n_vars = 23993 × 30041# obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sample', 'percent.mt', 'RNA_snn_res.0.5', 'seurat_clusters', 'pANN', 'DF.classify', 'RNA_snn_res.0.6', 'RNA_snn_res.0.8', 'manual_cellAnno', 'barcode', 'UMAP_1', 'UMAP_2'# obsm: 'X_pca', 'X_umap'
sc.pl.umap(endometrium_scRNA,color='manual_cellAnno')#基因选择,过滤scRNA参考数据集基因:cell2location使用filter_genes函数进行基因过滤,以这种方式能够保留罕见基因的标记,同时删除了大多数无信息的基因。from cell2location.utils.filtering import filter_genesselected = filter_genes(endometrium_scRNA, #scRNA/snRNA anndata objectcell_count_cutoff=5, #在少于该数量细胞中被检测到的基因将被排除。cell_percentage_cutoff2=0.05, #在至少该比例细胞中被检测到的基因将被直接保留。nonz_mean_cutoff=1.15)#对于检测到的细胞数介于上述两个阈值之间的基因,只有当其在非零表达细胞中的平均表达量高于该阈值时,才会被保留。# filter the objectendometrium_scRNA = endometrium_scRNA[:, selected].copy()
load Visium data及数据处理
endometrium_visium = sc.read_h5ad('./adata_spatial_endometrium.h5ad')endometrium_visium
AnnData object with n_obs × n_vars = 2016 × 15696obs: 'in_tissue', 'array_row', 'array_col', 'pxl_row_in_fullres', 'pxl_col_in_fullres', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'n_counts', 'clusters'var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'uns: 'clusters', 'clusters_colors', 'dendrogram_clusters', 'hvg', 'log1p', 'neighbors', 'pca', 'rank_genes_groups', 'spatial', 'umap'obsm: 'X_pca', 'X_umap', 'spatial'varm: 'PCs'obsp: 'connectivities', 'distances'
#查看数据from matplotlib.colors import ListedColormapsq.pl.spatial_scatter(endometrium_visium,color="clusters",size=1.5,palette=ListedColormap(["#11A579", "#F2B701", "#66C5CC", "#80BA5A", "#F6CF71", "#7F3C8D"]))
#识别线粒体基因endometrium_visium.var['MT_gene'] = [gene.startswith('MT-') for gene in endometrium_visium.var_names]
endometrium_visium.obsm['MT'] = endometrium_visium[:, endometrium_visium.var['MT_gene'].values].X.toarray()endometrium_visium = endometrium_visium[:, ~endometrium_visium.var['MT_gene'].values]
3-分析【step1】:Estimation of reference【scRNA】 cell type signatures
cell2location.models.RegressionModel.setup_anndata(adata=endometrium_scRNA,#参考数据集batch_key='orig.ident',#adata.obs中批次/sample列。labels_key='manual_cellAnno',# celltype注释列categorical_covariate_keys=None#矫正技术因素(例如platform, 3' vs 5', donor effect,我这里的演示数据中不涉及这些))
# create the regression modelfrom cell2location.models import RegressionModelmod = RegressionModel(endometrium_scRNA)
# view anndata_setup as a sanity checkmod.view_anndata_setup()
Anndata setup with scvi-tools version 1.3.3.Setup via `RegressionModel.setup_anndata` with arguments:{│ 'layer': None,│ 'batch_key': 'orig.ident',│ 'labels_key': 'manual_cellAnno',│ 'categorical_covariate_keys': None,│ 'continuous_covariate_keys': None}Summary Statistics┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓┃ Summary Stat Key ┃ Value ┃┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩│ n_batch │ 5 ││ n_cells │ 23993 ││ n_extra_categorical_covs │ 0 ││ n_extra_continuous_covs │ 0 ││ n_labels │ 10 ││ n_vars │ 16620 │└──────────────────────────┴───────┘Data Registry┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓┃ Registry Key ┃ scvi-tools Location ┃┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩│ X │ adata.X ││ batch │ adata.obs['_scvi_batch'] ││ ind_x │ adata.obs['_indices'] ││ labels │ adata.obs['_scvi_labels'] │└──────────────┴───────────────────────────┘batch State Registry┏━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓┃ Source Location ┃ Categories ┃ scvi-tools Encoding ┃┡━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩│ adata.obs['orig.ident'] │ LH3 │ 0 ││ │ LH5 │ 1 ││ │ LH7 │ 2 ││ │ LH9 │ 3 ││ │ LH11 │ 4 │└─────────────────────────┴────────────┴─────────────────────┘labels State Registry┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓┃ Source Location ┃ Categories ┃ scvi-tools Encoding ┃┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩│ adata.obs['manual_cellAnno'] │ Bcell │ 0 ││ │ Ciliated │ 1 ││ │ Mast │ 2 ││ │ Myeloid │ 3 ││ │ NK │ 4 ││ │ Neutrophil │ 5 ││ │ Stromal │ 6 ││ │ Tcell │ 7 ││ │ endothelial │ 8 ││ │ unCiliated │ 9 │└──────────────────────────────┴─────────────┴─────────────────────┘
mod.train(max_epochs=250)mod.plot_history(20)save model,refernce训练模型保存,后续其他同组织空转sample就可以直接使用了,不需要每次训练费时间。
endometrium_scRNA = mod.export_posterior(endometrium_scRNA,sample_kwargs={'num_samples': 1000,'batch_size': 2500})
# Save modelmod.save('./reference_signatures/', overwrite=True)# Save anndata object with resultsendometrium_scRNA.write('./endometrium_sc_cell2location.h5ad')
可视化模型质控图,通过重建准确性来评估推断过程中是否存在问题。这个二维直方图应当表现为:大多数观测点沿着一条带有噪声的对角线分布。否则特征估计可能存在问题。
mod.plot_QC()提取reference scRNA celltype的表达特征用于后续mapping。现在的数据储存在anndata,后续反卷积只需要每种celltype的特征。
# export estimated expression in each clusterif 'means_per_cluster_mu_fg' in endometrium_scRNA.varm.keys():inf_aver = endometrium_scRNA.varm['means_per_cluster_mu_fg'][[f'means_per_cluster_mu_fg_{i}'for i in endometrium_scRNA.uns['mod']['factor_names']]].copy()else:inf_aver = endometrium_scRNA.var[[f'means_per_cluster_mu_fg_{i}'for i in endometrium_scRNA.uns['mod']['factor_names']]].copy()inf_aver.columns = endometrium_scRNA.uns['mod']['factor_names']inf_aver.iloc[0:5, 0:5]
4-分析【step2】:spatial mapping
找到reference cell type signatures和空间数据的共同基因,并对两者进行过滤,只保留共享基因。
# find shared genes and subset both anndata and reference signaturesintersect = np.intersect1d(endometrium_visium.var_names, inf_aver.index)endometrium_visium = endometrium_visium[:, intersect].copy()inf_aver = inf_aver.loc[intersect, :].copy()# prepare anndata for cell2location modelcell2location.models.Cell2location.setup_anndata(adata=endometrium_visium, batch_key=None)#我这里的空转是一个sample,不涉及batch
接下来才是cell2location正式的反卷积过程,它的前期处理步骤很多,主要的目的是给予用户更多的调整和选择,而不是设定固定参数,可能对部分数据效果一般。
初始化cell2location 空间解卷积模型,这里面有两个重要的参数,需要自己指定,影响最终的结果。
一个是N_cells_per_location,这个参数的意思是期望的每个空间位置的细胞数,对于visium即每个spots中包含的cell number,这个数值需要通过配对的组织学图像进行估计。。
另一个参数是detection_alpha,表示每个spot RNA检测灵敏度的正则化强度,如果在观察每个空间位置的总RNA计数分布与基于组织学检查所预期的细胞数量分布不一致,那么说明样本中很可能存在较高的RNA检测灵敏度技术变异,如果当一个切片 / 批次内 RNA 检测灵敏度存在较大技术变异时,为了提高模型的准确性和灵敏度,需要放松对每个空间位置归一化项的正则化,也就是detection_alpha数值设置低一点,反之则设置高一点。
# create and train the modelmod = cell2location.models.Cell2location(endometrium_visium, #空转adatacell_state_df=inf_aver,#reference celltype signaturesN_cells_per_location=25,detection_alpha=20)mod.view_anndata_setup()
Anndata setup with scvi-tools version 1.3.3.Setup via `Cell2location.setup_anndata` with arguments:{│ 'layer': None,│ 'batch_key': None,│ 'labels_key': None,│ 'categorical_covariate_keys': None,│ 'continuous_covariate_keys': None}Summary Statistics┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓┃ Summary Stat Key ┃ Value ┃┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩│ n_batch │ 1 ││ n_cells │ 2016 ││ n_extra_categorical_covs │ 0 ││ n_extra_continuous_covs │ 0 ││ n_labels │ 1 ││ n_vars │ 13057 │└──────────────────────────┴───────┘Data Registry┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓┃ Registry Key ┃ scvi-tools Location ┃┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩│ X │ adata.X ││ batch │ adata.obs['_scvi_batch'] ││ ind_x │ adata.obs['_indices'] ││ labels │ adata.obs['_scvi_labels'] │└──────────────┴───────────────────────────┘batch State Registry┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓┃ Source Location ┃ Categories ┃ scvi-tools Encoding ┃┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩│ adata.obs['_scvi_batch'] │ 0 │ 0 │└──────────────────────────┴────────────┴─────────────────────┘labels State Registry┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓┃ Source Location ┃ Categories ┃ scvi-tools Encoding ┃┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩│ adata.obs['_scvi_labels'] │ 0 │ 0 │└───────────────────────────┴────────────┴─────────────────────┘
Training cell2location:推断每个空间spot的celltype丰度(cell abundance)
#哭死,没有GPU,这里又是非常耗费时间的一个步骤#默认max_epochs都是30000,我作为演示,仅仅看看CPU状态下需要多长时间,好家伙,20000次得一天一夜。mod.train(max_epochs=20000,#最大训练轮数,数值越大:模型收敛(loss稳定)的可能性越高但同时训练时间越长。# train using full data (batch_size=None)batch_size=None,# use all data points in training because# we need to estimate cell abundance at all locationstrain_size=1,)
#同样的,好的训练模型在训练后期趋于平稳,我之前偷懒只测试了3000次,还花费了4好,#发现结果不好,max_epochs设置太少了,曲线还在下降;所以强烈建议使用GPU!!!# plot ELBO loss history during training, removing first 100 epochs from the plotmod.plot_history(1000)plt.legend(labels=['full data training']);
#提取训练的mapping结果至annda.endometrium_visium = mod.export_posterior(endometrium_visium, sample_kwargs={'num_samples': 1000, 'batch_size': mod.adata.n_obs})# Save model-mapping model,注意与前面参考scRNA model区分mod.save('./cell2loc_mapping_visium/', overwrite=True)# Save anndata object with results,空转adata保存endometrium_visium.write('./endometrium_visium_cell2location_mapping.h5ad')
可视化【step3】:结果可视化及下游分析
在空间坐标中可视化细胞丰度
#提取cell abundance,并添加到adata.obs。endometrium_visium.obs[endometrium_visium.uns['mod']['factor_names']] = endometrium_visium.obsm['q05_cell_abundance_w_sf']# plot in spatial coordinateswith mpl.rc_context({'axes.facecolor': 'black','figure.figsize': [4.5, 5]}):sc.pl.spatial(endometrium_visium,cmap='magma',color=endometrium_scRNA.obs['manual_cellAnno'].unique(),ncols=4, size=1.3,img_key='hires',# limit color scale at 99.2% quantile of cell abundancevmin=0, vmax='p99.2')
#在空间中可视化多种celltypefrom cell2location.plt import plot_spatial# 选择需要展示的celltypeclust_labels = ['Tcell', 'Bcell', 'Myeloid','NK']clust_col = ['' + str(i) for i in clust_labels] # in case column names differ from labelswith mpl.rc_context({'figure.figsize': (10, 10)}):fig = plot_spatial(adata=endometrium_visium,# labels to show on a plotcolor=clust_col,labels=clust_labels,show_img=True,# 'fast' (white background) or 'dark_background'style='fast',# limit color scale at 99.2% quantile of cell abundancemax_color_quantile=0.992,# size of locations (adjust depending on figure size)circle_diameter=6,colorbar_position='right')
合作联系