我们从单样本的角度解析了scanpy的基本流程,主要是为了让初次上手的小伙伴熟悉,也从多样本的角度解析了数据整合分析(scanpy百万级别单细胞一半的转录组实测)。从这里开始,我们正式演示scanpy多样本数据整合分析,以及演示多个整合方法。
下载数据:本次演示说的数据来自GEO,GSE num=GSE250130。
!wget https://ftp.ncbi.nlm./geo/series/GSE250nnn/GSE250130/suppl/GSE250130_RAW.tar
加载数据及质控:
import numpy as npimport pandas as pdimport scanpy as scimport matplotlib.pyplot as pltimport osimport sysimport re
sc.settings.verbosity = 3 sc.logging.print_header()sc.settings.set_figure_params(dpi=80, facecolor="white")
files = os.listdir('./')pattern = re.compile(r".*_d+.*") # _ 后至少一个数字samples = [f for f in files if pattern.search(f)]samples.sort()print(samples)
adata_list=[]for sample in samples: group = sample.split('_')[0] adata = sc.read_10x_mtx("./"+sample,var_names="gene_symbols",cache=True) adata.obs['soure_id'] = sample adata.obs['group'] = group adata_list.append(adata)
#数据合并adata = sc.concat(adata_list,join='outer',label='batch', keys=samples, index_unique='_')
#计算qc指标#其实基本上每次处理数据,我们需要计算的质控指标也就那么几个,所以干脆将其整为一个函数,直接计算好了def qc(adata,min_cells,min_genes):
# 基本的QC过滤 sc.pp.filter_genes(adata, min_cells=min_cells) sc.pp.filter_cells(adata, min_genes=min_genes)
adata.var["mt"] = adata.var_names.str.startswith("MT-") adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL")) adata.var["hb"] = adata.var_names.str.contains(("^HB[^(P)]")) sc.pp.calculate_qc_metrics(adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20], log1p=True) remove = ['total_counts_mt', 'log1p_total_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb'] adata.obs = adata.obs[[x for x in adata.obs.columns if x not in remove]]
return adata
adata = qc(adata,min_cells=3,min_genes=150)
#plotfor k in ["n_genes_by_counts", "total_counts", "pct_counts_mt",'pct_counts_ribo', 'pct_counts_hb']: with rc_context({"figure.figsize": (9, 5)}): sc.pl.violin(adata,k,jitter=0.2,size=1,groupby='batch',multi_panel=True,order=['LH_3', 'LH_5', 'LH_7', 'LH_9','LH_11','RIF_1', 'RIF_2', 'RIF_3'])
#QC,指标按照自己实际数据设定adata = adata[ (adata.obs.n_genes_by_counts < 8000) & (adata.obs.n_genes_by_counts >= 150) & (adata.obs.total_counts < 50000) & (adata.obs.pct_counts_hb < 0.1) & (adata.obs.pct_counts_mt < 20), :].copy()
#plot violin after QC#plotfor k in ["n_genes_by_counts", "total_counts", "pct_counts_mt"]: with rc_context({"figure.figsize": (8, 5)}): sc.pl.violin(adata,k,stripplot=False,groupby='batch',multi_panel=True,order=['LH_3', 'LH_5', 'LH_7', 'LH_9','LH_11','RIF_1', 'RIF_2', 'RIF_3'])
Normalization & Feature selection:
adata.layers["counts"] = adata.X.copy()
# Normalizing to median total countssc.pp.normalize_total(adata, target_sum=1e4)# Logarithmize the data,进一步为稳定数据方差,log1p即log(1 + x)sc.pp.log1p(adata)
#cell cycle score#s_genes,g2m_geness_genes = ["MCM5","PCNA","TYMS","FEN1","MCM2","MCM4","RRM1","UNG","GINS2","MCM6","CDCA7","DTL","PRIM1",'UHRF1', "MLF1IP","HELLS","RFC2","RPA2","NASP","RAD51AP1","GMNN","WDR76","SLBP","CCNE2","UBR7","POLD3","MSH2", "ATAD2","RAD51","RRM2","CDC45","CDC6","EXO1","TIPIN","DSCC1","BLM","CASP8AP2","USP1","CLSPN", "POLA1","CHAF1B","BRIP1","E2F8","HMGB2"]g2m_genes = ["CDK1","NUSAP1",'UBE2C',"BIRC5","TPX2","TOP2A","NDC80","CKS2","NUF2","CKS1B","MKI67", "TMPO","CENPF","TACC3","FAM64A","SMC4","CCNB2","CKAP2L","CKAP2","AURKB","BUB1","KIF11", "ANP32E","TUBB4B","GTSE1","KIF20B","HJURP","CDCA3","HN1","CDC20","TTK","CDC25C","KIF2C", "RANGAP1","NCAPD2","DLGAP5","CDCA2","CDCA8","ECT2","KIF23","HMMR","AURKA","PSRC1", "ANLN","LBR","CKAP5","CENPE","CTCF","NEK2","G2E3","GAS2L3","CBX5","CENPA"]
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)
#plotfor k in ['S_score', 'G2M_score']: with rc_context({"figure.figsize": (8, 5)}): sc.pl.violin(adata,k,stripplot=False,groupby='batch',multi_panel=True,order=['LH_3', 'LH_5', 'LH_7', 'LH_9','LH_11','RIF_1', 'RIF_2', 'RIF_3'])
#Feature selection 使用的flavor不同,选择的layer也不同,查看帮助函数,参考单样本详细解释sc.pp.highly_variable_genes( adata, layer='counts', n_top_genes=3000, subset=False, flavor="seurat_v3", batch_key="batch")
#保存下数据,接下来我们将演示常用的数据整合方法#save dataadata.write_h5ad('./adata_Nor.h5ad')
数据整合:本节我们先展示scvi整合方法,这也是文献中使用比较多的。
scvi-tool:https://github.com/scverse/scvi-tools?tab=readme-ov-file scvi-tool是一个用于单细胞组学数据分析的综合工具,包括多个模块,数据整合去除批次只是其中之一,其他的感兴趣的可看看官网:首先安装scvi。如果没有GPU,那么scvi的运行非常的慢,我这里演示就是一个例子。数据越大,速度越慢,所以谨慎使用。
pip install scvi-tools -i https://pypi.tuna./simple
import scviscvi.settings.seed = 0print("Last run with scvi-tools version:", scvi.__version__)
#数据整合中,大多数情况我们传入这些参数即可scvi.model.SCVI.setup_anndata(adata, layer="counts", batch_key="batch",continuous_covariate_keys=['pct_counts_mt'])
#Create an SCVI model object.model = scvi.model.SCVI(adata, n_layers=2, n_latent=20, gene_likelihood="nb")
# train scVI#我这个服务器没有GPU,运行就会很慢,可以设置多线程model.train(datasplitter_kwargs={"num_workers": 2})
#获取训练结果,一个数组latent = model.get_latent_representation()
#添加到adataadata.obsm["X_scVI"] = latent
#scvi 标准化后的值重新存入 anndata 中adata.layers['scvi_normalized'] = model.get_normalized_expression(library_size=10e4)
#使用scvi结果降维# use scVI latent space for UMAP generationsc.pp.neighbors(adata,use_rep='X_scVI',n_neighbors=100)sc.tl.umap(adata,min_dist=0.6)
sc.pl.umap(adata, color=["batch","group"], title=["scvi batch umap","scvi group umap"])
sc.tl.leiden(adata, resolution=0.3)
sc.pl.umap( adata, color='leiden', frameon=False,)
参考原文marker注释细胞:
marker_gene = {'B cell': ['MS4A1', 'CD79A'], 'Mast': ['TPSAB1', 'CPA3','KIT'], 'Myeloid': ['CLEC7A', 'CD14'], 'NKT': ['CD3D', 'CD3G', 'CD247', 'NCAM1'], 'NK':['NKG7','GNLY'], 'Endo': ['CD34', 'VWF', 'PECAM1'], 'Ciliated': ['EPCAM', 'CDH1', 'FOXJ1', 'CFAP52'], 'UnCiliated': ['KRT18', 'MSX1', 'SOX17'], 'Stromal': ['DCN', 'COL1A1', 'PDGFRA']}
sc.pl.dotplot(adata, marker_gene, groupby="leiden",dendrogram=True);
rcParams['figure.figsize'] = (8,8)sc.pl.umap( adata, size=5,#点的大小 color="leiden",#plot celltype legend_loc="on data",#图上展示celltype,相当于DimPlot的label=T。 legend_fontsize=12,#字体大小 legend_fontoutline=0,#字体边框粗细,不设置边框则设置为0 frameon=False, #不展示边框 palette=['#fc8d59','#9e9ac8','#96daf7','#fed976','#b0479a','#35978f','#193a1c','#e31a1c','#f6a2a7','#f9d3d7','#08306b'],#设置颜色 )
#注释celltypecellAnno = { "0": "Stromal",# "1": "NKT",# "2": "T cell",# "3": "UnCiliated",# "4": "UnCiliated",# "5": "Myeloid",# "6": "B cell",# "7": "Mast",# "8": "Ciliated",# "9": "NKT",# "10":"NKT",# "11":"unkown", "12":"Stromal",# "13":"Endo",# "14":"unkown"}
rcParams['figure.figsize'] = (8,8)sc.pl.umap( adata, size=5,#点的大小 color="celltype",#plot celltype legend_loc="on data",#图上展示celltype,相当于DimPlot的label=T。 legend_fontsize=12,#字体大小 legend_fontoutline=1,#字体边框粗细,不设置边框则设置为0 frameon=False, #不展示边框 palette=['#fc8d59','#9e9ac8','#96daf7','#fed976','#b0479a','#35978f','#e31a1c','#f6a2a7','#f9d3d7','#08306b'],#设置颜色 )
保存数据:
adata.write_h5ad('./adata_cellAnno.h5ad')
|