精彩内容
如感兴趣/需要详情请联系作者:
1-read scRNA daat
import osimport numpy as npimport pandas as pdimport matplotlib.pyplot as pltimport gseapy as gpimport scanpy as sc
#load scRNA dataadata=sc.read_h5ad('./miloData.h5ad')adata
2-GSEA analysis
可以按照一般的步骤,线进行差异基因分析,然后根据logFC进行排序,得到rank的gene list之后进行GSEA【prerank】分析,或者直接适用表达矩阵进行GSEA分析!
# subset data#这里分析不同celltype两组之间的差异基因,所以提取感兴趣的celltypebdata = adata[adata.obs.celltype == "Cxcr3+P"].copy()bdata
#GSEA analysisres = gp.gsea(data=bdata.to_df().T, #基因表达矩阵, row -> genes, column-> samplesgene_sets="GO_Biological_Process_2025",#基因集,human = gp.get_library_name(organism='Human')查看需要分析的基因集#或者gmt文件,或者自定义基因集gmt file or 字典cls=bdata.obs.group,#cell分组信息permutation_num=1000,min_size=15,#基因集和数据集中允许的最小基因数量,默认15max_size=500,#默认500permutation_type='phenotype',outdir=None,method='s2n', # 用于计算相关性或rank的方法。默认signal_to_noisethreads= 16)#线程数
#plot单个termsfrom gseapy import gseaplotgseaplot(term='RNA Stabilization (GO:0043489)',#需要plot的terms名称rank_metric=res.ranking, #rank文件,在分析结果中有储存ofname=None,figsize = (4,5),#设置图形尺寸,长高**res.results['RNA Stabilization (GO:0043489)'])
## Heatmap of gene expression#还可以直接展示相关Terms中的基因的Expression,但这个效果不咋的genes = res.res2d.Lead_genes.iloc[3].split(";")ax = gp.heatmap(df = res.heatmat.loc[genes],z_score=None,title=res.res2d.Term.iloc[3],figsize=(6,5),cmap=plt.cm.viridis,xticklabels=False)
#同时可视化多个termsaxs = res.plot(terms=res.res2d['Term'][0:5])
3-差异基因富集分析
#差异基因分析sc.tl.rank_genes_groups(bdata,groupby='group',#分组columnuse_raw=False,method='wilcoxon',groups=["disease"],#实验组reference='healthy') #对照组
#获取差异分析结果dataframe#有了这个结果,其实还可以适用prerank进行分析GSEA,与之前一样,不再演示df = sc.get.rank_genes_groups_df(bdata, group='disease')df
# subset up or down regulated genes#筛选显著性基因degs_sig = df[(df['pvals_adj'] < 0.05) & (abs(df['logfoldchanges']) > 0.5)]degs_up = degs_sig[degs_sig.logfoldchanges > 0]degs_dw = degs_sig[degs_sig.logfoldchanges < 0]#up gene enrichmentenr_up = gp.enrichr(degs_up.names,gene_sets='GO_Biological_Process_2025',outdir=None)#down gene enrichmentenr_dw = gp.enrichr(degs_dw.names,gene_sets='GO_Biological_Process_2025',outdir=None)
from gseapy import barplot, dotplot# categorical barplotax = barplot(enr_res,column="Adjusted P-value",group='UP_DW', # set group, so you could do a multi-sample/library comparsionsize=10,top_term=10,figsize=(3,5),color = {'UP': 'salmon','DOWN':'darkblue'})
# categorical scatterplotax = dotplot(enr_res,#分析结果column="Adjusted P-value",#按照adj pvalue展示结果x='UP_DW', # set x axis, so you could do a multi-sample/library comparsionsize=10,#点大小top_term=10,#展示的top5termsfigsize=(3,6),#图长宽title = "Enrichment",#标题show_ring=False, # set to False to revmove outer ringmarker='o',#plot图形形式,参照https:///stable/api/markers_api.html)
4-GSVA分析
GSVA (Gene Set Variation Analysis) 是一种无监督富集分析方法,用于计算每个样本(或细胞)中每个基因集(通路)的活性评分(enrichment score)。这也是我们常见的分析,R中有专门的包,刚好python中GSEApy包集成了这个模块。而且原理与R版GSVA是一致的。
es = gp.gsva(data=adata.to_df().T, #express data表达矩阵gene_sets='./c2.cp.v2023.2.Hs.symbols.gmt',#基因集,输入与GSEA一样outdir=None,#结果输出目录。如果为None,则不会写入磁盘kcdf = 'Gaussian',min_size=15,#默认值,基因集和数据集中允许的最少基因数量max_size = 1000,#默认值threads = 16,#线程数)
#得到的结果是一个矩阵#得到结果后就可以按照R中那样进行一些可视化与分析gsva_df = es.res2d.pivot(index='Term', columns='Name', values='ES')gsva_df.head()
#将GSVA结果矩阵与adata结合,需要查看保证两者cellid顺序一致,关系到cellid的归属#不一致的情况下,需要按照adata的顺序进行重排gsva_df = gsva_df[adata.obs_names]gsva_df.head()
# 构建一个新的 AnnDataadata_gsva = sc.AnnData(X=gsva_df.T, # cell × pathwayobs=adata.obs.copy(), # metadatobsm=adata.obsm.copy(), #降维信息var=pd.DataFrame(index=gsva_df.index) # pathway 名称)
#差异分析,这里以Ly6g+P为例Ly6g_data = adata_gsva[adata_gsva.obs.celltype == "Ly6g+P"].copy()Ly6g_data.X = Ly6g_data.X.astype(float)sc.tl.rank_genes_groups(Ly6g_data,groupby='group',#分组columnuse_raw=False,method='wilcoxon',groups=["disease"],#实验组reference='healthy') #对照组
Ly6g_gsva_df = sc.get.rank_genes_groups_df(Ly6g_data, group='disease')Ly6g_gsva_df
#这个结果就可以适用我们之前的函数直接适用火山图展示了:#函数参照:https://mp.weixin.qq.com/s/Mm0vW86iR5eaZVVB7wpzOAks_plot_volcano(data = Ly6g_gsva_df,pval_index="pvals_adj",logFC_index="logfoldchanges",style='s1',figsize = (5, 5),label = False)
#可以标注感兴趣的显著通路ks_plot_volcano(data = Ly6g_gsva_df,pval_index="pvals_adj",logFC_index="logfoldchanges",style='s1',label = True,gene_index = 'names',lable_genes = gsva_sig['names'])
#既然是adata形式,那么还可以适用sc相关的函数进行可视化通路活性#设置全局字体大小sc.set_figure_params(scanpy=True, fontsize=10)adata_gsva.X = adata_gsva.X.astype(float)terms = gsva_sig['names']#这里随便挑选用于作图演示sc.pl.dotplot(adata_gsva, #adataterms, #需要plot的通路cmap='Blues',#密度颜色groupby="celltype", #group是adata.obs里面的celltype,也可以是其他任何分组return_fig=True).style(cmap='PRGn',dot_edge_color='black', dot_edge_lw=1,grid=True).show(True)
#这个图最好还是展示celltype以及group,所以添加一列,结合group和celltype信息adata_gsva.obs['group_celltype'] = adata.obs["group"].astype(str) + "_" + adata.obs["celltype"].astype(str)#设置全局字体大小sc.set_figure_params(scanpy=True, fontsize=10)sc.pl.dotplot(adata_gsva, #adataterms, #需要plot的通路cmap='Blues',#密度颜色groupby="group_celltype", #group是adata.obs里面的celltype,也可以是其他任何分组return_fig=True).style(cmap='PRGn',dot_edge_color='black', dot_edge_lw=1,grid=True).show(True)
#热图可视化sc.pl.matrixplot(adata_gsva,terms,"celltype",colorbar_title="mean z-score",#legend标题设置vmin=-0.5,vmax=0.5,cmap="RdBu_r",)
