library(knitr) knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.path = "figures/vignette-", out.width = "60%" )
library(scPagwas)
If your computer storage is enough, ignore this item
.
With the rapid development of single-cell data analysis in recent years, the number of single cells has also increased significantly. Analyzing the entire single-cell expression matrix can sometimes lead to memory overflow, making it difficult for many individuals to run scPagwas with the required amount of memory. To address this issue, we provide two solutions corresponding to different memory situations:
Solution 1: If there is enough memory to load all the single-cell data and run until the "Pathway_pcascore_run" function without encountering memory overflow, the solution involves splitting the data after this step.
Solution 2: If there is not enough memory to run the "Pathway_pcascore_run" function, the solution involves splitting the data based on the initial input of single-cell data.
Please note that both of these approaches are specific to single-cell data processing. For cell type calculations, large memory requirements are typically not necessary. If there is still insufficient memory, an alternative approach is to randomly select a subset of single cells, ensuring that each cell type is adequately represented in the subset. Cell type calculations can then be performed based on this subset, as it represents a pseudo-bulk analysis. However, this vignette does not cover cell type calculations.
Solution 1 involves following the scPagwas workflow and proceeding with the calculations until the second step:
library(scPagwas) Pagwas <- list() #Input and preprocess the single-cell data Single_data <- readRDS(system.file("extdata", "scRNAexample.rds", package = "scPagwas")) Pagwas <- scPagwas::Single_data_input( Pagwas = Pagwas, assay = "RNA", Single_data = Single_data, Pathway_list = Genes_by_pathway_kegg ) Single_data <- Single_data[, colnames(Pagwas$data_mat)] #Run svd function Pagwas <- scPagwas::Pathway_pcascore_run( Pagwas = Pagwas, Pathway_list = Genes_by_pathway_kegg ) Pagwas <- scPagwas::pa_meanexpr(Pagwas) #save this result save(Pagwas,file="./single.pagwas.RData")
In this step, the number of divisions to which the data will be split depends on the size of the computer's memory.
library(scPagwas) library(readr) library(dplyr) library(Seurat) library(tidyverse) seudata_path="./single.pagwas.RData" load(seudata_path) #In this case, the data will be divided into two partitions. pca_scCell_mat<-Pagwas$pca_scCell_mat data_mat<-Pagwas$data_mat Pagwas$pca_scCell_mat=pca_scCell_mat[,1:350] Pagwas$data_mat=data_mat[,1:350] save(Pagwas,file="1_Pagwas_singledata.RData") Pagwas$pca_scCell_mat=pca_scCell_mat[,351:700] Pagwas$data_mat=data_mat[,351:700] save(Pagwas,file="2_Pagwas_singledata.RData") Pagwas$pca_scCell_mat=pca_scCell_mat[,701:1000] Pagwas$data_mat=data_mat[,701:1000] save(Pagwas,file="3_Pagwas_singledata.RData")
gwasfile= system.file("extdata", "GWAS_summ_example.txt", package = "scPagwas") out_file= "./gwas_pagwas.RData" Pagwas<-list() gwas_data <- bigreadr::fread2(gwasfile) Pagwas <- scPagwas::GWAS_summary_input( Pagwas = Pagwas, gwas_data = gwas_data ) Pagwas$snp_gene_df <- scPagwas::SnpToGene(gwas_data = Pagwas$gwas_data, block_annotation = block_annotation, marg = 10000) save(Pagwas,file=out_file)
Given that we have divided the single-cell results into two partitions, the following code illustrates the regression process using one of the partitions as an example.
gwas_path <- "gwas_pagwas.RData" scmat_path <- "1_Pagwas_singledata.RData" scmat_path <- "2_Pagwas_singledata.RData" scmat_path <- "3_Pagwas_singledata.RData" #Import and integrate the two data results into a single list. load(gwas_path) Pagwas1<-Pagwas rm(Pagwas) load(scmat_path) Pagwas<- c(Pagwas1,Pagwas) rm(Pagwas1) gc() output.prefix='1' output.prefix='2' output.prefix='3' output.dirs='Test' if (!dir.exists(output.dirs)) { dir.create(output.dirs) } if (!dir.exists(output.dirs)) { dir.create(paste0("./", output.dirs, "/temp")) } Pagwas <- scPagwas::Pathway_annotation_input( Pagwas = Pagwas, block_annotation = block_annotation ) Pagwas <- scPagwas::Link_pathway_blocks_gwas( Pagwas = Pagwas, chrom_ld = chrom_ld, singlecell = T, celltype = F, backingpath=paste0("./", output.dirs, "/temp"), n.cores=1 ) pmat<-Pagwas$Pathway_sclm_results #save result save(pmat,file=paste0("./", output.dirs, "/",output.prefix,"_Pathway_sclm_results.RData")) #save(pmat,file=paste0("./", output.dirs, "/",output.prefix,"_Pathway_sclm_results.RData"))
Here, it is necessary to run all the split single-cell data separately, and obtain the "_Pathway_sclm_results.RData" result for each one.
load(paste0("./", output.dirs, "/1_Pathway_sclm_results.RData")) n=3 pmat_merge<-pmat for (i in 2:n) { print(i) load(paste0("./", output.dirs, "/",i,"_Pathway_sclm_results.RData")) pmat_merge<-rbind(pmat_merge,pmat) } save(pmat_merge,file="pmat_merge.RData")
After calculating the scPagwas.gPAS.score, genetic association gene analysis is performed based on this score.
In this process, 200 cells are randomly sampled from the large-scale single-cell data for computation. This process is repeated five times, and the results are integrated to obtain the final heritability correlation.
The number of cells randomly selected and the number of random iterations depend on the specific quantity of single cells available.
#Note: Here, we need to import the "single.pagwas.RData" data again. load("./single.pagwas.RData") load("pmat_merge.RData") #compute the gPas score. scPagwas.gPAS.score <- scPagwas::Merge_gPas(Pagwas,pmat_merge) #compute the heritability correlation for each gene. PCC<-scPagwas::Corr_Random(Pagwas$data_mat, scPagwas.gPAS.score, seed=1234, random=T, Nrandom=5,# you need change this parameter based on your cell numbers. Nselect=200 # you need change this parameter based on your cell numbers. ) mean_gpas<-mean(scPagwas.gPAS.score) a1<-which(scPagwas.gPAS.score >= mean_gpas) a2<-which(scPagwas.gPAS.score < mean_gpas) PCC_up <- scPagwas::Corr_Random(scPagwas.gPAS.score=scPagwas.gPAS.score[a1],data_mat=Pagwas$data_mat[,a1]) PCC_down <- scPagwas::Corr_Random(scPagwas.gPAS.score=scPagwas.gPAS.score[a2],data_mat=Pagwas$data_mat[,a2]) scPagwas_topgenes <- names(PCC[order(PCC, decreasing = T)])[1:500] scPagwas_upgenes <- names(PCC_up)[order(PCC_up, decreasing = T)[1:500]] scPagwas_downgenes <- names(PCC_down)[order(PCC_down, decreasing = F)[1:500]]
In this step, we need to utilize the initially imported single-cell data once again.
Single_data <- Seurat::AddModuleScore(Single_data, assay = 'RNA', list(scPagwas_topgenes,scPagwas_upgenes,scPagwas_downgenes), name = c("scPagwas.TRS.Score","scPagwas.upTRS.Score","scPagwas.downTRS.Score")) correct_pdf <- scPagwas::Get_CorrectBg_p(Single_data=Single_data, scPagwas.TRS.Score=Single_data$scPagwas.TRS.Score1, iters_singlecell=100, n_topgenes=500, scPagwas_topgenes=scPagwas_topgenes) Pagwas$Random_Correct_BG_pdf <- correct_pdf Pagwas$Merged_celltype_pvalue<-scPagwas::Merge_celltype_p(single_p=correct_pdf$pooled_p, celltype=Pagwas$Celltype_anno$annotation)
All the results can be found in the Pagwas result list.
In the second approach, the single-cell data was initially divided into several partitions, and then each partition was individually processed using scPagwas. When you split the scRNA-seq data in random, you should set the min_clustercells=1.
library(Seurat) Single_data <-readRDS(system.file("extdata", "scRNAexample.rds", package = "scPagwas")) #set the number of split time, it depend on the size of your single cell data. #There have two form of spliting the data: #Create the random index number. n_split=2 Split_index <- rep(1:n_split, time = ceiling(ncol(Single_data)/n_split), length = ncol(Single_data)) for (i in 1:n_split) { Example_splice <- Single_data[,Split_index==i] saveRDS(Example_splice,file = paste0("Example_splice",i,".rds")) }
Second, we run the GWAS_summary_input
and Snp2Gene
firstly, because the following function share the same gwas data.
Sometimes, we need not run these functions for small gwas data, because a little time for these functions.
Pagwas<-list() gwas_data <- bigreadr::fread2(system.file("extdata", "GWAS_summ_example.txt", package = "scPagwas")) Pagwas <- GWAS_summary_input( Pagwas = Pagwas, gwas_data = gwas_data ) Pagwas$snp_gene_df <- SnpToGene(gwas_data = Pagwas$gwas_data, block_annotation = block_annotation, marg = 10000) names(Pagwas)
for (i in 1:n_split) { scPagwas_main(Pagwas =Pagwas, gwas_data =NULL, Single_data = paste0("Example_splice",i,".rds"), #the output.prefix=i, #the prefix can be the circulation coefficient i. output.dirs="splice_scPagwas", #the output.dirs shoukd be the same for each circulation Pathway_list=Genes_by_pathway_kegg, run_split=TRUE, #You must set the key parameter.This parameter is set to run single result for each split result. min_clustercells=10, #the minimum cluster cell number should be 1. assay="RNA", block_annotation = block_annotation, chrom_ld = chrom_ld) gc() }
Read all the "_singlecell_scPagwas.gPAS.score.Result.csv" result files and integrate them together.
output.dirs="splice_scPagwas" oriDir <- paste0("./",output.dirs) files <- list.files(oriDir, pattern="*_singlecell_scPagwas.gPAS.score.Result.csv") #integrate the scPagwas.gPAS.score scPagwas.gPAS.score<-unlist(lapply( files,function(file){ gs<-read.csv(file=paste0(oriDir,"/",file)) ga <- gs$scPagwas.gPAS.score names(ga) <- gs$cellnames return(ga) })) Single_data <- Single_data[,names(scPagwas.gPAS.score)] Single_data$scPagwas.gPAS.score<-scPagwas.gPAS.score
This step is identical to the later steps in Solution 1.
data_mat <- GetAssayData(Single_data, slot = "data", assay = "RNA") PCC<-scPagwas::Corr_Random(data_mat, scPagwas.gPAS.score, seed=1234, random=T, Nrandom=5, # you need change this parameter based on your cell numbers. Nselect=200 # you need change this parameter based on your cell numbers. ) mean_gpas<-mean(scPagwas.gPAS.score) a1<-which(scPagwas.gPAS.score >= mean_gpas) a2<-which(scPagwas.gPAS.score < mean_gpas) PCC_up <- scPagwas::Corr_Random(scPagwas.gPAS.score=scPagwas.gPAS.score[a1],data_mat=data_mat[,a1]) PCC_down <- scPagwas::Corr_Random(scPagwas.gPAS.score=scPagwas.gPAS.score[a2],data_mat=data_mat[,a2]) scPagwas_topgenes <- names(PCC[order(PCC, decreasing = T)])[1:500] scPagwas_upgenes <- names(PCC_up)[order(PCC_up, decreasing = T)[1:500]] scPagwas_downgenes <- names(PCC_down)[order(PCC_down, decreasing = F)[1:500]] Single_data <- Seurat::AddModuleScore(Single_data, assay = 'RNA', list(scPagwas_topgenes,scPagwas_upgenes,scPagwas_downgenes), name = c("scPagwas.TRS.Score","scPagwas.upTRS.Score","scPagwas.downTRS.Score")) correct_pdf <- scPagwas::Get_CorrectBg_p(Single_data=Single_data, scPagwas.TRS.Score=Single_data$scPagwas.TRS.Score1, iters_singlecell=100, n_topgenes=500, scPagwas_topgenes=scPagwas_topgenes) Pagwas$scPagwas.TRS.Score = Single_data$scPagwas.TRS.Score1 Pagwas$Random_Correct_BG_pdf <- correct_pdf Pagwas$Merged_celltype_pvalue<-scPagwas::Merge_celltype_p(single_p=correct_pdf$pooled_p, celltype=Idents(Single_data))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.