#' Summarize the sliding window analysis results generated by \code{STAARpipeline} package
#'
#' The \code{Sliding_Window_Results_Summary} function takes in the results of sliding window analysis,
#' the object from fitting the null model, and the set of known variants to be adjusted for in conditional analysis
#' to summarize the sliding window analysis results and analyze the conditional association between a quantitative/dichotomous phenotype
#' (including imbalanced case-control setting) and
#' the rare variants in the unconditional significant genetic region.
#' @param agds_dir file directory of annotated GDS (aGDS) files for all chromosomes (1-22).
#' @param jobs_num a data frame containing the number of jobs for association analysis.
#' The data frame must include a column with the name "sliding_window_num"
#' @param input_path file directory of the sliding window analysis results.
#' @param output_path file output directory of the summary results.
#' @param sliding_window_results_name the file name of the input sliding window analysis results.
#' @param obj_nullmodel an object from fitting the null model, which is either the output from \code{fit_nullmodel} function in the \code{STAARpipeline} package,
#' or the output from \code{fitNullModel} function in the \code{GENESIS} package and transformed using the \code{genesis2staar_nullmodel} function in the \code{STAARpipeline} package.
#' @param known_loci the data frame of variants to be adjusted for in conditional analysis and should
#' contain 4 columns in the following order: chromosome (CHR), position (POS), reference allele (REF),
#' and alternative allele (ALT) (default = NULL).
#' @param cMAC_cutoff the cutoff of the minimum number of the cumulative minor allele of variants in the masks
#' when summarizing the results (default = 0).
#' @param method_cond a character value indicating the method for conditional analysis.
#' \code{optimal} refers to regressing residuals from the null model on \code{known_loci}
#' as well as all covariates used in fitting the null model (fully adjusted) and taking the residuals;
#' \code{naive} refers to regressing residuals from the null model on \code{known_loci}
#' and taking the residuals (default = \code{optimal}).
#' @param rare_maf_cutoff the cutoff of maximum minor allele frequency in
#' defining rare variants (default = 0.01).
#' @param QC_label channel name of the QC label in the GDS/aGDS file (default = "annotation/filter").
#' @param variant_type variants include in the conditional analysis. Choices include "variant", "SNV", or "Indel" (default = "SNV").
#' @param geno_missing_imputation method of handling missing genotypes. Either "mean" or "minor" (default = "mean").
#' @param Annotation_dir channel name of the annotations in the aGDS file \cr (default = "annotation/info/FunctionalAnnotation").
#' @param Annotation_name_catalog a data frame containing the name and the corresponding channel name in the aGDS file.
#' @param Use_annotation_weights use annotations as weights or not (default = FALSE).
#' @param Annotation_name a vector of annotation names used in STAAR (default = NULL).
#' @param alpha threshod to control the genome-wise (family-wise) error rate (default = 0.05), the p-value threshold is alpha/total number of sliding windows
#' @param manhattan_plot output manhattan plot or not (default = FALSE).
#' @param QQ_plot output Q-Q plot or not (default = FALSE).
#' @param cond_null_model_name the null model name for conditional analysis in the SPA setting, only used for imbalanced case-control setting (default = NULL).
#' @param cond_null_model_dir the directory of storing the null model for conditional analysis in the SPA setting, only used for imbalanced case-control setting (default = NULL).
#' @param SPA_p_filter logical: are only the variants with a normal approximation based p-value smaller than a pre-specified threshold use the SPA method to recalculate the p-value, only used for imbalanced case-control setting (default = FALSE).
#' @param p_filter_cutoff threshold for the p-value recalculation using the SPA method, only used for imbalanced case-control setting (default = 0.05).
#' @return The function returns the following analysis results:
#' @return \code{results_sliding_window_genome.Rdata}: a matrix contains the STAAR p-values (including STAAR-O or STAAR-B in imbalanced case-control setting) of the sliding windows across the genome.
#' @return \code{sliding_window_sig.Rdata} and \code{sliding_window_sig.csv}: a matrix contains the unconditional STAAR p-values (including STAAR-O or STAAR-B in imbalanced case-control setting) of the significant sliding windows (unconditional p-value<alpha/total number of sliding windows).
#' @return \code{sliding_window_sig_cond.Rdata} and \code{sliding_window_sig_cond.csv}: a matrix contains the conditional STAAR p-values (including STAAR-O or STAAR-B in imbalanced case-control setting) of the significant sliding windows (available if known_loci is not a NULL).
#' @return manhattan plot (optional) and Q-Q plot (optional) of the sliding window analysis results.
#' @references Li, Z., Li, X., et al. (2022). A framework for detecting
#' noncoding rare-variant associations of large-scale whole-genome sequencing
#' studies. \emph{Nature Methods}, \emph{19}(12), 1599-1611.
#' (\href{https://doi.org/10.1038/s41592-022-01640-x}{pub})
#' @export
Sliding_Window_Results_Summary <- function(agds_dir,jobs_num,input_path,output_path,sliding_window_results_name,
obj_nullmodel,known_loci=NULL,cMAC_cutoff=0,
method_cond=c("optimal","naive"),rare_maf_cutoff=0.01,
QC_label="annotation/filter",variant_type=c("SNV","Indel","variant"),geno_missing_imputation=c("mean","minor"),
Annotation_dir="annotation/info/FunctionalAnnotation",Annotation_name_catalog,
Use_annotation_weights=FALSE,Annotation_name=NULL,
alpha=0.05,manhattan_plot=FALSE,QQ_plot=FALSE,
cond_null_model_name=NULL,cond_null_model_dir=NULL,
SPA_p_filter=FALSE,p_filter_cutoff=0.05){
## evaluate choices
method_cond <- match.arg(method_cond)
variant_type <- match.arg(variant_type)
geno_missing_imputation <- match.arg(geno_missing_imputation)
## SPA status
if(!is.null(obj_nullmodel$use_SPA))
{
use_SPA <- obj_nullmodel$use_SPA
}else
{
use_SPA <- FALSE
}
results_sliding_window_genome <- c()
for(chr in 1:22)
{
results_sliding_window_genome_chr <- c()
if(chr>1)
{
jobs_num_chr <- sum(jobs_num$sliding_window_num[1:(chr-1)])
}else
{
jobs_num_chr <- 0
}
for(i in 1:jobs_num$sliding_window_num[chr])
{
print(i + jobs_num_chr)
results_sliding_window <- get(load(paste0(input_path,sliding_window_results_name,"_",i+jobs_num_chr,".Rdata")))
results_sliding_window_genome_chr <- rbind(results_sliding_window_genome_chr,results_sliding_window)
}
results_sliding_window_genome <- rbind(results_sliding_window_genome,results_sliding_window_genome_chr)
}
rm(results_sliding_window_genome_chr)
gc()
###### cMAC_cutoff
results_sliding_window_genome <- results_sliding_window_genome[results_sliding_window_genome[,"cMAC"]>cMAC_cutoff,]
save(results_sliding_window_genome,file=paste0(output_path,"results_sliding_window_genome.Rdata"))
dim(results_sliding_window_genome)
### Significant Results
if(!use_SPA)
{
results_sig <- results_sliding_window_genome[results_sliding_window_genome[,colnames(results_sliding_window_genome)=="STAAR-O"]<alpha/dim(results_sliding_window_genome)[1],,drop=FALSE]
}else
{
results_sig <- results_sliding_window_genome[results_sliding_window_genome[,colnames(results_sliding_window_genome)=="STAAR-B"]<alpha/dim(results_sliding_window_genome)[1],,drop=FALSE]
}
save(results_sig,file=paste0(output_path,"sliding_window_sig.Rdata"))
write.csv(results_sig,paste0(output_path,"sliding_window_sig.csv"))
dim(results_sig)[1]
if(length(known_loci)!=0)
{
if(!use_SPA)
{
results_sig_cond <- c()
if(length(results_sig)!=0)
{
for(kk in 1:dim(results_sig)[1])
{
chr <- as.numeric(results_sig[kk,1])
start_loc <- as.numeric(results_sig[kk,2])
end_loc <- as.numeric(results_sig[kk,3])
gds.path <- agds_dir[chr]
genofile <- seqOpen(gds.path)
res_cond <- Sliding_Window_cond(chr=chr,genofile=genofile,obj_nullmodel=obj_nullmodel,
start_loc=start_loc,end_loc=end_loc,known_loci=known_loci,
method_cond=method_cond,rare_maf_cutoff=rare_maf_cutoff,
QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
Annotation_name_catalog=Annotation_name_catalog,Annotation_dir=Annotation_dir,
Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name)
results_sig_cond <- rbind(results_sig_cond,res_cond)
seqClose(genofile)
}
save(results_sig_cond,file=paste0(output_path,"sliding_window_sig_cond.Rdata"))
write.csv(results_sig_cond,paste0(output_path,"sliding_window_sig_cond.csv"))
}
}else
{
results_sig_cond <- c()
if(length(results_sig)!=0)
{
for(kk in 1:dim(results_sig)[1])
{
chr <- as.numeric(results_sig[kk,1])
start_loc <- as.numeric(results_sig[kk,2])
end_loc <- as.numeric(results_sig[kk,3])
if(file.exists(paste0(cond_null_model_dir,cond_null_model_name,".chr",chr,".Rdata")))
{
gds.path <- agds_dir[chr]
genofile <- seqOpen(gds.path)
obj_nullmodel_cond <- get(load(paste0(cond_null_model_dir,cond_null_model_name,".chr",chr,".Rdata")))
res_cond <- Sliding_Window_cond_spa(chr=chr,genofile=genofile,obj_nullmodel=obj_nullmodel_cond,
start_loc=start_loc,end_loc=end_loc,known_loci=known_loci,rare_maf_cutoff=rare_maf_cutoff,
QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
Annotation_name_catalog=Annotation_name_catalog,Annotation_dir=Annotation_dir,
Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name,
SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff)
results_sig_cond <- rbind(results_sig_cond,res_cond)
seqClose(genofile)
}else
{
res_sig_cond <- results_sig[kk,,drop=FALSE]
}
}
save(results_sig_cond,file=paste0(output_path,"sliding_window_sig_cond.Rdata"))
write.csv(results_sig_cond,paste0(output_path,"sliding_window_sig_cond.csv"))
}
}
}
## manhattan plot
if(manhattan_plot)
{
print("Manhattan plot")
png(paste0(output_path,"sliding_window_manhattan.png"), width=9, height=6, units = 'in', res = 600)
if(!use_SPA)
{
print(manhattan_plot(as.numeric(results_sliding_window_genome[,1]), (as.numeric(results_sliding_window_genome[,2])+as.numeric(results_sliding_window_genome[,3]))/2, as.numeric(results_sliding_window_genome[,colnames(results_sliding_window_genome)=="STAAR-O"]), col = c("blue4", "orange3"),sig.level=0.05/dim(results_sliding_window_genome)[1]))
}else
{
print(manhattan_plot(as.numeric(results_sliding_window_genome[,1]), (as.numeric(results_sliding_window_genome[,2])+as.numeric(results_sliding_window_genome[,3]))/2, as.numeric(results_sliding_window_genome[,colnames(results_sliding_window_genome)=="STAAR-B"]), col = c("blue4", "orange3"),sig.level=0.05/dim(results_sliding_window_genome)[1]))
}
dev.off()
}
## Q-Q plot
if(QQ_plot)
{
print("Q-Q plot")
if(!use_SPA)
{
## STAAR-O
observed <- sort(as.numeric(results_sliding_window_genome[,colnames(results_sliding_window_genome)=="STAAR-O"]))
lobs <- -(log10(observed))
expected <- c(1:length(observed))
lexp <- -(log10(expected / (length(expected)+1)))
png(paste0(output_path,"sliding_window_qq_staar_o.png"), width=8, height=8, units = 'in', res = 600)
par(mar=c(5,6,4,4))
plot(lexp,lobs,pch=20, cex=1, xlim = c(0, max(lexp)), ylim = c(0, max(lobs)),
xlab = expression(Expected ~ ~-log[10](italic(p))), ylab = expression(Observed ~ ~-log[10](italic(p))),
font.lab=2,cex.lab=2,cex.axis=2,font.axis=2)
abline(0, 1, col="red",lwd=2)
dev.off()
## S(1,25)
observed <- sort(as.numeric(results_sliding_window_genome[,colnames(results_sliding_window_genome)=="SKAT(1,25)"]))
lobs <- -(log10(observed))
expected <- c(1:length(observed))
lexp <- -(log10(expected / (length(expected)+1)))
png(paste0(output_path,"sliding_window_qq_skat_1_25.png"), width=8, height=8, units = 'in', res = 600)
par(mar=c(5,6,4,4))
plot(lexp,lobs,pch=20, cex=1, xlim = c(0, max(lexp)), ylim = c(0, max(lobs)),
xlab = expression(Expected ~ ~-log[10](italic(p))), ylab = expression(Observed ~ ~-log[10](italic(p))),
font.lab=2,cex.lab=2,cex.axis=2,font.axis=2)
abline(0, 1, col="red",lwd=2)
dev.off()
## B(1,1)
observed <- sort(as.numeric(results_sliding_window_genome[,colnames(results_sliding_window_genome)=="Burden(1,1)"]))
lobs <- -(log10(observed))
expected <- c(1:length(observed))
lexp <- -(log10(expected / (length(expected)+1)))
png(paste0(output_path,"sliding_window_qq_burden_1_1.png"), width=8, height=8, units = 'in', res = 600)
par(mar=c(5,6,4,4))
plot(lexp,lobs,pch=20, cex=1, xlim = c(0, max(lexp)), ylim = c(0, max(lobs)),
xlab = expression(Expected ~ ~-log[10](italic(p))), ylab = expression(Observed ~ ~-log[10](italic(p))),
font.lab=2,cex.lab=2,cex.axis=2,font.axis=2)
abline(0, 1, col="red",lwd=2)
dev.off()
}else
{
## STAAR-B
observed <- sort(as.numeric(results_sliding_window_genome[,colnames(results_sliding_window_genome)=="STAAR-B"]))
lobs <- -(log10(observed))
expected <- c(1:length(observed))
lexp <- -(log10(expected / (length(expected)+1)))
png(paste0(output_path,"sliding_window_qq_staar_b.png"), width=8, height=8, units = 'in', res = 600)
par(mar=c(5,6,4,4))
plot(lexp,lobs,pch=20, cex=1, xlim = c(0, max(lexp)), ylim = c(0, max(lobs)),
xlab = expression(Expected ~ ~-log[10](italic(p))), ylab = expression(Observed ~ ~-log[10](italic(p))),
font.lab=2,cex.lab=2,cex.axis=2,font.axis=2)
abline(0, 1, col="red",lwd=2)
dev.off()
## B(1,1)
observed <- sort(as.numeric(results_sliding_window_genome[,colnames(results_sliding_window_genome)=="Burden(1,1)"]))
lobs <- -(log10(observed))
expected <- c(1:length(observed))
lexp <- -(log10(expected / (length(expected)+1)))
png(paste0(output_path,"sliding_window_qq_burden_1_1.png"), width=8, height=8, units = 'in', res = 600)
par(mar=c(5,6,4,4))
plot(lexp,lobs,pch=20, cex=1, xlim = c(0, max(lexp)), ylim = c(0, max(lobs)),
xlab = expression(Expected ~ ~-log[10](italic(p))), ylab = expression(Observed ~ ~-log[10](italic(p))),
font.lab=2,cex.lab=2,cex.axis=2,font.axis=2)
abline(0, 1, col="red",lwd=2)
dev.off()
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.