#' Meta-analysis of individual variants using \code{MetaSTAAR}
#'
#' The \code{MetaSTAAR_individual_analysis} function takes in the summary statistics file
#' from each participating study (the output from \code{\link{MetaSTAAR_worker_sumstat}})
#' to analyze the associations between a quantitative/dichotomous phenotype and
#' all individual variants in the merged variant list by using the meta-analysis of score test.
#' @param chr a numeric value indicating the chromosome of the genetic region of interest.
#' @param start.loc a numeric value indicating the starting location (position) of the
#' genetic region of interest.
#' @param end.loc a numeric value indicating the ending location (position) of the
#' genetic region of interest.
#' @param study.names a character vector containing the name of each participating
#' study in the meta-analysis.
#' @param sample.sizes a numeric vector with the length of \code{study.names}
#' indicating the sample size of each study.
#' @param sumstat.dir a character vector containing the directories of the study-specific summary statistics file folders.
#' @param common_mac_cutoff the cutoff of minimum combined minor allele count (inclusive) in
#' defining "common" variants.
#' @param trait a character value indicating the underlying trait of interest for
#' the meta-analysis.
#' @param segment.size a numeric value indicating the length of each segment of which
#' the summary statistics and sparse weighted covariance files are stored.
#' Note that the input value should be aligned with the input values of
#' \code{\link{MetaSTAAR_worker_cov}} (default = 5e+05).
#' @param check_qc_label a logical value indicating whether variants need to be dropped according to \code{qc_label}
#' specified in \code{\link{MetaSTAAR_worker_sumstat}} and \code{\link{MetaSTAAR_worker_cov}}.
#' If \code{check_qc_label} is FALSE, it is assumed that no variant will be dropped (default = FALSE).
#' @return a data frame with p rows corresponding to the p genetic variants in the merged variant list
#' and the following columns: chromosome (chr), position (pos), reference allele (ref), alternative allele (alt),
#' combined alternative allele count (alt_AC), combined minor allele count (MAC), combined minor allele frequency (MAF),
#' combined sample size (N), the score test p-value (p), the log score test p-value (logp), the score test statistic (Score),
#' the standard error associated with the score test statistic (Score_SE), the estimated effect size of the minor allele (Est),
#' the standard error associated with the estimated effect size (Est_se).
#' If a variant in the merged variant list has standard error equal to 0, the p-value will be set as 1.
#' @references Li, X., et al. (2023). Powerful, scalable and resource-efficient
#' meta-analysis of rare variant associations in large whole genome sequencing studies.
#' \emph{Nature Genetics}, \emph{55}(1), 154-164.
#' (\href{https://doi.org/10.1038/s41588-022-01225-6}{pub})
#' @export
MetaSTAAR_individual_analysis <- function(chr,start.loc,end.loc,study.names,sample.sizes,sumstat.dir,
common_mac_cutoff,trait,segment.size=5e5,check_qc_label=FALSE){
cov_maf_cutoff <- rep(0.5 + 1e-16,length(study.names))
segment <- floor((start.loc - 1) / segment.size) + 1
if (end.loc <= segment * segment.size) {
### summary statistics
sumstat.files <- paste0(sumstat.dir,"/summary.stat.",trait,".",study.names,".chr",chr,".segment",segment,".Rdata")
sumstat.list <- sapply(sumstat.files, function(x) mget(load(x)), simplify = TRUE)
sumstat.list <- lapply(sumstat.list, function(x) {
if (!(is.null(x)) && !("qc_label" %in% colnames(x))){
data.frame(x[,1:4],qc_label="PASS",x[,5:dim(x)[2]],stringsAsFactors = FALSE)
}else{
x
}
})
position.index <- mapply(function(x,y) {
(x[(x$MAF<y)&(x$MAF>0)&(x$qc_label=="PASS"),"pos"]>=start.loc)&(x[(x$MAF<y)&(x$MAF>0)&(x$qc_label=="PASS"),"pos"]<=end.loc)
}, x = sumstat.list, y = cov_maf_cutoff, SIMPLIFY = FALSE)
sumstat.list <- lapply(sumstat.list, function(x) {
x[(x$pos>=start.loc)&(x$pos<=end.loc),]
})
sumstat.varid.list <- mapply(function(x,y) {
x[(x$MAF<y)&(x$MAF>0)&(x$qc_label=="PASS"),1:4]
}, x = sumstat.list, y = cov_maf_cutoff, SIMPLIFY = FALSE)
sumstat.varid.merge <- do.call("rbind",sumstat.varid.list)
sumstat.varid.nodup <- sumstat.varid.merge[!duplicated(sumstat.varid.merge),]
if (is.null(sumstat.varid.nodup)) {
return(NULL)
}else if (dim(sumstat.varid.nodup)[1] == 0) {
return(NULL)
}
sumstat.merge.list <- lapply(sumstat.list, function(x) {
if (is.null(x)) {
cbind(sumstat.varid.nodup,qc_label=NA,alt_AC=NA,MAC=NA,MAF=NA,N=NA,U=NA,V=NA,"1"=NA)
}else {
left_join(sumstat.varid.nodup,x,by=c("chr"="chr",
"pos"="pos",
"ref"="ref",
"alt"="alt"))
}
})
sumstat.merge.list <- mapply(function(x,y) {
x[is.na(x[,"N"]),"N"] <- y
x[is.na(x[,"qc_label"]),"qc_label"] <- "PASS"
x[is.na(x)] <- 0
return(x)
}, x = sumstat.merge.list, y = sample.sizes, SIMPLIFY = FALSE)
sumstat.varid.nodup$index <- 1:dim(sumstat.varid.nodup)[1]
sumstat.index.list <- lapply(sumstat.varid.list, function(x) {
if (is.null(x)) {
integer(0)
}else{
left_join(x,sumstat.varid.nodup,by=c("chr"="chr",
"pos"="pos",
"ref"="ref",
"alt"="alt"))$index
}
})
rm(sumstat.files,sumstat.list,sumstat.varid.list,sumstat.varid.merge)
gc()
}else if (end.loc <= (segment + 1) * segment.size) {
### summary statistics
sumstat.files1 <- paste0(sumstat.dir,"/summary.stat.",trait,".",study.names,".chr",chr,".segment",segment,".Rdata")
sumstat.list1 <- sapply(sumstat.files1, function(x) mget(load(x)), simplify = TRUE)
sumstat.list1 <- lapply(sumstat.list1, function(x) {
if (!(is.null(x)) && !("qc_label" %in% colnames(x))){
data.frame(x[,1:4],qc_label="PASS",x[,5:dim(x)[2]],stringsAsFactors = FALSE)
}else{
x
}
})
position.index1 <- mapply(function(x,y) {
(x[(x$MAF<y)&(x$MAF>0)&(x$qc_label=="PASS"),"pos"]>=start.loc)&(x[(x$MAF<y)&(x$MAF>0)&(x$qc_label=="PASS"),"pos"]<=end.loc)
}, x = sumstat.list1, y = cov_maf_cutoff, SIMPLIFY = FALSE)
sumstat.list1 <- lapply(sumstat.list1, function(x) {
x[(x$pos>=start.loc)&(x$pos<=end.loc),]
})
sumstat.varid.list1 <- mapply(function(x,y) {
x[(x$MAF<y)&(x$MAF>0)&(x$qc_label=="PASS"),1:4]
}, x = sumstat.list1, y = cov_maf_cutoff, SIMPLIFY = FALSE)
sumstat.files2 <- paste0(sumstat.dir,"/summary.stat.",trait,".",study.names,".chr",chr,".segment",segment+1,".Rdata")
sumstat.list2 <- sapply(sumstat.files2, function(x) mget(load(x)), simplify = TRUE)
sumstat.list2 <- lapply(sumstat.list2, function(x) {
if (!(is.null(x)) && !("qc_label" %in% colnames(x))){
data.frame(x[,1:4],qc_label="PASS",x[,5:dim(x)[2]],stringsAsFactors = FALSE)
}else{
x
}
})
position.index2 <- mapply(function(x,y) {
(x[(x$MAF<y)&(x$MAF>0)&(x$qc_label=="PASS"),"pos"]>=start.loc)&(x[(x$MAF<y)&(x$MAF>0)&(x$qc_label=="PASS"),"pos"]<=end.loc)
}, x = sumstat.list2, y = cov_maf_cutoff, SIMPLIFY = FALSE)
sumstat.list2 <- lapply(sumstat.list2, function(x) {
x[(x$pos>=start.loc)&(x$pos<=end.loc),]
})
sumstat.varid.list2 <- mapply(function(x,y) {
x[(x$MAF<y)&(x$MAF>0)&(x$qc_label=="PASS"),1:4]
}, x = sumstat.list2, y = cov_maf_cutoff, SIMPLIFY = FALSE)
sumstat.list <- mapply(function(x,y) {
rbind(x,y)
}, x = sumstat.list1, y = sumstat.list2, SIMPLIFY = FALSE)
sumstat.varid.list <- mapply(function(x,y) {
rbind(x,y)
}, x = sumstat.varid.list1, y = sumstat.varid.list2, SIMPLIFY = FALSE)
sumstat.varid.merge <- do.call("rbind",sumstat.varid.list)
sumstat.varid.nodup <- sumstat.varid.merge[!duplicated(sumstat.varid.merge),]
if (is.null(sumstat.varid.nodup)) {
return(NULL)
}else if (dim(sumstat.varid.nodup)[1] == 0) {
return(NULL)
}
sumstat.merge.list <- lapply(sumstat.list, function(x) {
if (is.null(x)) {
cbind(sumstat.varid.nodup,qc_label=NA,alt_AC=NA,MAC=NA,MAF=NA,N=NA,U=NA,V=NA,"1"=NA)
}else {
left_join(sumstat.varid.nodup,x,by=c("chr"="chr",
"pos"="pos",
"ref"="ref",
"alt"="alt"))
}
})
sumstat.merge.list <- mapply(function(x,y) {
x[is.na(x[,"N"]),"N"] <- y
x[is.na(x[,"qc_label"]),"qc_label"] <- "PASS"
x[is.na(x)] <- 0
return(x)
}, x = sumstat.merge.list, y = sample.sizes, SIMPLIFY = FALSE)
sumstat.varid.nodup$index <- 1:dim(sumstat.varid.nodup)[1]
sumstat.index.list1 <- lapply(sumstat.varid.list1, function(x) {
if (is.null(x)) {
integer(0)
}else{
left_join(x,sumstat.varid.nodup,by=c("chr"="chr",
"pos"="pos",
"ref"="ref",
"alt"="alt"))$index
}
})
sumstat.index.list2 <- lapply(sumstat.varid.list2, function(x) {
if (is.null(x)) {
integer(0)
}else{
left_join(x,sumstat.varid.nodup,by=c("chr"="chr",
"pos"="pos",
"ref"="ref",
"alt"="alt"))$index
}
})
rm(sumstat.files1,sumstat.files2,sumstat.list,sumstat.list1,sumstat.list2,
sumstat.varid.list,sumstat.varid.list1,sumstat.varid.list2,
sumstat.varid.merge)
gc()
}
### select "common" variant based on the input cutoff
alt_AC.merge <- as.integer(Reduce("+",lapply(sumstat.merge.list, function(x) {x$alt_AC})))
N.merge.nonzero <- Reduce("+",lapply(sumstat.merge.list, function(x) {x$N * (x$MAC != 0)}))
alt_AF.merge.nonzero <- alt_AC.merge / (2 * N.merge.nonzero)
N.merge.zero <- Reduce("+",lapply(sumstat.merge.list, function(x) {x$N * (x$MAC == 0)}))
alt_AC.merge <- alt_AC.merge + (alt_AF.merge.nonzero > 0.5) * (2 * N.merge.zero)
N.merge <- Reduce("+",lapply(sumstat.merge.list, function(x) {x$N}))
MAC.merge <- pmin(alt_AC.merge, 2 * N.merge - alt_AC.merge)
MAF.merge <- MAC.merge / (2 * N.merge)
cv.index <- (MAF.merge>(common_mac_cutoff - 0.5) / (2 * N.merge)) & Reduce("*",mapply(function(x,y) {
x$MAF<y
}, x = sumstat.merge.list, y = cov_maf_cutoff, SIMPLIFY = FALSE))
if (check_qc_label){
cv.index <- cv.index & Reduce("*",lapply(sumstat.merge.list, function(x) {
x$qc_label=="PASS"
}))
}
info <- cbind(sumstat.varid.nodup[,c("chr","pos","ref","alt")],
alt_AC=alt_AC.merge,MAC=MAC.merge,MAF=MAF.merge,N=N.merge)[cv.index,]
U.merge <- Reduce("+", lapply(sumstat.merge.list, function(x) {
x <- x[cv.index,]
return(x$U * (2 * (x$alt_AC == x$MAC) - 1))
}))
V.merge <- Reduce("+", lapply(sumstat.merge.list, function(x) {
x <- x[cv.index,]
return(x$V)
}))
p.merge <- pchisq(U.merge^2/V.merge,df=1,lower.tail=FALSE)
logp.merge <- -pchisq(U.merge^2/V.merge,df=1,lower.tail=FALSE,log.p=TRUE)
rm(list=setdiff(ls(), c("info","U.merge","V.merge","p.merge","logp.merge")))
gc()
return(data.frame(chr=info$chr,
pos=info$pos,
ref=info$ref,
alt=info$alt,
alt_AC=info$alt_AC,
MAC=info$MAC,
MAF=info$MAF,
N=info$N,
p=p.merge,
logp=logp.merge,
Score=U.merge,
Score_se=sqrt(V.merge),
Est=U.merge/V.merge,
Est_se=1/sqrt(V.merge)))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.