#' Subset a Primo object.
#'
#' Subset results from Primo output based on a vector of indices.
#'
#' @param Primo_obj list of results returned by Primo (from the function
#' \code{\link{Primo_tstat}}, \code{\link{Primo_pval}}, \code{\link{Primo_modT}}
#' or \code{\link{Primo_chiMix}}).
#' @param idx integer vector of the indices to subset Primo results.
#'
#' @return A list of Primo results with the following elements:
#' \tabular{ll}{
#' \code{post_prob} \tab matrix of posterior probabilities
#' (each column corresponds to an association pattern).\cr
#' \code{pis} \tab vector of estimated proportion of SNPs
#' belonging to each association pattern.\cr
#' \code{D_mat} \tab matrix of densities under each association pattern.\cr
#' \code{Gamma} \tab correlation matrix.\cr
#' }
#'
#' \itemize{
#' \item If the results were originally from the \eqn{t}-statistic version,
#' the list will additionally contain:
#' \tabular{ll}{
#' \code{Tstat_mod} \tab matrix of moderated t-statistics.\cr
#' \code{V_mat} \tab matrix of scaling factors under the alternative distribution.\cr
#' \code{mdf_sd_mat} \tab matrix of standard deviation adjustment according to
#' moderated degrees of freedom: \code{sqrt(df/(df-2))}.\cr
#' \code{prior_df} \tab vector of the prior degrees of freedom for each marginal distribution.\cr
#' \code{prior_var} \tab vector of the prior variance estimators for each marginal distribution.\cr
#' \code{unscaled_var} \tab vector of the unscaled variance priors on non-zero coefficients
#' for each marginal distribution.
#' }
#'
#' \item If the results were originally from the \eqn{P}-value version,
#' the list will additionally contain:
#' \tabular{ll}{
#' \code{chi_mix} \tab matrix of \eqn{-2\log(P)}{-2*log(P)}-values.\cr
#' \code{A} \tab vector of scaling factors under the alternative densities.\cr
#' \code{df_alt} \tab vector of degrees of freedom approximated for the alternative densities.\cr
#' }
#' }
#'
#' @export
#'
subset_Primo_obj <- function(Primo_obj,idx){
  ## Subset each row-indexed component of a Primo results list to the rows in `idx`.
  ## drop=FALSE keeps every component a matrix even when `idx` has length 1;
  ## without it a single-row subset silently collapses to a vector and breaks
  ## downstream code that expects matrices (e.g. append_Primo_obj, collapse_pp_*).
  Primo_obj$post_prob <- Primo_obj$post_prob[idx,,drop=FALSE]
  Primo_obj$D_mat <- Primo_obj$D_mat[idx,,drop=FALSE]
  ## t-statistic version components (present only for Primo_tstat/Primo_modT output)
  if(!is.null(Primo_obj$Tstat_mod)) Primo_obj$Tstat_mod <- Primo_obj$Tstat_mod[idx,,drop=FALSE]
  if(!is.null(Primo_obj$V_mat)) Primo_obj$V_mat <- Primo_obj$V_mat[idx,,drop=FALSE]
  if(!is.null(Primo_obj$mdf_sd_mat)) Primo_obj$mdf_sd_mat <- Primo_obj$mdf_sd_mat[idx,,drop=FALSE]
  ## p-value version component (present only for Primo_pval/Primo_chiMix output)
  if(!is.null(Primo_obj$chi_mix)) Primo_obj$chi_mix <- Primo_obj$chi_mix[idx,,drop=FALSE]
  ## pis, Gamma and the marginal-distribution parameters are not row-indexed
  ## by observation, so they are returned unchanged
  return(Primo_obj)
}
#' Append a Primo object.
#'
#' Append two sets of Primo results. The function assumes that the two sets
#' share marginal distribution parameters and a common correlation structure.
#'
#' @param Primo_obj1 list of results returned by Primo (from the function
#' \code{\link{Primo_tstat}}, \code{\link{Primo_pval}}, \code{\link{Primo_modT}}
#' or \code{\link{Primo_chiMix}}).
#' @param Primo_obj2 list of results returned by Primo.
#'
#' @inherit subset_Primo_obj return
#'
#' @export
#'
append_Primo_obj <- function(Primo_obj1,Primo_obj2){
  ## Row-bind two Primo results lists that share the same set of studies.
  ## Both objects must have posterior-probability matrices with the same
  ## number of association-pattern columns.
  if (ncol(Primo_obj1$post_prob) != ncol(Primo_obj2$post_prob)) {
    stop("Primo objects do not match (based on different numbers of studies).")
  }
  ## components present in every Primo object: stack unconditionally
  Primo_obj1$post_prob <- rbind(Primo_obj1$post_prob, Primo_obj2$post_prob)
  Primo_obj1$D_mat <- rbind(Primo_obj1$D_mat, Primo_obj2$D_mat)
  ## version-specific components: t-statistic version carries Tstat_mod,
  ## V_mat and mdf_sd_mat; p-value version carries chi_mix. Stack whichever
  ## the first object holds.
  version_components <- c("Tstat_mod", "V_mat", "mdf_sd_mat", "chi_mix")
  for (comp in version_components) {
    if (!is.null(Primo_obj1[[comp]])) {
      Primo_obj1[[comp]] <- rbind(Primo_obj1[[comp]], Primo_obj2[[comp]])
    }
  }
  ## shared parameters (pis, Gamma, priors) are assumed common and kept from obj1
  return(Primo_obj1)
}
#' Find the lead SNP for each phenotype in each region.
#'
#' Determine the lead SNP for each phenotype in each region
#' based on summary statistics. A data.table will be returned
#' containing lead SNP information for each region.
#'
#' @param data data.table. Each row will be a SNP-phenotype combination
#' with statistics necessary to determine the lead SNP in each phenotype region.
#' @param snp_col character string of the column name of the SNP.
#' @param pheno_cols character vector of the column names of the phenotypes.
#' @param stat_cols character vector of the column names of statistics to be
#' used to determine lead SNPs.
#' @param data_type character string denoting type of statistics being used. Must be
#' either "pvalue" or "tstat".
#' @param suffices character vector denoting suffices to use for the names of the
#' lead SNP columns (optional). If \code{NULL}, consecutive integers will be assigned.
#'
#' @return A data.table containing information about the lead SNPs and
#' associated statistics. The columns will be \code{pheno_cols} followed by two columns
#' for each phenotype: the name of the lead SNP and the value of the statistic for
#' that lead SNP in that phenotype.
#'
#' @export
#'
find_leadsnps <- function(data,snp_col,pheno_cols,stat_cols,data_type="pvalue",suffices=NULL){
  ## Validate arguments before doing any work (the original checked data_type
  ## only after keying/converting the data, and ignored requireNamespace()'s
  ## return value, which can lead to obscure downstream errors).
  if(!(data_type %in% c("pvalue","tstat"))) stop("data_type must be either 'pvalue' or 'tstat'.")
  if(!base::requireNamespace("data.table")) stop("Package 'data.table' is required but not installed.")
  if(!data.table::is.data.table(data)){
    warning("Converting argument `data` to data.table format.")
    data <- data.table::data.table(data)
  }
  ## key (sort) by the phenotype/region identifier columns so merges are fast
  data.table::setkeyv(data,pheno_cols)
  if(is.null(suffices)) suffices <- seq_along(stat_cols)
  leadsnps_region <- NULL
  for(i in seq_along(stat_cols)){
    ## Find the lead SNP for the current phenotype in each region:
    ## minimum p-value, or maximum absolute t-statistic.
    ## NOTE(review): dplyr::group_by(.dots=...) is deprecated in recent dplyr;
    ## retained here for behavioral compatibility with the original.
    if(data_type=="pvalue"){
      topSNP_currPheno <- data %>% dplyr::group_by(.dots=pheno_cols) %>%
        dplyr::slice(which.min(get(stat_cols[i])))
    } else{
      topSNP_currPheno <- data %>% dplyr::group_by(.dots=pheno_cols) %>%
        dplyr::slice(which.max(abs(get(stat_cols[i]))))
    }
    topSNP_currPheno <- data.table::data.table(topSNP_currPheno,key=pheno_cols)
    ## keep region identifiers, SNP name and the statistic; rename the SNP
    ## column so each statistic contributes a distinct lead-SNP column
    topSNP_currPheno <- subset(topSNP_currPheno, select=c(pheno_cols,snp_col,stat_cols[i]))
    colnames(topSNP_currPheno)[ncol(topSNP_currPheno)-1] <- paste0("leadSNP_",suffices[i])
    ## merge results across statistics (joined on the keyed pheno_cols)
    if(is.null(leadsnps_region)){
      leadsnps_region <- topSNP_currPheno
    } else{
      leadsnps_region <- merge(leadsnps_region,topSNP_currPheno)
    }
  }
  return(leadsnps_region)
}
#' Estimate posterior probabilities for observations missing from original Primo analysis.
#'
#' For each observation (e.g. SNP), estimates the posterior probability for each association pattern.
#' Uses parameters estimated by a previous run of \code{\link{Primo_tstat}} or \code{\link{Primo_modT}}
#' to estimate probabilities for SNPs missing in one or more studies.
#' Utilizes parallel computing, when available.
#'
#' @param betas matrix of coefficient estimates.
#' @param sds matrix of standard errors (for coefficient estimates).
#' @param dfs vector or matrix of degrees of freedom.
#' @param trait_idx integer vector of the columns corresponding to non-missing phenotypes/studies.
#' @param mafs vector or matrix of minor allele frequencies (MAFs).
#' If \code{NULL}, error variances will not be adjusted for MAF.
#' @param N vector or matrix of number of subjects.
#' Should be specified if \code{mafs!=NULL}.
#' @param pis vector of the estimated proportion of observations
#' belonging to each association pattern.
#' @param Gamma correlation matrix.
#' @param prior_df vector of the prior degrees of freedom for each marginal distribution.
#' @param prior_var vector of the prior variance estimators for each marginal distribution.
#' @param unscaled_var vector of the unscaled variance priors on non-zero coefficients
#' for each marginal distribution.
#' @param par_size numeric value; specifies the number of workers for
#' parallel computing (1 for sequential processing).
#'
#'
#' @inherit Primo_tstat return
#'
#' @export
#'
Primo_missdata_tstat <- function(betas,sds,dfs,trait_idx,mafs=NULL,N=NULL,pis,Gamma,prior_df,prior_var,unscaled_var,par_size=1){
  ## m observations (SNPs); d studies with non-missing data for these SNPs
  m <- nrow(betas)
  d <- ncol(betas)
  ## Recover the number of studies in the original analysis from the number of
  ## association patterns: a full analysis has 2^orig_d patterns.
  if(is.matrix(pis)){
    orig_d <- log(ncol(pis),2)
  } else orig_d <- log(length(pis),2)
  ## indices (in the original analysis) of the studies missing for these SNPs
  miss_idx <- (1:orig_d)[-trait_idx]
  ## extract marginal-distribution parameters for the non-missing traits only
  d0 <- prior_df[trait_idx]      # prior degrees of freedom
  s02 <- prior_var[trait_idx]    # prior variance estimators
  v0 <- unscaled_var[trait_idx]  # unscaled variance priors on non-zero coefficients
  Gamma <- Gamma[trait_idx,trait_idx]
  ## keep only the association patterns in which every missing study is null
  orig_Q <- Primo::make_qmat(1:orig_d)
  if(length(miss_idx) == 1){
    keep_patterns <- which(orig_Q[,miss_idx]==0)
  } else{
    keep_patterns <- which(rowSums(orig_Q[,miss_idx])==0)
  }
  if(is.matrix(pis)){
    pis <- pis[,keep_patterns]
  } else pis <- pis[keep_patterns]
  ## estimate marginal density functions in limma framework, one study at a time
  density_list <- lapply(1:d, function(j){
    if(is.matrix(mafs)) mafs <- mafs[,j]
    if(is.matrix(N)){
      N <- N[,j]
    } else if(!is.null(N)) N <- rep(N[j],m)
    if(is.matrix(dfs)) {
      d1 <- dfs[,j]
    } else d1 <- rep(dfs[j],m)
    ## account for MAF in variance calculations:
    ## 2*maf*(1-maf)*N approximates the genotype variance times sample size,
    ## so sigma2 is put on a common (genotype-variance-adjusted) scale
    if (is.null(mafs)){
      v1 = rep(1,m)
      sigma2 <- sds[,j]^2
    } else{
      v1 <- 1/(2*mafs*(1-mafs)*N)
      sigma2 <- sds[,j]^2*(2*mafs*(1-mafs)*N)
    }
    ## rescale t-statistic with the posterior (moderated) variance (see Smyth, 2004)
    sg_tilde <- sqrt((d0[j]*s02[j]+d1*sigma2)/(d0[j]+d1))
    moderate.t <- betas[,j]/(sg_tilde*sqrt(v1))
    ## moderated degrees of freedom and the scaling factor of the alternative density
    df_mod=d0[j]+d1
    scaler=sqrt(1+v0[j]/v1)
    return(list(Tstat_mod = moderate.t, df_mod=df_mod, scaler=scaler, prior_df=d0[j], prior_var=s02[j], unscaled_var=v0[j]))
  } )
  ## extract parameters for pattern-specific density estimation (m x d matrices)
  Tstat_mod <- do.call("cbind", lapply(density_list, function(x) x$Tstat_mod))
  mdfs <- do.call("cbind", lapply(density_list, function(x) x$df_mod))
  V <- do.call("cbind", lapply(density_list, function(x) x$scaler))
  ## extract parameters from marginal densities (can be used to estimate variables for new observations)
  prior_df <- sapply(density_list, function(x) x$prior_df)
  prior_var <- sapply(density_list, function(x) x$prior_var)
  unscaled_var <- sapply(density_list, function(x) x$unscaled_var)
  ## sd adjustment from moderated t dfs: sd of a t_df variable is sqrt(df/(df-2))
  mdf_sd_mat <- sqrt(mdfs/(mdfs-2))
  ## computation of D_mat (densities under each of the 2^d patterns)
  Q<-Primo::make_qmat(1:d)
  ## Density under pattern k: scale each trait by V (alternative) or 1 (null),
  ## times the moderated-df sd adjustment; evaluate a multivariate normal with
  ## correlation Gamma on the standardized statistics, dividing by the Jacobian
  ## of the diagonal scaling (product of the per-trait scales).
  D_mat_func <- function(k){
    m=nrow(V)
    d=ncol(V)
    v_k <- V %*%diag(Q[k,]) + matrix(1, nrow=nrow(V),ncol=ncol(V))%*%diag( 1-Q[k,])
    v_k <- v_k * mdf_sd_mat
    Z <- Tstat_mod/v_k
    ## row-wise product of scales, computed via exp(sum(log)) for stability
    dets <- exp(rowSums(log(v_k)))
    return(mvtnorm::dmvnorm(Z, mean=rep(0,d),sigma=Gamma)/dets)
  }
  ## parallel version: one task per association pattern
  if(par_size > 1){
    cl <- parallel::makeCluster(par_size)
    parallel::clusterEvalQ(cl, library(mvtnorm))
    parallel::clusterExport(cl=cl, varlist=list("D_mat_func","Tstat_mod","V","mdf_sd_mat","Q","Gamma"),envir=environment())
    D_mat_byk <- parallel::parLapply(cl, 1:2^d, function(k) D_mat_func(k=k))
    parallel::stopCluster(cl)
  } else{
    ## sequential version
    D_mat_byk <- lapply(1:2^d, function(k) D_mat_func(k=k))
  }
  ## bind densities together into an m x 2^d matrix
  D_mat <- do.call("cbind",D_mat_byk)
  rm(D_mat_byk)
  ## posterior probabilities: pi_k * D_k / sum_j(pi_j * D_j), on the log scale
  ## for numerical stability.
  ## NOTE(review): sweep() assumes `pis` is a vector here; if a matrix of
  ## observation-specific pis is passed, its values are recycled — confirm callers.
  PP <- sweep(log(D_mat),2,log(pis),"+")
  # subtract row maximum before exponentiating to avoid overflow/underflow
  PP<-PP-matrixStats::rowMaxs(PP)
  PP<-exp(PP)
  # use rowSums directly since columns are recycled
  PP<-PP/rowSums(PP)
  return(list(post_prob=PP, pis=pis, D_mat=D_mat, Gamma=Gamma, Tstat_mod=Tstat_mod, V_mat = V, mdf_sd_mat = mdf_sd_mat,
              prior_df=prior_df,prior_var=prior_var,unscaled_var=unscaled_var))
}
#' Estimate posterior probabilities for observations missing from original Primo analysis.
#'
#' For each SNP, estimates the posterior probability for each association pattern.
#' Uses parameters estimated by a previous run of \code{\link{Primo_pval}} or \code{\link{Primo_chiMix}}
#' to estimate probabilities for SNPs missing in one or more studies.
#' \eqn{P}-values from non-missing studies are used as input.
#' Utilizes parallel computing, when available.
#'
#' @param pvals matrix of \eqn{P}-values from test statistics.
#' @param trait_idx integer vector of the columns corresponding to non-missing phenotypes/studies.
#' @param pis vector of the estimated proportion of observations
#' belonging to each association pattern
#' @param Gamma correlation matrix.
#' @param A vector of scaling factors under the alternative distributions.
#' @param df_alt vector of degrees of freedom approximated for the alternative distributions.
#' @param par_size numeric value; specifies the number of workers for
#' parallel computing (1 for sequential processing).
#'
#'
#' @inherit Primo_pval return
#'
#' @export
#'
Primo_missdata_pval <- function(pvals,trait_idx,pis,Gamma,A,df_alt,par_size=1){
  ## m observations (SNPs); d studies with non-missing data for these SNPs
  m <- nrow(pvals)
  d <- ncol(pvals)
  ## Recover the number of studies in the original analysis from the number of
  ## association patterns: a full analysis has 2^orig_d patterns.
  if(is.matrix(pis)){
    orig_d <- log(ncol(pis),2)
  } else orig_d <- log(length(pis),2)
  ## indices (in the original analysis) of the studies missing for these SNPs
  miss_idx <- (1:orig_d)[-trait_idx]
  ## convert p-values to mixture of chi-squared statistics: -2log(p)
  chi_mix<-(-2)*log(pvals)
  ## extract previously estimated parameters for the non-missing traits only
  A <- A[trait_idx]
  df_alt <- df_alt[trait_idx]
  Gamma <- Gamma[trait_idx,trait_idx]
  ## determine which patterns to keep: those where every missing study is null
  orig_Q <- Primo::make_qmat(1:orig_d)
  if(length(miss_idx) == 1){
    keep_patterns <- which(orig_Q[,miss_idx]==0)
  } else{
    keep_patterns <- which(rowSums(orig_Q[,miss_idx])==0)
  }
  if(is.matrix(pis)){
    pis <- pis[,keep_patterns]
  } else pis <- pis[keep_patterns]
  ## computation of D_mat (densities under each of the 2^d patterns)
  Q<-Primo::make_qmat(1:d)
  ## Density of the -2log(p) values under pattern k, modeled as a multivariate
  ## gamma: null traits use scale 1 and df 2 (gamma shape 1, rate 1/2, i.e. a
  ## chi-squared with 2 df), alternative traits use the approximated scaling
  ## factor A and df_alt (gamma shape df_alt/2, rate 1/(2A)).
  D_mat_func_p <- function(k){
    A_k <- A*Q[k,] + 1*(1-Q[k,])
    df_k <- df_alt*Q[k,] + 2*(1-Q[k,])
    rate_k <- 1/(2*A_k)
    shape_k <- df_k/2
    return(lcmix::dmvgamma(chi_mix, shape=shape_k, rate=rate_k, corr=Gamma))
  }
  ## parallel version: one task per association pattern
  if(par_size > 1){
    cl <- parallel::makeCluster(par_size)
    parallel::clusterEvalQ(cl, library(lcmix))
    parallel::clusterExport(cl=cl, varlist=list("D_mat_func_p","chi_mix","A","df_alt","Q","Gamma"),envir=environment())
    D_mat_byk <- parallel::parLapply(cl, 1:2^d, function(k) D_mat_func_p(k=k))
    parallel::stopCluster(cl)
  }else{
    ## sequential version
    D_mat_byk <- lapply(1:2^d, function(k) D_mat_func_p(k=k))
  }
  ## bind densities together into an m x 2^d matrix
  D_mat <- do.call("cbind",D_mat_byk)
  rm(D_mat_byk)
  ## posterior probabilities: pi_k * D_k / sum_j(pi_j * D_j), on the log scale
  ## for numerical stability.
  ## NOTE(review): sweep() assumes `pis` is a vector here; if a matrix of
  ## observation-specific pis is passed, its values are recycled — confirm callers.
  PP <- sweep(log(D_mat),2,log(pis),"+")
  # subtract row maximum before exponentiating to avoid overflow/underflow
  PP<-PP-matrixStats::rowMaxs(PP)
  PP<-exp(PP)
  # use rowSums directly since columns are recycled
  PP<-PP/rowSums(PP)
  return(list(post_prob=PP, pis=pis, D_mat=D_mat, Gamma=Gamma, chi_mix=chi_mix, A=A, df_alt=df_alt))
}
#' Collapse posterior probabilities based on number of traits of association.
#'
#' Combine the posterior probabilities of association patterns
#' according to the number of traits with non-null associations in each pattern.
#' Provides the posterior probability of
#' being associated with "at least \eqn{n}" number of traits/studies
#' by summing over association patterns with at least \eqn{n} studies
#' coming from the alternative distribution. Can also require non-null
#' association with one or more traits and combine over other traits.
#'
#' @param post_prob matrix of posterior probabilities.
#' @param req_idx (optional) scalar or integer vector of trait(s) where a non-null
#' association is required (i.e. from the alternative distribution).
#' If \code{NULL}, no traits will be required.
#' @param prefix character string denoting prefix of column names for the results matrix.
#'
#' @return A numeric matrix of combined posterior probabilities.
#'
#' @export
#'
collapse_pp_num <- function(post_prob,req_idx=NULL,prefix="pp_ge_"){
  ## number of traits: post_prob has one column per association pattern (2^d total)
  d <- log(ncol(post_prob),2)
  ## binary pattern matrix: row k gives null(0)/alternative(1) status of each trait
  Q <- Primo::make_qmat(1:d)
  ## Identify the pattern columns for "associated with at least X (additional) traits".
  ## drop=FALSE guards the cases where removing req_idx leaves a single column
  ## (e.g. d=2 with one required trait), where plain `[` would collapse Q to a
  ## vector and rowSums() would error.
  if(is.null(req_idx)){
    cols_geX <- lapply(1:d, function(x) which(rowSums(Q) >= x))
    names(cols_geX) <- paste0(prefix,1:d)
  } else if(length(req_idx)==1){
    cols_geX <- lapply(0:(d-length(req_idx)), function(x) which(Q[,req_idx]==1 & rowSums(Q[,-req_idx,drop=FALSE]) >= x))
    names(cols_geX) <- paste0(prefix,0:(d-length(req_idx)))
  } else {
    cols_geX <- lapply(0:(d-length(req_idx)), function(x) which(rowSums(Q[,req_idx,drop=FALSE])==length(req_idx) & rowSums(Q[,-req_idx,drop=FALSE]) >= x))
    names(cols_geX) <- paste0(prefix,0:(d-length(req_idx)))
  }
  ## combined posterior probability = sum over the qualifying pattern columns
  pp_geX <- lapply(cols_geX, function(idx){
    if(length(idx)==1){
      return(post_prob[,idx])
    } else{
      return(rowSums(post_prob[,idx,drop=FALSE]))
    }
  })
  pp_geX <- do.call("cbind",pp_geX)
  colnames(pp_geX) <- names(cols_geX)
  return(pp_geX)
}
#' Collapse posterior probabilities of association for each individual trait.
#'
#' Combine the posterior probabilities for each individual trait/study according to the
#' association patterns from which that trait comes from the alternative distribution.
#' Provides the posterior probability of being associated with "at least trait \eqn{X}".
#' Can also require non-null association with one or more traits and combine
#' over patterns where both \eqn{X} and that/those trait(s) are from the alternative distribution.
#'
#' @inheritParams collapse_pp_num
#'
#' @inherit collapse_pp_num return
#'
#' @export
#'
collapse_pp_trait <- function(post_prob,req_idx=NULL,prefix="pp_"){
  ## number of traits implied by the 2^d association-pattern columns
  n_traits <- log(ncol(post_prob),2)
  ## binary pattern matrix (rows = patterns, cols = traits; 1 = alternative)
  Q <- Primo::make_qmat(1:n_traits)
  ## Decide which traits to report and, for each trait x, which pattern rows
  ## qualify: x must be non-null, optionally together with the req_idx trait(s).
  if(is.null(req_idx)){
    traits <- 1:n_traits
    select_patterns <- function(x) which(Q[,x] == 1)
  } else if(length(req_idx)==1){
    traits <- (1:n_traits)[-req_idx]
    select_patterns <- function(x) which(Q[,req_idx]==1 & Q[,x] == 1)
  } else {
    traits <- (1:n_traits)[-req_idx]
    select_patterns <- function(x) which(rowSums(Q[,req_idx])==length(req_idx) & Q[,x] == 1)
  }
  cols_X <- lapply(traits, select_patterns)
  names(cols_X) <- paste0(prefix,traits)
  ## combined posterior probability = sum over the qualifying pattern columns
  combined <- lapply(cols_X, function(idx){
    if(length(idx)==1) post_prob[,idx] else rowSums(post_prob[,idx])
  })
  result <- do.call("cbind",combined)
  colnames(result) <- names(cols_X)
  return(result)
}
#' Collapse posterior probabilities of association for combinations of individual traits.
#'
#' Combine the posterior probabilities for combinations of individual traits/studies according to the
#' association patterns from which those traits/studies all come from the alternative distribution.
#' Provides the posterior probability of being associated with "at least traits
#' \eqn{X}, \eqn{Y} and \eqn{Z}", for example.
#' Can also require non-null association with one or more traits/studies and combine
#' over patterns where that/those trait(s) also come from the alternative distribution.
#'
#' @inheritParams collapse_pp_num
#' @param combos matrix or integer specifying combinations. When matrix, each row is
#' a combination, with column indices of traits belonging to the combination specified in
#' columns of \code{combos}. When integer, all combinations of (total # traits)
#' taking \code{combos} elements at a time will be tested.
#'
#' @inherit collapse_pp_num return
#'
#' @examples
#' ## all pairs of traits
#' myCombos <- t(combn(4,2))
#' collapsed_pp <- collapse_pp_combin(post_prob,myCombos)
#'
#' @export
#'
collapse_pp_combin <- function(post_prob,combos,req_idx=NULL,prefix="pp"){
  ## total number of traits implied by the 2^d association-pattern columns
  d <- log(ncol(post_prob),2)
  ## binary pattern matrix (rows = patterns, cols = traits; 1 = alternative)
  Q <- Primo::make_qmat(1:d)
  ## an integer `combos` means all size-`combos` subsets of the d traits
  if(!is.matrix(combos)) combos <- t(utils::combn(d,combos))
  ## Pattern columns where every trait in the combination (plus any required
  ## traits) comes from the alternative. drop=FALSE guards single-column
  ## combinations, where plain `[` collapses Q to a vector and rowSums() errors.
  if(is.null(req_idx)){
    cols_X <- lapply(1:nrow(combos), function(i) which(rowSums(Q[,combos[i,],drop=FALSE]) == ncol(combos)))
  } else {
    cols_X <- lapply(1:nrow(combos), function(i)
      which(rowSums(Q[,c(req_idx,combos[i,]),drop=FALSE]) == length(req_idx)+ncol(combos)))
  }
  ## Combined posterior probability = sum over the qualifying pattern columns.
  ## drop=FALSE guards the single-pattern case (a combination of all d traits),
  ## where only one column qualifies and rowSums() would otherwise error.
  pp_X <- lapply(cols_X, function(idx){
    return(rowSums(post_prob[,idx,drop=FALSE]))
  })
  pp_X <- do.call("cbind",pp_X)
  ## Collapse req_idx into a single string so a multi-trait requirement yields
  ## one well-formed name per combination (the previous paste0 recycled a
  ## multi-element req_idx across combinations, producing wrong-length names).
  colnames(pp_X) <- paste0(prefix,paste(req_idx,collapse="_"),apply(combos,1, function(x) paste(x,collapse="_")))
  return(pp_X)
}