Primo: Package in R for Integrative Multi-Omics association analysis

Documented in Primo_conditional run_conditional run_conditional_gwas

#' Perform conditional analysis for a specified variant.
#'
#' For a specified variant, re-estimate the posterior probabilities of association
#' patterns, conditioning on other specified variants (e.g. lead SNPs for omics
#' traits). Returns the association pattern with the highest
#' posterior probability after conditional analysis.
#'
#' @param idx_snp integer matching the index (e.g. row of \code{Tstat_mod}) of the genetic variant
#' for which to perform conditional analysis.
#' @param idx_leadsnps vector of indices (e.g. rows of \code{Tstat_mod}) of the leading snps
#' on which to condition.
#' @param LD_mat matrix of genotype correlation coefficients (e.g. Pearson \eqn{r}{r}).
#' Rows and columns should match the order of (\code{idx_snp}, \code{idx_leadsnps}).
#' @param Primo_obj list returned by running the \eqn{t}-statistic version
#' of Primo (i.e. \code{\link{Primo_tstat}} or \code{\link{Primo_modT}})
#'
#' @return The integer corresponding to the association pattern
#' with the highest posterior probability following conditional analysis.
#' The value returned, \eqn{k}, corresponds to the \eqn{k}-th index of \code{pis}
#' from the Primo output, and the \eqn{k}-th row of the \eqn{Q} matrix produced
#' by \code{\link{make_qmat}}.
#'
#'
#' @export
#'
Primo_conditional <- function(idx_snp,idx_leadsnps,LD_mat,Primo_obj){
  n_leadsnps<-length(idx_leadsnps)
  zi<-Primo_obj$Tstat_mod[c(idx_snp,idx_leadsnps),]
  d<-ncol(zi)
  Q <- make_qmat(1:d)
  V <- Primo_obj$V_mat[c(idx_snp,idx_leadsnps),]
  mdf_sd<-Primo_obj$mdf_sd_mat[c(idx_snp,idx_leadsnps),]
  post_prob<-Primo_obj$post_prob
  pis<-Primo_obj$pis
  v2<-NULL
  Gamma<-Primo_obj$Gamma

  ## variance of t-statistic for each lead SNP under most plausible association pattern
  for(i in 1:n_leadsnps){
    q2<-Q[which.max(post_prob[idx_leadsnps[i],]),]
    v2<-rbind(v2,(V[(i+1),]%*%diag(q2)+ matrix(1, nrow=1,ncol=d)%*%diag( 1-q2))*matrix(mdf_sd[(i+1),],ncol=d))
  }

  CD_mat=matrix(NA, nrow=1, ncol=2^d)

  ## joint conditional densities under each pattern for index SNP
  for(k in 1:2^d){
    v_k <-  (V[1,]%*%diag(Q[k,]) + matrix(1, nrow=1,ncol=d)%*%diag( 1-Q[k,]))*matrix(mdf_sd[1,],ncol=d)
    cmean<-NULL
    csigma<-NULL
    for(j in 1:d){
      Var<-(c(v_k[j],v2[,j])%*%t(c(v_k[j],v2[,j])))*LD_mat
      cmean<-c(cmean,Var[1,(2:(1+n_leadsnps))]%*%solve(Var[(2:(1+n_leadsnps)),(2:(1+n_leadsnps))])%*%zi[(2:(1+n_leadsnps)),j])
      csigma<-c(csigma,Var[1,1] - Var[1,(2:(1+n_leadsnps))]%*%solve(Var[(2:(1+n_leadsnps)),(2:(1+n_leadsnps))])%*%as.matrix(Var[1,(2:(1+n_leadsnps))]))
    }
    sigma<-(sqrt(csigma)%*%t(sqrt(csigma)))*Gamma
    CD_mat[,k]<-mvtnorm::dmvnorm(zi[1,],mean = cmean,sigma =sigma,log=TRUE)
  }

  sp<-which.max(CD_mat+log(pis))

  return(sp)
}

#' Set up conditional analysis for specified variant(s) by index.
#'
#' For specified variant(s) (passed by index), set-up and run conditional analysis.
#' The function uses a data.table of lead SNPs to identify possible SNPs
#' for conditional analysis, and determines which SNPs
#' will be conditioned on based on specified criteria. Returns the association
#' pattern with the highest posterior probability for each specified variant
#' after conditional analysis.
#'
#' @param Primo_obj list returned by running the \eqn{t}-statistic version
#' of Primo (i.e. \code{\link{Primo_tstat}})
#' @param IDs data.table of the SNP and phenotype IDs corresponding to each row
#' of the Primo results stored in \code{Primo_obj}.
#' @param idx integer of the index of the genetic variant (e.g. row of \code{Tstat_mod})
#' on which one wants to perform conditional analysis.
#' @param leadsnps_region data.table that stores the lead SNP of each phenotype
#' in each region of the Primo results. Also includes \eqn{P}-values for the lead SNPs.
#' See Details for format.
#' @param snp_col string of the column name of SNPs/variants.
#' @param pheno_cols character vector of the column names of the phenotype ID columns.
#' @param snp_info data.table reporting the chromosome and position of each SNP.
#' Columns must include: \code{SNP, CHR, POS}.
#' @param LD_mat matrix of genotype correlation coefficients (e.g. Pearson \eqn{r}{r}). Row and column names
#' should be SNP/variant names (e.g. in \code{snp_col}).
#' @param LD_thresh scalar corresponding to the LD \eqn{r^{2}}{r^2}
#' threshold to be used for conditional analysis. Lead SNPs with \eqn{r^{2} <}{r^2 <}
#' \code{LD_thresh} with the \code{idx} variant will be conditioned on.
#' Default value (1) signifies no consideration of LD in conditional analyses.
#' @param dist_thresh scalar of the minimum number of base pairs away from the \code{idx} variant
#' that a lead SNP must be in order to be conditioned on. Default value (0)
#' signifies no consideration of chromosomal distance in conditional analyses.
#' @param pval_thresh scalar of the \eqn{P}-value threshold a lead SNP must be below
#' with the phenotype for which it is lead SNP in order to be conditioned on.
#' Default value (1) signifies no consideration of strength of effect in conditional analyses.
#' @param suffices character vector of the suffices corresponding to columns in
#' \code{leadsnps_region}.
#'
#' @inherit Primo_conditional return
#'
#' @return An integer vector corresponding to the association pattern
#' with the highest posterior probabilities for each variant
#' represented by \code{idx}, following conditional analysis.
#' Each value returned, \eqn{k}, corresponds to the \eqn{k}-th column of \code{pis}
#' from the Primo output, and the \eqn{k}-th row of the \eqn{Q} matrix produced
#' by \code{\link{make_qmat}}.
#'
#' @export
#'
run_conditional <- function(Primo_obj,IDs,idx,leadsnps_region,snp_col="SNP",pheno_cols,snp_info,LD_mat,LD_thresh=1,dist_thresh=0,pval_thresh=1,suffices=1:length(pheno_cols)){

  base::requireNamespace("data.table")

  if(!data.table::is.data.table(IDs)) IDs <- data.table::data.table(IDs)
  if(!data.table::is.data.table(leadsnps_region)) leadsnps_region <- data.table::data.table(leadsnps_region)
  if(!data.table::is.data.table(snp_info)) snp_info <- data.table::data.table(snp_info)

  sp_vec <- NULL

  for(i in idx){
    ## get current region
    curr.IDs <- IDs[i,]
    curr.SNP <- subset(curr.IDs,select=snp_col)[[1]]
    curr.Region <- merge(leadsnps_region,curr.IDs,by=pheno_cols)

    ## subset Primo results to the current region
    IDs_copy <- IDs
    IDs_copy$ObsNum <- 1:nrow(IDs_copy)
    IDs_copy <- merge(IDs_copy,curr.IDs,by=pheno_cols)
    curr_Region.idx <- IDs_copy$ObsNum
    Primo_obj_sub <- Primo::subset_Primo_obj(Primo_obj,curr_Region.idx)
    IDs_copy <- IDs[curr_Region.idx,]

    ## melt data so each lead SNP is its own row
    curr.Region_long <- data.table::melt(curr.Region,id.vars=pheno_cols,measure.vars = paste0("leadSNP_",suffices),
                                         value.name=snp_col)
    curr.Region_long <- cbind(curr.Region_long,pval=data.table::melt(curr.Region,id.vars=pheno_cols,measure.vars = paste0("pvalue_",suffices),
                                                                     value.name="pval")$pval)

    ## merge in chr and position to calculate distance between lead SNPs and SNP of interest
    data.table::setkeyv(curr.Region_long,snp_col)
    data.table::setkeyv(snp_info,snp_col)
    curr.Region_long <- merge(curr.Region_long,snp_info)
    curr.Region_long$dist <- abs(snp_info$POS[which(subset(snp_info, select=snp_col)[[1]]==curr.SNP)] - curr.Region_long$POS)
    ## merge in LD coefficients
    curr.Region_long$LD_r2 <- LD_mat[curr.SNP,subset(curr.Region_long,select=snp_col)[[1]]]^2

    ## determine which lead SNPs meet criteria
    keep_idx <- which(curr.Region_long$dist > dist_thresh & curr.Region_long$pval <= pval_thresh & curr.Region_long$LD_r2 < LD_thresh)
    leadSNPs <- unique(subset(curr.Region_long[keep_idx,],select=snp_col)[[1]])

    ## index of SNP of interest
    idx_snp <- which(subset(IDs_copy,select=snp_col)[[1]]==curr.SNP)

    if(length(leadSNPs)==0){
      if(is.matrix(Primo_obj_sub$post_prob)){
        sp_vec <- c(sp_vec,which.max(Primo_obj_sub$post_prob[idx_snp,]))
      } else sp_vec <- c(sp_vec,which.max(Primo_obj_sub$post_prob))
    } else{

      ## get snp_indices for other snps to adjust for
      idx_leadsnps <- NULL
      for(j in 1:length(leadSNPs)){
        idx_leadsnps <- c(idx_leadsnps, which(subset(IDs_copy,select=snp_col)[[1]]==leadSNPs[j]))
      }

      ## run conditional analysis
      sp <- Primo::Primo_conditional(idx_snp,idx_leadsnps,LD_mat[c(curr.SNP,leadSNPs),c(curr.SNP,leadSNPs)],Primo_obj_sub)
      sp_vec <- c(sp_vec,sp)
    }
  }

  return(sp_vec)
}

#' Conditional analysis for known complex trait-associated variants.
#'
#' For specified, known complex trait-associated (GWAS) variant(s),
#' set-up and run conditional analysis.
#' The function identifies lead omics SNPs to consider for conditional analysis,
#' and determines which SNPs will be conditioned on for each GWAS variant
#' based on specified criteria. Returns a data.frame with posterior probabilities
#' for collapsed association patterns and results from conditional analysis.
#' Also returns a vector of estimated FDR for each collapsed association pattern
#' at a specified posterior probability threshold.
#'
#' @param Primo_obj list returned by running the \eqn{t}-statistic version
#' of Primo (i.e. \code{\link{Primo_tstat}})
#' @param IDs data.frame of the SNP and phenotype IDs corresponding to each row
#' of the Primo results stored in \code{Primo_obj}.
#' @param gwas_snps character vector of known trait-associated (GWAS) SNPs.
#' @param pvals matrix of \eqn{P}-values from test statistics.
#' @param LD_mat matrix of genotype correlation coefficients (e.g. Pearson \eqn{r}{r}). Row and column names
#' should be SNP/variant names (i.e matching variant names in \code{IDs}).
#' @param snp_info data.frame reporting the chromosome and position of each SNP.
#' Columns must include: \code{SNP, CHR, POS}.
#' @param pp_thresh scalar of the posterior probability threshold used for significance.
#' @param LD_thresh scalar corresponding to the LD \eqn{r^{2}}{r^2}
#' threshold to be used for conditional analysis. Lead omics SNPs with \eqn{r^{2} <}{r^2 <}
#' \code{LD_thresh} with the GWAS SNP will be conditioned on.
#' @param dist_thresh scalar of the minimum number of base pairs away from the GWAS SNP
#' that a lead SNP must be in order to be conditioned on.
#' @param pval_thresh scalar of the \eqn{P}-value threshold a lead SNP must be below
#' with the phenotype for which it is lead SNP in order to be conditioned on.
#' @param gwas_col integer of the data column (e.g. in \code{Primo_obj$Tstat_mod}) of the
#' GWAS study. Default is the first data column.
#' @param use_ID_labels logical of whether or not phenotype column names in \code{IDs}
#' should be used as labels for identifying possible omics associations. If \code{FALSE},
#' labels will append \eqn{j} to 'study', where \eqn{j} is the corresponding data column
#' of the omics phenotype.
#'
#'
#' @return A list with two elements, \code{pp_grouped} and \code{fdr}.
#'
#' \code{fdr} is a named vector of the estimated false discovery rates (FDR)
#' for each collapsed association pattern at the posterior probability
#' threshold, \code{pp_thresh}.
#'
#' \code{pp_grouped} is a data.frame with the following information:
#'
#' \itemize{
#'   \item SNP and trait identifiers corresponding to each observation
#'   \item posterior probabilities of the collapsed association patterns
#'   ("GWAS + at least \eqn{n} omics trait(s)")
#'   \item number of omics traits with which the SNP was associated before conditional analysis
#'   (at posterior probability \code{> pp_thresh})
#'   \item number of omics traits with which the SNP is associated after conditional analysis
#'   \item top association pattern before conditional analysis
#'   \item top association pattern after conditional analysis
#'   \item omics traits with which SNP may be associated based on top patterns
#'   before and after conditional analysis
#' }
#'
#' @export
#'
run_conditional_gwas <- function(Primo_obj,IDs,gwas_snps,pvals,LD_mat,snp_info,pp_thresh,LD_thresh=0.9,dist_thresh=5e3,pval_thresh=1e-3,gwas_col=1,use_ID_labels=TRUE){

  snp_col <- colnames(IDs)[1]
  pheno_cols <- colnames(IDs)[2:ncol(IDs)]

  ## use subset command to allow IDs to be either data.table or data.frame
  gwas_idx <- which(subset(IDs, select=snp_col)[[1]] %in% gwas_snps)

  colnames(pvals) <- paste0("pvalue_",1:ncol(pvals))

  ## determine lead omics SNPs
  ID_pvals <- data.table::data.table(IDs,pvals)
  leadsnps_region <- Primo::find_leadsnps(data=ID_pvals,snp_col=snp_col,pheno_cols=pheno_cols,
                                          stat_cols=colnames(ID_pvals)[(ncol(IDs)+1):ncol(ID_pvals)],data_type="pvalue")

  ##determine top pattern before conditional analysis
  PP_gwas <- Primo_obj$post_prob[gwas_idx,]
  if(length(gwas_idx)==1){
    orig_max <- which.max(PP_gwas)
  } else{
    orig_max <- max.col(PP_gwas)
  }

  ## determine top pattern after conditional analysis
  sp_vec <- Primo::run_conditional(Primo_obj,IDs,idx=gwas_idx,leadsnps_region,snp_col,
                                   pheno_cols,snp_info,LD_mat,LD_thresh,dist_thresh,pval_thresh)

  ## collapsed probabilities
  PP_grouped <- Primo::collapse_pp_num(Primo_obj$post_prob[gwas_idx,],req_idx=gwas_col,prefix="pp_nQTL_ge")
  PP_grouped <- PP_grouped[,-1] ## drop 0qtl+GWAS since that is not goal of analysis

  ## number of QTLs, per collapsed probabilities
  PP_grouped <- cbind(PP_grouped,nQTL_orig=0)
  for(k in 1:(ncol(pvals)-1)){
    PP_grouped[,"nQTL_orig"] <- PP_grouped[,"nQTL_orig"] + as.numeric(PP_grouped[,paste0("pp_nQTL_ge",k)] > pp_thresh)
  }

  ## determine the number of QTLs in each corresponding pattern
  myQ <- Primo::make_qmat(1:ncol(pvals))
  QTL_cols <- 1:ncol(pvals)
  QTL_cols <- QTL_cols[-gwas_col]

  ## determine final nQTL based on pre- and post-conditional patterns with max. probability
  QTL_status_byObs <- myQ[orig_max,QTL_cols] * myQ[sp_vec,QTL_cols]
  nQTL_final <- rowSums(QTL_status_byObs)

  ## create character string of possible omics QTL associations
  if(use_ID_labels){
    study_names <- pheno_cols[QTL_cols]
  } else study_names <- paste0("study",QTL_cols)
  poss_assoc <- sapply(apply(QTL_status_byObs, 1, function(x) study_names[as.logical(x)]),function(y) paste(y,collapse=";"))

  ## add in final nQTL and patterns with max probability
  PP_grouped <- data.frame(PP_grouped,nQTL_final=nQTL_final,top_pattern_precond=orig_max,top_pattern_postcond=sp_vec,
                           poss_QTL_assoc=poss_assoc,stringsAsFactors = F)
  PP_grouped[,"nQTL_final"] <- pmin(PP_grouped[,"nQTL_orig"],PP_grouped[,"nQTL_final"])

  ## calculate estimated FDR
  fdr <- NULL
  for(i in 1:(ncol(pvals)-1)){
    pp_col <- paste0("pp_nQTL_ge",i)
    next_fdr <- Primo::calc_fdr_conditional(PP_grouped[,pp_col],thresh=pp_thresh,
                                            fail_idx=which(PP_grouped[,"nQTL_orig"]>=i & PP_grouped[,"nQTL_final"] < i))
    fdr <- c(fdr,next_fdr)
  }
  names(fdr) <- paste0("nQTL_ge",1:(ncol(pvals)-1))

  PP_grouped <- data.frame(IDs[gwas_idx,],PP_grouped,stringsAsFactors = F)

  return(list(pp_grouped=PP_grouped,fdr=fdr))
}