R/direct_influence.R
In cape: Combined Analysis of Pleiotropy and Epistasis for Diversity Outbred Mice

Documented in direct_influence

#' Calculate the significance of direct influences of variant pairs on phenotypes
#' 
#' This function rotates the variant-to-eigentrait effects back to variant-to-phenotype 
#' effects. It multiplies the \eqn{\beta}-coefficient matrices of each variant (i) and each 
#' phenotype (j) (\eqn{\beta_{i}^{j}}) by the singular value matrices (\eqn{V \cdot W^{T}}) 
#' obtained from the singular value decomposition performed in \code{\link{get_eigentraits}}. 
#' \eqn{\beta_{i}^{j} = V \cdot W^{T}}. It also uses the permutation data from the pairwise 
#' scan (\code{\link{pairscan}}) to calculate an empirical p value for the influence of each 
#' marker pair on each phenotype. The empirical p values are then adjusted for multiple 
#' testing using Holm's step-down procedure.
#' 
#' @param data_obj a \code{\link{Cape}} object
#' @param pairscan_obj a pairscan object
#' @param transform_to_phenospace A logical value. If TRUE, the influence of each marker on 
#' each eigentrait is transformed to the influence of each marker on each of the original 
#' phenotypes. If FALSE, no transformation is made. If the pair scan was done on eigentraits, 
#' the influence of each marker on each eigentrait is calculated. If the pair scan was done 
#' on raw phenotypes, the influence of each marker on each phenotype is calculated. The 
#' default behavior is to transform variant influences on eigentraits to variant influences 
#' on phenotypes.
#' @param pval_correction 	One of "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none", 
#' indicating whether the p value correction method used should be the Holm step-down procedure, 
#' false discovery rate or local false discovery rate respectively.
#' @param perm_data The permutation data generated by \code{\link{pairscan}}.
#' @param save_permutations A logical value indicating whether the data from permutations should be 
#' saved. Saving the permutations requires more memory but can be helpful in diagnostics. If 
#' save_permutations is TRUE all permutation data are saved in an object called "permutation.data.RDS".
#' @param n_cores The number of cores to use if using parallel computing
#' @param path The path in which to write output data
#' @param verbose A logical value indicating whether to write progress to standard out.
#' 
#' @return This function returns data_obj with an additional list called 
#' max_var_to_pheno_influence. This list has one element for each trait. 
#' Each element is a table with eight columns: 
#' marker: the marker name
#' conditioning_marker: the marker whose effect was conditioned on to achieve the
#' maximum main effect of marker.   
#' coef: the direct influence coefficient.
#' se: the standard error of the direct influence coefficient 
#' t_stat: the t statistic for the direct influence coefficient
#' |t_stat|: the absolute value of the t statistic
#' emp_p: the empirical p value of the direct influence coefficient
#' p_adjusted: the adjusted p value of the direct influence coefficient.
#'
#' 
#' @export
 
direct_influence <- function(data_obj, pairscan_obj, transform_to_phenospace = TRUE, 
                             pval_correction = c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"), 
                             perm_data = NULL, save_permutations = FALSE, n_cores = 4, path = ".", verbose = FALSE) {
  
  
  #This object keeps large intermediate data
  intermediate_data <- vector(mode = "list")
  
  pval_correction = pval_correction[1]
  
  #calculate the direct influences for either the actual
  #tests or the permutations. Return a list with one element
  #for each phenotype with columns:
  #marker1, marker2, marker1.influence.coef, marker2.influence.coef, marker1.se, marker2.se
  
  dir_inf <- function(data_obj, perm){
    if(perm){
      scan_two_results <- pairscan_obj$pairscan_perm		
      intermediate_data$pairscan_results <- scan_two_results
      pairscan_obj$pairscan_perm <- NULL
    }else{
      scan_two_results <- pairscan_obj$pairscan_results
    }
    
    n_pairs <- length(scan_two_results[[1]][[1]][,1]) 
    marker_mat <- scan_two_results[[1]][[1]][,1:2] 
    colnames(marker_mat) <- c("marker1", "marker2")
    orig_pheno <- data_obj$pheno
    
    
    #if transform_to_phenospace is FALSE, figure out
    #if we are calculating the direct influences on
    #phenotypes or eigentraits
    
    if(!transform_to_phenospace){
      pheno_names <- names(pairscan_obj$pairscan_results)	
      pheno_check <- match(pheno_names, colnames(data_obj$pheno))
      if(length(which(!is.na(pheno_check))) == 0){ #if we scanned eigentraits
        ET <- data_obj$ET					
      }else{
        ET <- data_obj$pheno	
      }					
    }else{
      ET <- data_obj$ET
      right_sing_vals <- data_obj$right_singular_vectors
      diag_mat <- round(t(ET)%*%orig_pheno%*%right_sing_vals, 2) #get the singular value matrix
      pheno_names <- colnames(data_obj$pheno)			
    }			
    
    
    num_ET <- dim(ET)[2] 
    num_pheno <- length(pheno_names)
    coeff_names <- c("marker1", "marker2")
    
    #=========================================================
    # preallocate matrices to hold the statistics for each pair
    # The columns are for marker1, marker2, marker1.beta, 
    # marker2.beta, marker1.se and marker2.se (6 columns)
    # there is one of these for each phenotype
    #=========================================================
    stats_mat <- matrix(NA, ncol = 6, nrow = n_pairs)
    colnames(stats_mat) <- c("marker1", "marker2", "marker1_coef", "marker2_coef", "marker1_se", "marker2_se")
    stats_list <- vector(mode = "list", length = num_pheno)
    
    
    names(stats_list) <- pheno_names
    for(n in 1:num_pheno){
      stats_list[[n]] <- stats_mat
    }
    
    
    #This function grabs either the beta matrix or the se matrix
    #The result element determines which type of matrix will be 
    #returned: beta, result.element = 1, se, result.element = 2
    
    get_beta_prime <- function(scan_two_results, marker_pair_number){
      
      #the beta matrix is composed of the coefficients from each pairwise
      #marker model (except for the interaction coefficient)
      #we use all markers in each row and set the non-covariate entries to 0
      num_pheno <- length(scan_two_results)
      beta_mat <- matrix(NA, nrow = (dim(scan_two_results[[1]][[1]])[2]-3), ncol = num_pheno)
      # beta_mat <- matrix(NA, nrow = 2, ncol = num_pheno)
      for(ph in 1:num_pheno){
        beta_mat[,ph] <- as.numeric(scan_two_results[[ph]][[1]][marker_pair_number,3:(dim(scan_two_results[[ph]][[1]])[2]-1)])
      }				
      colnames(beta_mat) <-colnames(ET)
      rownames(beta_mat) <- colnames(scan_two_results[[ph]][[1]])[3:(dim(scan_two_results[[ph]][[1]])[2]-1)]
      
      
      #calculate the full beta prime matrix
      #if we are converting to phenotype space
      if(transform_to_phenospace){
        beta_prime <- beta_mat%*%diag_mat%*%t(right_sing_vals)
        #get only the stats for marker1 and marker2
        trunc_beta_prime <- matrix(beta_prime[(length(beta_prime[,1])-1):length(beta_prime[,1]),], nrow = 2)
        colnames(trunc_beta_prime) <- colnames(orig_pheno)
      }else{
        beta_prime <- beta_mat #otherwise, just use the straight beta matrix
        trunc_beta_prime <- matrix(beta_prime[(length(beta_prime[,1])-1):length(beta_prime[,1]),], nrow = 2)
        colnames(trunc_beta_prime) <- colnames(ET)
      }
      
      just_marker_beta <- vector(mode = "list", length = dim(trunc_beta_prime)[2])
      for(i in 1:length(just_marker_beta)){
        just_marker_beta[[i]] <- trunc_beta_prime[,i]
      }
      names(just_marker_beta) <- colnames(trunc_beta_prime)
      return(just_marker_beta)
      
    }
    
    
    get_se_prime <- function(scan_two_results, marker_pair_number){
      #also get the se prime matrix
      num_pheno <- length(scan_two_results)
      se_mat <- matrix(NA, nrow = (dim(scan_two_results[[1]][[1]])[2]-3), ncol = num_pheno)
      for(ph in 1:num_pheno){
        se_mat[,ph] <- as.numeric(scan_two_results[[ph]][[2]][marker_pair_number,3:(dim(scan_two_results[[ph]][[2]])[2]-1)])
      }				
      colnames(se_mat) <-colnames(ET)
      rownames(se_mat) <- colnames(scan_two_results[[ph]][[2]])[3:(dim(scan_two_results[[ph]][[2]])[2]-1)]
      
      
      #Calculate a beta se prime matrix from each
      #beta matrix by multiplying by the beta se, squared diagonal singular value 
      #matrix, and the square of the transpose of the right singular values.	
      
      sqrd_se_mat <- se_mat ^ 2
      if(transform_to_phenospace){
        temp_mat <- diag_mat %*% t(right_sing_vals)
        sqrd_temp_mat <- (temp_mat ^  2)
        se_prime <- sqrd_se_mat %*% sqrd_temp_mat
        colnames(se_prime) <- colnames(orig_pheno)
      }else{
        se_prime <- sqrd_se_mat
        colnames(se_prime) <- colnames(ET)
      }
      
      se_prime <- sqrt(se_prime)
      
      just_marker_se <- vector(mode = "list", length = dim(se_prime)[2])
      for(i in 1:length(just_marker_se)){
        just_marker_se[[i]] <- se_prime[((dim(se_prime)[1]-1):dim(se_prime)[1]),i]
      }
      names(just_marker_se) <- colnames(se_prime)
      return(just_marker_se)
      
    }
    
    
    #get the beta matrix and se matrix for each pair of markers		
    all_beta_prime <- apply(matrix(1:nrow(marker_mat), ncol = 1), 1, function(x) get_beta_prime(scan_two_results, x))
    
    #add these into the final stats_matrix
    for(i in 1:length(stats_list)){
      stats_list[[i]][,1:2] <- marker_mat
      stats_list[[i]][,3:4] <- t(sapply(all_beta_prime, function(x) x[[i]]))
    }
    all_beta_prime <- NULL
    
    #also get the se matrices
    all_se_prime <- apply(matrix(1:dim(marker_mat)[1], ncol = 1), 1, function(x) get_se_prime(scan_two_results, x))
    
    #add these into the final stats_matrix
    for(i in 1:length(stats_list)){
      stats_list[[i]][,5:6] <- t(sapply(all_se_prime, function(x) x[[i]]))
    }
    all_se_prime = NULL
    
    
    return(stats_list)
  }
  
  
  #================================================================
  # calculate the direct influence of the variants and permutations
  #================================================================
  
  if(verbose){cat("calculating direct influence of variants...\n")}
  exp_stats <- dir_inf(data_obj, perm = FALSE)
  intermediate_data$var_to_pheno_influence <- exp_stats
  
  if(is.null(perm_data)){
    if(verbose){cat("calculating direct influence of permutations...\n")}
    perm_stats <- dir_inf(data_obj, perm = TRUE)
    intermediate_data$var_to_pheno_influence_perm <- perm_stats
  }else{
    perm_stats <- perm_data$var_to_pheno_influence_perm
  }
  
  #================================================================
  #================================================================	
  
  
  
  direct_influence_stat <- function(pair_stats, perm){
    
    
    #separate out the markers from each pair
    #for each marker in each context, give it an 
    #influence coefficient, influence se, t stat
    #and |t stat|
    
    markers <- unique(c(pair_stats[[1]][,1], pair_stats[[1]][,2]))
    pheno_names <- names(pair_stats)
    
    #preallocate a matrix that will hold the statistics for each marker
    #in each pair context. The columns are marker, coef, se, t_stat
    #|t_stat|, emp.p (6 columns)
    if(perm){
      stats_mat <- matrix(NA, nrow = dim(pair_stats[[1]])[1]*2, ncol = 6)
      colnames(stats_mat) <- c("marker", "conditioning_marker", "coef", "se", "t_stat", "|t_stat|")
    }else{
      stats_mat <- matrix(NA, nrow = dim(pair_stats[[1]])[1]*2, ncol = 7)
      colnames(stats_mat) <- c("marker", "conditioning_marker", "coef", "se", "t_stat", "|t_stat|", "emp_p")			
    }
    
    ind_stats_list <- vector(mode = "list", length = length(pheno_names))
    names(ind_stats_list) <- pheno_names
    for(i in 1:length(ind_stats_list)){
      ind_stats_list[[i]] <- stats_mat
    }
    
    #for each phenotype, go through the marker names and
    #collect the statistics from var_to_pheno_influence 
    #for when the marker was in position 1 and in position 2
    for(ph in 1:length(pheno_names)){
      
      #go through each marker and take out its statistics
      for(m in markers){
        
        #find out where the marker was in position 1
        m1_locale <- which(pair_stats[[ph]][,1] == m)
        
        if(length(m1_locale) > 0){
          beta_coef1 <- as.numeric(pair_stats[[ph]][m1_locale, "marker1_coef"])
          other_m <- pair_stats[[ph]][m1_locale, "marker2"]
          se1 <- as.numeric(pair_stats[[ph]][m1_locale, "marker1_se"])
          t_stat1 <- beta_coef1/se1
          stat_section1 <- matrix(c(rep(m, length(beta_coef1)), other_m, beta_coef1, se1, t_stat1, abs(t_stat1)), ncol = 6)
          
          start_row <- min(which(is.na(ind_stats_list[[ph]][,1])))
          ind_stats_list[[ph]][start_row:(start_row + dim(stat_section1)[1] - 1),1:dim(stat_section1)[2]] <- stat_section1
        }
        
        #find out where the marker was in position 2
        m2_locale <- which(pair_stats[[ph]][,2] == m)
        
        if(length(m2_locale) > 0){
          beta_coef2 <- as.numeric(pair_stats[[ph]][m2_locale, "marker2_coef"])
          other_m <- pair_stats[[ph]][m2_locale, "marker1"]
          se2 <- as.numeric(pair_stats[[ph]][m2_locale, "marker2_se"])
          t_stat2 <- beta_coef2/se2
          stat_section2 <- matrix(c(rep(m, length(beta_coef2)), other_m, beta_coef2, se2, t_stat2, abs(t_stat2)), ncol = 6)
          
          start_row <- min(which(is.na(ind_stats_list[[ph]][,1])))
          ind_stats_list[[ph]][start_row:(start_row + dim(stat_section2)[1] - 1),1:dim(stat_section2)[2]] <- stat_section2
        }
        
      }
      
    }
    
    return(ind_stats_list)
    
  }
  
  
  #================================================================
  # tabulate influences of individual markers		
  #================================================================
  if(verbose){cat("calculating individual marker influences...\n")}
  exp_stats_per_marker <- direct_influence_stat(pair_stats = exp_stats, perm = FALSE)
  
  if(is.null(perm_data)){
    if(verbose){cat("calculating individual marker influences in permutations...\n")}
    perm_stats_per_marker <- direct_influence_stat(perm_stats, perm = TRUE)
    intermediate_data$var_to_pheno_test_stat_perm <- perm_stats_per_marker
  }else{
    perm_stats_per_marker <- perm_data$var_to_pheno_test_stat_perm
  }
  
  #================================================================
  
  
  direct_influence_epcal<- function(exp_stats_per_marker, perm_stats_per_marker){
    stat <- exp_stats_per_marker
    perm_test_stat <- perm_stats_per_marker		
    
    #caclualte an empirical p value for each entry in stat
    for(ph in 1:length(stat)){
      if(verbose){cat(names(stat)[ph], "\n")}
      comp_dist <- as.numeric(perm_test_stat[[ph]][,"|t_stat|"]) #compare each calculated value to the permuted distribution
      obs_dist <- as.numeric(stat[[ph]][,"|t_stat|"])
      
      all_emp_p <- calc_emp_p(obs_dist, comp_dist)
      stat[[ph]][,"emp_p"] <- all_emp_p
    }
    
    return(stat)
  }
  
  
  #================================================================
  # calculate empirical p values
  #================================================================
  if(verbose){cat("calculating p values...\n")}
  emp_p_table <- direct_influence_epcal(exp_stats_per_marker, perm_stats_per_marker)
  #replace the direct influence tables with the tables with the added empirical p values
  intermediate_data$var_to_pheno_test_stat <- emp_p_table
  
  #================================================================
  #================================================================
  
  
  direct_influence_max <- function(emp_p_table){
    
    stat <- emp_p_table
    
    markers <- unique(stat[[1]][,1])
    max_stat_list <- vector(length(stat), mode = "list")
    names(max_stat_list) <- names(stat)
    
    #go through each of the phenotypes and find the maximum 
    #influence of each marker
    for(ph in 1:length(stat)){
      max_stat_table <- matrix(NA, nrow = length(markers), ncol = ncol(stat[[ph]]))
      for(m in 1:length(markers)){
        marker_locale <- which(stat[[ph]][,1] == markers[m])
        temp_table <- stat[[ph]][marker_locale,,drop=FALSE]
        colnames(temp_table) <- colnames(stat[[ph]])
        max_stat_locale <- which(as.numeric(temp_table[,"|t_stat|"]) == max(as.numeric(temp_table[,"|t_stat|"]), na.rm = TRUE))
        max_stat_table[m,] <- temp_table[max_stat_locale[1],,drop=FALSE]
      }
      ordered_table <- max_stat_table[order(max_stat_table[,5], decreasing = TRUE),]
      max_stat_list[[ph]] <- ordered_table
    }
    
    return(max_stat_list)
  }
  
  
  if(save_permutations){
    data_obj$save_rds(intermediate_data, "permutation.data.RDS")
  }	
  
  
  all_p <- as.numeric(unlist(lapply(emp_p_table, function(x) x[,"emp_p"])))
  p_adjusted <- p.adjust(all_p, pval_correction)
  chunkp <- chunkV(p_adjusted, length(emp_p_table))		
  for(p in 1:length(chunkp)){
    emp_p_table[[p]] <- cbind(emp_p_table[[p]], chunkp[[p]])	
    colnames(emp_p_table[[p]])[8] <- "p_adjusted"
  }
  
  if(verbose){cat("calculating maximum influence of each marker...\n")}
  max_stat_list <- direct_influence_max(emp_p_table)
  for(i in 1:length(max_stat_list)){
    colnames(max_stat_list[[i]]) <- colnames(emp_p_table[[i]])
  }
  
  data_obj$max_var_to_pheno_influence <- max_stat_list
  
  
  return(data_obj)
}