#' Calculate the significance of direct influences of variant pairs on phenotypes
#'
#' This function rotates the variant-to-eigentrait effects back to variant-to-phenotype
#' effects. It multiplies the \eqn{\beta}-coefficient matrices of each variant (i) and each
#' phenotype (j) (\eqn{\beta_{i}^{j}}) by the singular value matrices (\eqn{V \cdot W^{T}})
#' obtained from the singular value decomposition performed in \code{\link{get_eigentraits}}.
#' \eqn{\beta_{i}^{j} = V \cdot W^{T}}. It also uses the permutation data from the pairwise
#' scan (\code{\link{pairscan}}) to calculate an empirical p value for the influence of each
#' marker pair on each phenotype. The empirical p values are then adjusted for multiple
#' testing using Holm's step-down procedure.
#'
#' @param data_obj a \code{\link{Cape}} object
#' @param pairscan_obj a pairscan object
#' @param transform_to_phenospace A logical value. If TRUE, the influence of each marker on
#' each eigentrait is transformed to the influence of each marker on each of the original
#' phenotypes. If FALSE, no transformation is made. If the pair scan was done on eigentraits,
#' the influence of each marker on each eigentrait is calculated. If the pair scan was done
#' on raw phenotypes, the influence of each marker on each phenotype is calculated. The
#' default behavior is to transform variant influences on eigentraits to variant influences
#' on phenotypes.
#' @param pval_correction One of "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none",
#' indicating whether the p value correction method used should be the Holm step-down procedure,
#' false discovery rate or local false discovery rate respectively.
#' @param perm_data The permutation data generated by \code{\link{pairscan}}.
#' @param save_permutations A logical value indicating whether the data from permutations should be
#' saved. Saving the permutations requires more memory but can be helpful in diagnostics. If
#' save_permutations is TRUE all permutation data are saved in an object called "permutation.data.RDS".
#' @param n_cores The number of cores to use if using parallel computing
#' @param path The path in which to write output data
#' @param verbose A logical value indicating whether to write progress to standard out.
#'
#' @return This function returns data_obj with an additional list called
#' max_var_to_pheno_influence. This list has one element for each trait.
#' Each element is a table with eight columns:
#' marker: the marker name
#' conditioning_marker: the marker whose effect was conditioned on to achieve the
#' maximum main effect of marker.
#' coef: the direct influence coefficient.
#' se: the standard error of the direct influence coefficient
#' t_stat: the t statistic for the direct influence coefficient
#' |t_stat|: the absolute value of the t statistic
#' emp_p: the empirical p value of the direct influence coefficient
#' p_adjusted: the adjusted p value of the direct influence coefficient.
#'
#'
#' @export
direct_influence <- function(data_obj, pairscan_obj, transform_to_phenospace = TRUE,
pval_correction = c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"),
perm_data = NULL, save_permutations = FALSE, n_cores = 4, path = ".", verbose = FALSE) {
#This object keeps large intermediate data
intermediate_data <- vector(mode = "list")
pval_correction = pval_correction[1]
#calculate the direct influences for either the actual
#tests or the permutations. Return a list with one element
#for each phenotype with columns:
#marker1, marker2, marker1.influence.coef, marker2.influence.coef, marker1.se, marker2.se
dir_inf <- function(data_obj, perm){
if(perm){
scan_two_results <- pairscan_obj$pairscan_perm
intermediate_data$pairscan_results <- scan_two_results
pairscan_obj$pairscan_perm <- NULL
}else{
scan_two_results <- pairscan_obj$pairscan_results
}
n_pairs <- length(scan_two_results[[1]][[1]][,1])
marker_mat <- scan_two_results[[1]][[1]][,1:2]
colnames(marker_mat) <- c("marker1", "marker2")
orig_pheno <- data_obj$pheno
#if transform_to_phenospace is FALSE, figure out
#if we are calculating the direct influences on
#phenotypes or eigentraits
if(!transform_to_phenospace){
pheno_names <- names(pairscan_obj$pairscan_results)
pheno_check <- match(pheno_names, colnames(data_obj$pheno))
if(length(which(!is.na(pheno_check))) == 0){ #if we scanned eigentraits
ET <- data_obj$ET
}else{
ET <- data_obj$pheno
}
}else{
ET <- data_obj$ET
right_sing_vals <- data_obj$right_singular_vectors
diag_mat <- round(t(ET)%*%orig_pheno%*%right_sing_vals, 2) #get the singular value matrix
pheno_names <- colnames(data_obj$pheno)
}
num_ET <- dim(ET)[2]
num_pheno <- length(pheno_names)
coeff_names <- c("marker1", "marker2")
#=========================================================
# preallocate matrices to hold the statistics for each pair
# The columns are for marker1, marker2, marker1.beta,
# marker2.beta, marker1.se and marker2.se (6 columns)
# there is one of these for each phenotype
#=========================================================
stats_mat <- matrix(NA, ncol = 6, nrow = n_pairs)
colnames(stats_mat) <- c("marker1", "marker2", "marker1_coef", "marker2_coef", "marker1_se", "marker2_se")
stats_list <- vector(mode = "list", length = num_pheno)
names(stats_list) <- pheno_names
for(n in 1:num_pheno){
stats_list[[n]] <- stats_mat
}
#This function grabs either the beta matrix or the se matrix
#The result element determines which type of matrix will be
#returned: beta, result.element = 1, se, result.element = 2
get_beta_prime <- function(scan_two_results, marker_pair_number){
#the beta matrix is composed of the coefficients from each pairwise
#marker model (except for the interaction coefficient)
#we use all markers in each row and set the non-covariate entries to 0
num_pheno <- length(scan_two_results)
beta_mat <- matrix(NA, nrow = (dim(scan_two_results[[1]][[1]])[2]-3), ncol = num_pheno)
# beta_mat <- matrix(NA, nrow = 2, ncol = num_pheno)
for(ph in 1:num_pheno){
beta_mat[,ph] <- as.numeric(scan_two_results[[ph]][[1]][marker_pair_number,3:(dim(scan_two_results[[ph]][[1]])[2]-1)])
}
colnames(beta_mat) <-colnames(ET)
rownames(beta_mat) <- colnames(scan_two_results[[ph]][[1]])[3:(dim(scan_two_results[[ph]][[1]])[2]-1)]
#calculate the full beta prime matrix
#if we are converting to phenotype space
if(transform_to_phenospace){
beta_prime <- beta_mat%*%diag_mat%*%t(right_sing_vals)
#get only the stats for marker1 and marker2
trunc_beta_prime <- matrix(beta_prime[(length(beta_prime[,1])-1):length(beta_prime[,1]),], nrow = 2)
colnames(trunc_beta_prime) <- colnames(orig_pheno)
}else{
beta_prime <- beta_mat #otherwise, just use the straight beta matrix
trunc_beta_prime <- matrix(beta_prime[(length(beta_prime[,1])-1):length(beta_prime[,1]),], nrow = 2)
colnames(trunc_beta_prime) <- colnames(ET)
}
just_marker_beta <- vector(mode = "list", length = dim(trunc_beta_prime)[2])
for(i in 1:length(just_marker_beta)){
just_marker_beta[[i]] <- trunc_beta_prime[,i]
}
names(just_marker_beta) <- colnames(trunc_beta_prime)
return(just_marker_beta)
}
get_se_prime <- function(scan_two_results, marker_pair_number){
#also get the se prime matrix
num_pheno <- length(scan_two_results)
se_mat <- matrix(NA, nrow = (dim(scan_two_results[[1]][[1]])[2]-3), ncol = num_pheno)
for(ph in 1:num_pheno){
se_mat[,ph] <- as.numeric(scan_two_results[[ph]][[2]][marker_pair_number,3:(dim(scan_two_results[[ph]][[2]])[2]-1)])
}
colnames(se_mat) <-colnames(ET)
rownames(se_mat) <- colnames(scan_two_results[[ph]][[2]])[3:(dim(scan_two_results[[ph]][[2]])[2]-1)]
#Calculate a beta se prime matrix from each
#beta matrix by multiplying by the beta se, squared diagonal singular value
#matrix, and the square of the transpose of the right singular values.
sqrd_se_mat <- se_mat ^ 2
if(transform_to_phenospace){
temp_mat <- diag_mat %*% t(right_sing_vals)
sqrd_temp_mat <- (temp_mat ^ 2)
se_prime <- sqrd_se_mat %*% sqrd_temp_mat
colnames(se_prime) <- colnames(orig_pheno)
}else{
se_prime <- sqrd_se_mat
colnames(se_prime) <- colnames(ET)
}
se_prime <- sqrt(se_prime)
just_marker_se <- vector(mode = "list", length = dim(se_prime)[2])
for(i in 1:length(just_marker_se)){
just_marker_se[[i]] <- se_prime[((dim(se_prime)[1]-1):dim(se_prime)[1]),i]
}
names(just_marker_se) <- colnames(se_prime)
return(just_marker_se)
}
#get the beta matrix and se matrix for each pair of markers
all_beta_prime <- apply(matrix(1:nrow(marker_mat), ncol = 1), 1, function(x) get_beta_prime(scan_two_results, x))
#add these into the final stats_matrix
for(i in 1:length(stats_list)){
stats_list[[i]][,1:2] <- marker_mat
stats_list[[i]][,3:4] <- t(sapply(all_beta_prime, function(x) x[[i]]))
}
all_beta_prime <- NULL
#also get the se matrices
all_se_prime <- apply(matrix(1:dim(marker_mat)[1], ncol = 1), 1, function(x) get_se_prime(scan_two_results, x))
#add these into the final stats_matrix
for(i in 1:length(stats_list)){
stats_list[[i]][,5:6] <- t(sapply(all_se_prime, function(x) x[[i]]))
}
all_se_prime = NULL
return(stats_list)
}
#================================================================
# calculate the direct influence of the variants and permutations
#================================================================
if(verbose){cat("calculating direct influence of variants...\n")}
exp_stats <- dir_inf(data_obj, perm = FALSE)
intermediate_data$var_to_pheno_influence <- exp_stats
if(is.null(perm_data)){
if(verbose){cat("calculating direct influence of permutations...\n")}
perm_stats <- dir_inf(data_obj, perm = TRUE)
intermediate_data$var_to_pheno_influence_perm <- perm_stats
}else{
perm_stats <- perm_data$var_to_pheno_influence_perm
}
#================================================================
#================================================================
direct_influence_stat <- function(pair_stats, perm){
#separate out the markers from each pair
#for each marker in each context, give it an
#influence coefficient, influence se, t stat
#and |t stat|
markers <- unique(c(pair_stats[[1]][,1], pair_stats[[1]][,2]))
pheno_names <- names(pair_stats)
#preallocate a matrix that will hold the statistics for each marker
#in each pair context. The columns are marker, coef, se, t_stat
#|t_stat|, emp.p (6 columns)
if(perm){
stats_mat <- matrix(NA, nrow = dim(pair_stats[[1]])[1]*2, ncol = 6)
colnames(stats_mat) <- c("marker", "conditioning_marker", "coef", "se", "t_stat", "|t_stat|")
}else{
stats_mat <- matrix(NA, nrow = dim(pair_stats[[1]])[1]*2, ncol = 7)
colnames(stats_mat) <- c("marker", "conditioning_marker", "coef", "se", "t_stat", "|t_stat|", "emp_p")
}
ind_stats_list <- vector(mode = "list", length = length(pheno_names))
names(ind_stats_list) <- pheno_names
for(i in 1:length(ind_stats_list)){
ind_stats_list[[i]] <- stats_mat
}
#for each phenotype, go through the marker names and
#collect the statistics from var_to_pheno_influence
#for when the marker was in position 1 and in position 2
for(ph in 1:length(pheno_names)){
#go through each marker and take out its statistics
for(m in markers){
#find out where the marker was in position 1
m1_locale <- which(pair_stats[[ph]][,1] == m)
if(length(m1_locale) > 0){
beta_coef1 <- as.numeric(pair_stats[[ph]][m1_locale, "marker1_coef"])
other_m <- pair_stats[[ph]][m1_locale, "marker2"]
se1 <- as.numeric(pair_stats[[ph]][m1_locale, "marker1_se"])
t_stat1 <- beta_coef1/se1
stat_section1 <- matrix(c(rep(m, length(beta_coef1)), other_m, beta_coef1, se1, t_stat1, abs(t_stat1)), ncol = 6)
start_row <- min(which(is.na(ind_stats_list[[ph]][,1])))
ind_stats_list[[ph]][start_row:(start_row + dim(stat_section1)[1] - 1),1:dim(stat_section1)[2]] <- stat_section1
}
#find out where the marker was in position 2
m2_locale <- which(pair_stats[[ph]][,2] == m)
if(length(m2_locale) > 0){
beta_coef2 <- as.numeric(pair_stats[[ph]][m2_locale, "marker2_coef"])
other_m <- pair_stats[[ph]][m2_locale, "marker1"]
se2 <- as.numeric(pair_stats[[ph]][m2_locale, "marker2_se"])
t_stat2 <- beta_coef2/se2
stat_section2 <- matrix(c(rep(m, length(beta_coef2)), other_m, beta_coef2, se2, t_stat2, abs(t_stat2)), ncol = 6)
start_row <- min(which(is.na(ind_stats_list[[ph]][,1])))
ind_stats_list[[ph]][start_row:(start_row + dim(stat_section2)[1] - 1),1:dim(stat_section2)[2]] <- stat_section2
}
}
}
return(ind_stats_list)
}
#================================================================
# tabulate influences of individual markers
#================================================================
if(verbose){cat("calculating individual marker influences...\n")}
exp_stats_per_marker <- direct_influence_stat(pair_stats = exp_stats, perm = FALSE)
if(is.null(perm_data)){
if(verbose){cat("calculating individual marker influences in permutations...\n")}
perm_stats_per_marker <- direct_influence_stat(perm_stats, perm = TRUE)
intermediate_data$var_to_pheno_test_stat_perm <- perm_stats_per_marker
}else{
perm_stats_per_marker <- perm_data$var_to_pheno_test_stat_perm
}
#================================================================
direct_influence_epcal<- function(exp_stats_per_marker, perm_stats_per_marker){
stat <- exp_stats_per_marker
perm_test_stat <- perm_stats_per_marker
#caclualte an empirical p value for each entry in stat
for(ph in 1:length(stat)){
if(verbose){cat(names(stat)[ph], "\n")}
comp_dist <- as.numeric(perm_test_stat[[ph]][,"|t_stat|"]) #compare each calculated value to the permuted distribution
obs_dist <- as.numeric(stat[[ph]][,"|t_stat|"])
all_emp_p <- calc_emp_p(obs_dist, comp_dist)
stat[[ph]][,"emp_p"] <- all_emp_p
}
return(stat)
}
#================================================================
# calculate empirical p values
#================================================================
if(verbose){cat("calculating p values...\n")}
emp_p_table <- direct_influence_epcal(exp_stats_per_marker, perm_stats_per_marker)
#replace the direct influence tables with the tables with the added empirical p values
intermediate_data$var_to_pheno_test_stat <- emp_p_table
#================================================================
#================================================================
direct_influence_max <- function(emp_p_table){
stat <- emp_p_table
markers <- unique(stat[[1]][,1])
max_stat_list <- vector(length(stat), mode = "list")
names(max_stat_list) <- names(stat)
#go through each of the phenotypes and find the maximum
#influence of each marker
for(ph in 1:length(stat)){
max_stat_table <- matrix(NA, nrow = length(markers), ncol = ncol(stat[[ph]]))
for(m in 1:length(markers)){
marker_locale <- which(stat[[ph]][,1] == markers[m])
temp_table <- stat[[ph]][marker_locale,,drop=FALSE]
colnames(temp_table) <- colnames(stat[[ph]])
max_stat_locale <- which(as.numeric(temp_table[,"|t_stat|"]) == max(as.numeric(temp_table[,"|t_stat|"]), na.rm = TRUE))
max_stat_table[m,] <- temp_table[max_stat_locale[1],,drop=FALSE]
}
ordered_table <- max_stat_table[order(max_stat_table[,5], decreasing = TRUE),]
max_stat_list[[ph]] <- ordered_table
}
return(max_stat_list)
}
if(save_permutations){
data_obj$save_rds(intermediate_data, "permutation.data.RDS")
}
all_p <- as.numeric(unlist(lapply(emp_p_table, function(x) x[,"emp_p"])))
p_adjusted <- p.adjust(all_p, pval_correction)
chunkp <- chunkV(p_adjusted, length(emp_p_table))
for(p in 1:length(chunkp)){
emp_p_table[[p]] <- cbind(emp_p_table[[p]], chunkp[[p]])
colnames(emp_p_table[[p]])[8] <- "p_adjusted"
}
if(verbose){cat("calculating maximum influence of each marker...\n")}
max_stat_list <- direct_influence_max(emp_p_table)
for(i in 1:length(max_stat_list)){
colnames(max_stat_list[[i]]) <- colnames(emp_p_table[[i]])
}
data_obj$max_var_to_pheno_influence <- max_stat_list
return(data_obj)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.