R/fdr.R

Defines functions vector_to_matrix fdr

Documented in fdr vector_to_matrix

#' Determine which associations are significant
#' 
#' Determines which associations are significant using emperical 
#' bayes and false discovery rate adjustment.
#' @param scores Either a vector or a symmetric matrix of association scores.
#' @param threshold The threshold for false discovery rate, used to determine 
#' significance; scores with ikelihood ratios above this threshold are
#' set to zero. If NULL, all scores are preserved.
#' @param transformation (option) Function applied to scores before
#' estimating the null distribution. If provided, the function should take 
#' a vector as input and return a vector for output.
#' @param robust Should robust estimates of mean and variance be used?
#' @param gene_names (optional) Vector of gene names.
#' @param include_likelihood Should the matrix of likelihood ratios be provided
#' in the output?
#' @param show_plot If true, a histogram of scores with estimated density
#' and null distribution is plotted.
#' @param ignore_zeroes If true, zeroes in scores input will be ignored.
#' @return A list containing: `scores`, the scores with non-significant values 
#' set to zero; `mu_f0`, the estimate of mu for the null distribution; `sigma_f0`,
#' the estimate of sigma for the null distribution; `f`, the empirical density
#' estimate; and `likelihood`, (optional) the likelihood ratios (fdr rates)
#' for each score.
#' @export
#' @note The robust estimators used are median and MAD. If the majority of scores
#' are zero, the robust estimate for sigma might be zero; in this case the 
#' usual estimator is used instead.
fdr <- function(scores, 
                threshold = 0.05,
                transformation = NULL, 
                robust = TRUE,
                include_likelihood = FALSE,
                show_plot = TRUE,
                ignore_zeroes = TRUE) {
  
  # Determine whether scores is a vector or matrix.
  if(is.matrix(scores)) {
    if(!isSymmetric(unname(scores), tol = 10^-10)) stop("scores is not symmetric.")
    score_names <- colnames(scores)     # Keep column names for scores.
    scores <- scores[lower.tri(scores)] # Turn scores into vector.
    matrix_provided <- TRUE
  } else if(is.vector(scores)) {
    matrix_provided <- FALSE
  } else {
    stop("scores is neither a vector nor symmetric matrix.")
  }
  
  # Check that scores are not all zero (or have zero variability).
  if(all(scores == 0) || sd(scores) == 0) {
    warning("All scores have the same value. Returning scores unchanged.")
    if(matrix_provided) {
      # Put scores back into matrix form.
      scores <- vector_to_matrix(scores, diag_val = 0)
      colnames(scores) <- score_names
    }
    fdr_return_list <- list(scores = scores, 
                            mu_f0 = NA, 
                            sigma_f0 = NA,
                            f = NA,
                            transformation = transformation)
    return(fdr_return_list)
  }
  
  
  index_include <- 1:length(scores)
  if(ignore_zeroes) {
    index_include <- which(scores != 0)
  }
  # Apply transformation, if one is provided.
  if(!is.null(transformation)) {
    scores[index_include] <- transformation(scores[index_include])
    if(!is.vector(scores)) stop("transformation did not return a vector as output.")
  }
  
  
  
  # Estimate paramters of null distribution (assumed to be Gaussian).
  if(robust) {
    mu_f0 <- median(scores[index_include]) # Use median to account for any skewness.
    sigma_f0 <- 1.4826 * median(abs(scores[index_include] - median(scores[index_include]))) # Use 1.4826 * MAD.
  }
  if(!robust || (robust && sigma_f0 == 0)){
    mu_f0 <- mean(scores[index_include]) 
    sigma_f0 <- sd(scores[index_include]) 
  }
  
  # Emperical Bayes FDR. Save likelihood ratios.
  f <- density(scores[index_include], 
               kernel = "gaussian")
  likelihood <- rep(1, length(scores))
  likelihood[index_include] <- dnorm(scores[index_include], mu_f0, sigma_f0) / 
    approx(f, xout = scores[index_include])$y
  
  # Generate plot of requested.
  if(show_plot) {
    hist(scores[index_include], freq = FALSE)
    par(xpd = FALSE) # Keep lines within plot
    curve(approx(f, xout = x)$y, col = "blue", add = TRUE)
    curve(dnorm(x, mu_f0, sigma_f0), col = "orange", add = TRUE)
    if(!is.null(threshold)) {
      lower <- scores[(likelihood < threshold) & (scores < mu_f0)]
      if(length(lower) > 0) abline(v = max(lower), col = "red")
      upper <- scores[(likelihood < threshold) & (scores > mu_f0)]
      if(length(upper) > 0) abline(v = min(upper), col = "red")
    }
  }
  
  # Apply thresholding if provided.
  if(!is.null(threshold)) {
    scores[likelihood > threshold] <- 0
  } else {
    # If no threshold is provided, return likelihood ratios.
    include_likelihood <- TRUE
  }
  
  if(matrix_provided) {
    # Put scores back into matrix form.
    scores <- vector_to_matrix(scores, diag_val = 0)
    colnames(scores) <- score_names
    
    # If likelihoods are requested, put them into matrix form.
    if(include_likelihood) {
      likelihood <- vector_to_matrix(likelihood, diag_val = Inf)
      colnames(likelihood) <- score_names
    }
  } else {
    if(include_likelihood) {
      names(likelihood) <- names(scores) # Pair score names with likelihoods.
    }
  }
  
  fdr_return_list <- list(scores = scores, 
                          mu_f0 = mu_f0, 
                          sigma_f0 = sigma_f0,
                          f = f,
                          transformation = transformation)
  if(include_likelihood) {
    fdr_return_list <- c(fdr_return_list, 
                         list(likelihood = likelihood))
  }
  
  return(fdr_return_list)
}

#' Turn a vector into a matrix
#' 
#' The input vector is assumed to be the lower.tri of a symmetric matrix. This
#' function recreates the original matrix from the vector.
#' @param x the vector containing lower.tri entries of a symmetric matrix.
#' @param diag_val value to put along the diagonal.
#' @return A symmetric matrix.
#' @export
vector_to_matrix <- function(x, diag_val = 0) {
  if(!is.vector(x)) stop("x should be a vector.")
  
  p <- sqrt(1 + 8 * length(x)) / 2 + 0.5 # Dimension of original matrix.
  if((p %% 1) != 0) stop("x is not the lower.tri entries of a square matrix.")
  
  y <- matrix(0, p, p) # Setup the output matrix
  y[lower.tri(y)] <- x # Add x to the lower.tri
  y <- y + t(y)        # Add x to the upper.tri (maintaining symmetry).
  
  diag(y) <- diag_val
  
  return(y)
}
tgrimes/dnapath2 documentation built on May 21, 2020, 5:53 p.m.