SVA confounding analysis framework

#' Cut p-values.
#'
#' Cut p-values according to a significance criterium.
#'
#' This is a simple internal function for cutting a vector of p-values into
#' a set of meaningful categories. The categories are taken from the original
#' book chapter that inspired the package.
#'
#' @param p_values A numeric containing the p-values
#' @return A factor containing the resulting categories.
#' @keywords internal
cut_pvalues = function(p_values) {
  result = cut(
    p_values,
    c(-Inf, 1e-10, 1e-5, 0.001, 0.01, 0.05, Inf),
    labels = c('< 1e-10', '< 1e-5', '< 0.001', '< 0.01', '< 0.05', 'NS'),
    include.lowest = TRUE
    )
  return(result)
}

#' Get p-value from F statistic data.
#'
#' Get p-value from F statistic data in the summary of an \code{lm} object.
#'
#' This is an internal function that extracts the p-value from the information
#' generated by a call to the \code{summary} function on a \code{lm} object. It
#' uses the F statistic value and the degrees of freedom in order to compute the
#' p-value.
#'
#' @param fstat A numeric vector of length three obtained from a linear model
#' fit.
#' @return The computed p-value.
#' @keywords internal
get_p_value = function(fstat) {
  if (is.numeric(fstat['value'])) {
    p_value = 1 - pf(fstat[['value']], fstat[['numdf']], fstat[['dendf']])
  } else {
    warning('Non-numeric F statistic. Setting p-value to 1.')
    p_value = 1
  }
  return(p_value)
}

#' Get variable names suitable for analysis.
#'
#' Get variable names that fulfill the conditions in order to be tested for
#' association.
#'
#' This is an internal function taking the phenotype data.frame as input. It
#' returns a character vector containing the names of those variables which can
#' be tested for association. The current implementation avoids testing on
#' categorical variables where all elements belong to the same level, or where
#' every level has only one observation.
#'
#' @param pdata A data.frame containing the phenotypical data.
#' @return A character vector containing the selected variable names.
#' @keywords internal
get_var_names = function(pdata) {
  if (!is.data.frame(pdata)) {
    stop('Input must be a data.frame.')
  }

  n_levels = sapply(pdata, function(xx) length(unique(xx)))
  var_names = colnames(pdata)[(n_levels > 1 & n_levels < nrow(pdata)) |
                                sapply(pdata, is.numeric)]

  if (length(var_names) == 0) {
    stop('There are no suitable variables for testing.')
  }

  names(var_names) = var_names

  return(var_names)
}

#' Compute associations with phenotypical variables.
#'
#' Compute associations with phenotypical and control variables in order to
#' find out which of them are significant, and thus possibly introducing
#' some confounding in the data.
#'
#' The current implementation allows the user to use two different methods
#' for testing association: a general linear model (lm) and a Kruskal-Wallis
#' test. User should be careful, as the Kruskal Wallis test should not be used
#' for continuous independent variables.
#'
#' An association test is performed between every component (Principal Component
#' or Surrogate Variable) and every phenotype and control variable. P-values are
#' then collected and stored in a \code{tbl_df} object describing all the
#' comparisons.
#'
#' This function receives variable names as a parameter.
#'
#' @inheritParams compute_significance_data
#' @param var_names Variable names to be used for calculating the p_values.
#' @return A data.frame with the results of the association tests.
#' @importFrom dplyr %>% mutate_
#' @importFrom tidyr gather_
#' @importFrom stats lm na.omit pf model.matrix kruskal.test
#' @keywords internal
compute_significance_data_var_names = function(values, pdata, component_names,
                                                var_names,
                                                method = c('lm', 'kruskal')) {
  method = match.arg(method)

  if (is.null(names(var_names))) {
    names(var_names) = var_names
  }

  if (method == 'kruskal') {
    is_col_numeric = vapply(pdata[, var_names], is.numeric, FALSE)

    if (any(is_col_numeric)) {
      stop(paste('Kruskal-Wallis method cannot be used when there are numeric',
                 'phenotype variables.'))
    }

    get_kruskal_p_value = function(val, pd) {
      kruskal.test(val, pd)[['p.value']]
    }
    pvalues = lapply(var_names,
                     function(xx) apply(values, 2,
                                        get_kruskal_p_value,
                                        as.factor(pdata[, xx])))
  } else {
    model_fits = lapply(var_names, function(xx) lm(values ~ pdata[, xx]))
    model_summaries = lapply(model_fits, summary)

    # Result being a nested list depends on the number of columns of values
    if (ncol(values) == 1) {
      pvalues = lapply(
        model_summaries,
        function(xx) get_p_value(xx[['fstatistic']])
      )
    } else {
      pvalues = lapply(
        model_summaries,
        function(xx) sapply(xx, function(yy) get_p_value(yy[['fstatistic']]))
      )
    }
  }
  p_values_df = data.frame(pvalues, check.names = FALSE)

  significance_data = p_values_df %>%
    mutate_('PC' = ~ component_names) %>%
    gather_('Variable', 'P_Value', var_names) %>%
    mutate_('Sig' = ~ cut_pvalues(P_Value))

  return(significance_data)
}

#' Compute associations with phenotypical variables.
#'
#' Compute associations with phenotypical and control variables in order to
#' find out which of them are significant, and thus possibly introducing
#' some confounding in the data.
#'
#' The current implementation allows the user to use two different methods
#' for testing association: a general linear model (lm) and a Kruskal-Wallis
#' test. User should be careful, as the Kruskal Wallis test should not be used
#' for continuous independent variables.
#'
#' An association test is performed between every component (Principal Component
#' or Surrogate Variable) and every phenotype and control variable. P-values are
#' then collected and stored in a \code{tbl_df} object describing all the
#' comparisons.
#'
#' @param values A matrix containing either principal components or surrogate
#' variables.
#' @param pdata A data.frame containing the phenotypical data for the samples.
#' @param component_names A character vector containing the component names.
#' @param method A character indicating the method used for the association
#' tests.
#' @return A data.frame with the results of the association tests.
#' @importFrom dplyr %>% mutate_
#' @importFrom tidyr gather_
#' @keywords internal
compute_significance_data = function (values, pdata, component_names,
                                      method = c('lm', 'kruskal')) {
  var_names = get_var_names(pdata)

  return(compute_significance_data_var_names(values, pdata, component_names,
                                        var_names, method))
}

#' Get control variables.
#'
#' Get a matrix containing the signals for the control probes of a given
#' RGChannelSet object.
#'
#' The function \code{get_control_variables} just takes a \code{RGChannelSet}
#' object as input, accesses the values of its control probes, and returns
#' them as a numeric matrix.
#'
#' This is an internal function. Its purpose is to provide the main analysis
#' functions with more data to test. It is usually useful to test if control
#' probes are introducing some confounding in the dataset, as it might be
#' signaling that there are unknown effects in the data.
#'
#' @param rgset An RGChannelSet object containing the control data.
#' @importFrom methods is
#' @importFrom minfi getControlAddress getRed getGreen
#' @return A matrix with the values of the control elements.
#' @keywords internal
get_control_variables = function(rgset) {
  if (!is(rgset, 'RGChannelSet')) {
    stop('Input must be of RGChannelSet type.')
  }

  bc1 = getControlAddress(rgset, controlType = c('BISULFITE CONVERSION I'))
  bc2 = getControlAddress(rgset, controlType = c('BISULFITE CONVERSION II'))
  ext = getControlAddress(rgset, controlType = c('EXTENSION'))
  tr = getControlAddress(rgset, controlType = c('TARGET REMOVAL'))
  hyb = getControlAddress(rgset, controlType = c('HYBRIDIZATION'))

  control_names = c(
    'BSC-I C1 Grn',
    'BSC-I C2 Grn',
    'BSC-I C3 Grn',
    'BSC-I C4 Red',
    'BSC-I C5 Red',
    'BSC-I C6 Red',
    'BSC-II C1 Red',
    'BSC-II C2 Red',
    'BSC-II C3 Red',
    'BSC-II C4 Red',
    'Target Removal 1 Grn',
    'Target Removal 2 Grn',
    'Hyb (Low) Grn',
    'Hyb (Medium) Grn',
    'Hyb (High) Grn',
    'Extension (A) Red',
    'Extension (T) Red',
    'Extension (C) Grn',
    'Extension (G) Grn'
  )

  control_signals = rbind(
    getGreen(rgset)[bc1[1:3],],
    getRed(rgset)[bc1[7:9],],
    getRed(rgset)[bc2[1:4],],
    getGreen(rgset)[tr[1:2],],
    getGreen(rgset)[hyb[1:3],],
    getRed(rgset)[ext[1:2],],
    getGreen(rgset)[ext[3:4],]
  )
  dimnames(control_signals) = list(control_names, colnames(rgset))

  datac2_m = t(log2(control_signals))

  return(datac2_m)
}