#' Cut p-values.
#'
#' Cut p-values according to a significance criterium.
#'
#' This is a simple internal function for cutting a vector of p-values into
#' a set of meaningful categories. The categories are taken from the original
#' book chapter that inspired the package.
#'
#' @param p_values A numeric containing the p-values
#' @return A factor containing the resulting categories.
#' @keywords internal
cut_pvalues = function(p_values) {
result = cut(
p_values,
c(-Inf, 1e-10, 1e-5, 0.001, 0.01, 0.05, Inf),
labels = c('< 1e-10', '< 1e-5', '< 0.001', '< 0.01', '< 0.05', 'NS'),
include.lowest = TRUE
)
return(result)
}
#' Get p-value from F statistic data.
#'
#' Get p-value from F statistic data in the summary of an \code{lm} object.
#'
#' This is an internal function that extracts the p-value from the information
#' generated by a call to the \code{summary} function on a \code{lm} object. It
#' uses the F statistic value and the degrees of freedom in order to compute the
#' p-value.
#'
#' @param fstat A numeric vector of length three obtained from a linear model
#' fit.
#' @return The computed p-value.
#' @keywords internal
get_p_value = function(fstat) {
if (is.numeric(fstat['value'])) {
p_value = 1 - pf(fstat[['value']], fstat[['numdf']], fstat[['dendf']])
} else {
warning('Non-numeric F statistic. Setting p-value to 1.')
p_value = 1
}
return(p_value)
}
#' Get variable names suitable for analysis.
#'
#' Get variable names that fulfill the conditions in order to be tested for
#' association.
#'
#' This is an internal function taking the phenotype data.frame as input. It
#' returns a character vector containing the names of those variables which can
#' be tested for association. The current implementation avoids testing on
#' categorical variables where all elements belong to the same level, or where
#' every level has only one observation.
#'
#' @param pdata A data.frame containing the phenotypical data.
#' @return A character vector containing the selected variable names.
#' @keywords internal
get_var_names = function(pdata) {
if (!is.data.frame(pdata)) {
stop('Input must be a data.frame.')
}
n_levels = sapply(pdata, function(xx) length(unique(xx)))
var_names = colnames(pdata)[(n_levels > 1 & n_levels < nrow(pdata)) |
sapply(pdata, is.numeric)]
if (length(var_names) == 0) {
stop('There are no suitable variables for testing.')
}
names(var_names) = var_names
return(var_names)
}
#' Compute associations with phenotypical variables.
#'
#' Compute associations with phenotypical and control variables in order to
#' find out which of them are significant, and thus possibly introducing
#' some confounding in the data.
#'
#' The current implementation allows the user to use two different methods
#' for testing association: a general linear model (lm) and a Kruskal-Wallis
#' test. User should be careful, as the Kruskal Wallis test should not be used
#' for continuous independent variables.
#'
#' An association test is performed between every component (Principal Component
#' or Surrogate Variable) and every phenotype and control variable. P-values are
#' then collected and stored in a \code{tbl_df} object describing all the
#' comparisons.
#'
#' This function receives variable names as a parameter.
#'
#' @inheritParams compute_significance_data
#' @param var_names Variable names to be used for calculating the p_values.
#' @return A data.frame with the results of the association tests.
#' @importFrom dplyr %>% mutate_
#' @importFrom tidyr gather_
#' @importFrom stats lm na.omit pf model.matrix kruskal.test
#' @keywords internal
compute_significance_data_var_names = function(values, pdata, component_names,
var_names,
method = c('lm', 'kruskal')) {
method = match.arg(method)
if (is.null(names(var_names))) {
names(var_names) = var_names
}
if (method == 'kruskal') {
is_col_numeric = vapply(pdata[, var_names], is.numeric, FALSE)
if (any(is_col_numeric)) {
stop(paste('Kruskal-Wallis method cannot be used when there are numeric',
'phenotype variables.'))
}
get_kruskal_p_value = function(val, pd) {
kruskal.test(val, pd)[['p.value']]
}
pvalues = lapply(var_names,
function(xx) apply(values, 2,
get_kruskal_p_value,
as.factor(pdata[, xx])))
} else {
model_fits = lapply(var_names, function(xx) lm(values ~ pdata[, xx]))
model_summaries = lapply(model_fits, summary)
# Result being a nested list depends on the number of columns of values
if (ncol(values) == 1) {
pvalues = lapply(
model_summaries,
function(xx) get_p_value(xx[['fstatistic']])
)
} else {
pvalues = lapply(
model_summaries,
function(xx) sapply(xx, function(yy) get_p_value(yy[['fstatistic']]))
)
}
}
p_values_df = data.frame(pvalues, check.names = FALSE)
significance_data = p_values_df %>%
mutate_('PC' = ~ component_names) %>%
gather_('Variable', 'P_Value', var_names) %>%
mutate_('Sig' = ~ cut_pvalues(P_Value))
return(significance_data)
}
#' Compute associations with phenotypical variables.
#'
#' Compute associations with phenotypical and control variables in order to
#' find out which of them are significant, and thus possibly introducing
#' some confounding in the data.
#'
#' The current implementation allows the user to use two different methods
#' for testing association: a general linear model (lm) and a Kruskal-Wallis
#' test. User should be careful, as the Kruskal Wallis test should not be used
#' for continuous independent variables.
#'
#' An association test is performed between every component (Principal Component
#' or Surrogate Variable) and every phenotype and control variable. P-values are
#' then collected and stored in a \code{tbl_df} object describing all the
#' comparisons.
#'
#' @param values A matrix containing either principal components or surrogate
#' variables.
#' @param pdata A data.frame containing the phenotypical data for the samples.
#' @param component_names A character vector containing the component names.
#' @param method A character indicating the method used for the association
#' tests.
#' @return A data.frame with the results of the association tests.
#' @importFrom dplyr %>% mutate_
#' @importFrom tidyr gather_
#' @keywords internal
compute_significance_data = function (values, pdata, component_names,
method = c('lm', 'kruskal')) {
var_names = get_var_names(pdata)
return(compute_significance_data_var_names(values, pdata, component_names,
var_names, method))
}
#' Get control variables.
#'
#' Get a matrix containing the signals for the control probes of a given
#' RGChannelSet object.
#'
#' The function \code{get_control_variables} just takes a \code{RGChannelSet}
#' object as input, accesses the values of its control probes, and returns
#' them as a numeric matrix.
#'
#' This is an internal function. Its purpose is to provide the main analysis
#' functions with more data to test. It is usually useful to test if control
#' probes are introducing some confounding in the dataset, as it might be
#' signaling that there are unknown effects in the data.
#'
#' @param rgset An RGChannelSet object containing the control data.
#' @importFrom methods is
#' @importFrom minfi getControlAddress getRed getGreen
#' @return A matrix with the values of the control elements.
#' @keywords internal
get_control_variables = function(rgset) {
if (!is(rgset, 'RGChannelSet')) {
stop('Input must be of RGChannelSet type.')
}
bc1 = getControlAddress(rgset, controlType = c('BISULFITE CONVERSION I'))
bc2 = getControlAddress(rgset, controlType = c('BISULFITE CONVERSION II'))
ext = getControlAddress(rgset, controlType = c('EXTENSION'))
tr = getControlAddress(rgset, controlType = c('TARGET REMOVAL'))
hyb = getControlAddress(rgset, controlType = c('HYBRIDIZATION'))
control_names = c(
'BSC-I C1 Grn',
'BSC-I C2 Grn',
'BSC-I C3 Grn',
'BSC-I C4 Red',
'BSC-I C5 Red',
'BSC-I C6 Red',
'BSC-II C1 Red',
'BSC-II C2 Red',
'BSC-II C3 Red',
'BSC-II C4 Red',
'Target Removal 1 Grn',
'Target Removal 2 Grn',
'Hyb (Low) Grn',
'Hyb (Medium) Grn',
'Hyb (High) Grn',
'Extension (A) Red',
'Extension (T) Red',
'Extension (C) Grn',
'Extension (G) Grn'
)
control_signals = rbind(
getGreen(rgset)[bc1[1:3],],
getRed(rgset)[bc1[7:9],],
getRed(rgset)[bc2[1:4],],
getGreen(rgset)[tr[1:2],],
getGreen(rgset)[hyb[1:3],],
getRed(rgset)[ext[1:2],],
getGreen(rgset)[ext[3:4],]
)
dimnames(control_signals) = list(control_names, colnames(rgset))
datac2_m = t(log2(control_signals))
return(datac2_m)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.