##########################
# INPUT METHODS
# -----------------------
#
# Functions to read and process data for GEVA
#
# ########################
# Copyright (C) 2020 Nunes IJG et al
#' @include c_GEVAInput.R
#' @include statmath.R
#' @include callhelpers.R
#' @include stringhelpers.R
NULL
# Processes any compatible data as input to GEVA, including data.frames, lists and
#' @title GEVA Input Processing and Merge
#' @description Functions to read, load, and concatenate the experimental comparisons from the data input. This is the initial step to proceed with any GEVA analysis.
#'
#' @details The `geva.merge.input` function takes multiple tables as arguments (*e.g.*, `matrix` or `data.frame` objects), extracts the *logFC* columns from each table and merges them into a single [`GEVAInput-class`] dataset.
#'
#' The column names are specified in the `col.values` and `col.pvals` arguments (`character`) and must correctly match the column names for *logFC* and p-value columns, respectively, in the inputs to be extracted.
#' Multiple values for column names can also be specified as valid name possibilities if they differ among the tables.
#'
#' @note The inclusion of p-value columns is not technically required, but strongly recommended as they improve the statistical accuracy in the summarization steps. If the p-value (or adjusted p-value) columns are present, their values are converted to weights by applying `1 - pvalue` for each `pvalue` element, otherwise an optional `na.val` optional argument can specified as replacement to the absent values (default is `NA`). Weights are used to accomodate the central *logFC* values towards the most significant observations and penalize potential statistical innacuracies.
#'
#' @param ... multiple `matrix` or `data.frame` objects. At least two arguments are required for `geva.merge.input`, but it's optional for `geva.read.tables`. The optional arguments in `geva.read.tables` are also passed to its internal call to `geva.merge.input` and [`geva.input.filter`].
#' \cr In addition, the following optional arguments are accepted:
#' \itemize{
#' \item{`na.val` : (`numeric`) value between `0` and `1` used as replacement when a p-value column is not present (default is `NA`)}
#' \item{`use.regex` : (`logical`) whether to match the column names using [regular expressions][base::regex] (default is `FALSE`) }
#' \item{`verbose` : (`logical`) whether to print the current loading and merge progress (default is `TRUE`)}
#' }
#' @param col.values `character` vector, possible name(s) to match the *logFC* column(s) from each table
#' @param col.pvals `character` vector, possible name(s) to match the p-value column(s) from each table
#' @param col.other `character` vector, name(s) to match additional columns (*e.g.*, gene symbols). Ignored if `NULL`
#'
#' @return A [`GEVAInput-class`] object
#'
#' @examples
#' ### EXAMPLE 1
#' ## geva.merge.input example with three randomly generated tables
#' ## (For demonstration purposes only)
#'
#' # Number of rows
#' n <- 10000
#'
#' # Random row (probe) names
#' probnms <- sprintf("PROBE_%s", 1:n)
#'
#' # Random gene names (optional)
#' genenms <- paste0(sprintf("GENE_%s", 1:n), LETTERS[1:n %% (length(LETTERS)+1)])
#'
#' # Random table 1
#' dt1 <- data.frame(row.names=probnms,
#' logfc=(rnorm(n, 0, sd=2) * rnorm(n, 0, sd=0.5)),
#' pvalues = runif(n, max=0.08),
#' genesymbol = genenms)
#' # Random table 2
#' dt2 <- data.frame(row.names=probnms,
#' logfc=(rnorm(n, 0, sd=2) * rnorm(n, 0, sd=0.5)),
#' pvalues = runif(n, max=0.08),
#' genesymbol = genenms)
#' # Random table 3
#' dt3 <- data.frame(row.names=probnms,
#' logfc=(rnorm(n, 0, sd=2) * rnorm(n, 0, sd=0.5)),
#' pvalues = runif(n, max=0.08),
#' genesymbol = genenms)
#'
#' # Merges the three tables
#' ginput <- geva.merge.input(exp1=dt1, exp2=dt2, exp3=dt3,
#' col.values="logfc",
#' col.pvals="pvalues",
#' col.other="genesymbol")
#'
#' # Prints the first rows from the merged table
#' print(head(ginput)) # values
#' print(head(inputweights(ginput))) # weights
#'
#' # ---
#' \dontrun{
#'
#' ### EXAMPLE 2
#' ## geva.read.tables example with three tab-delimited files
#'
#' # Table file examples. Each one has 3 columns: "logfc", "pvalues", and "genesymbol"
#' # Replace it with your tab-delimited files (e.g. exported from limma's topTable)
#' fnames <- c("dt1.txt", "dt2.txt", "dt3.txt")
#'
#' ginput <- geva.read.tables(fnames,
#' col.values="logfc",
#' col.pvals="pvalues",
#' col.other="genesymbol")
#'
#' # Prints the first rows from the merged table
#' print(head(ginput)) # values
#' print(head(inputweights(ginput))) # weights
#'
#'
#' # ---
#'
#' ### EXAMPLE 3
#' ## geva.read.tables example with tab-delimited files in a directory
#'
#' # Directory name (replace it with a directory containing the table files)
#' dirnm <- "C:/User/robertplant123/Documents/R/gevaexamples"
#'
#' # In this example, table files contain 3 columns: "logfc", "pvalues", and "genesymbol"
#' # Reads all txt files in the directory
#' ginput <- geva.read.tables(dirname=dirnm,
#' col.values="logfc",
#' col.pvals="pvalues",
#' col.other="genesymbol")
#'
#' # (Optional step)
#' # Let's assume that all table file names start with "dt" and ends with the ".txt" extension,
#' # such as dt1.txt, dt2.txt and so on...
#' fname_pattern <- c("^dt.+?\\.txt$") # Defines a RegEx pattern to find the files
#' # Loads only files that match the file name pattern
#' ginput <- geva.read.tables(dirname=dirnm,
#' files.pattern=fname_pattern,
#' col.values="logfc",
#' col.pvals="pvalues",
#' col.other="genesymbol")
#'
#' # Prints the first rows from the merged table
#' print(head(ginput)) # values
#' print(head(inputweights(ginput))) # weights
#' }
#'
#' @name geva.merge.input
#' @rdname geva.merge.input
#' @export
geva.merge.input <- function(..., col.values="logFC", col.pvals="adj.P.Val", col.other=NULL)
{
if (...length() == 0L) return(NULL)
argls = list(...)
names(argls) = call.dots.namesorargs(...)
argls = list.flatten(argls, ignore.class = c('geva.promise.read.path', 'data.frame'))
df = NULL
dw = NULL
da = NULL
verbose = ...arg(verbose, TRUE)
lsfacts = list()
na.val = ...arg(na.val, 0)
if (!is.na(na.val))
assert.operator(na.val, `>=` = 0, `<=` = 1)
use.regex = ...arg(use.regex, FALSE)
if (!identical(use.regex, TRUE))
use.regex = FALSE
nms = names(argls)
if (is.null(nms))
nms = paste0("arg", as.character(seq_along(argls)))
nms[nchar(nms) == 0L] = "arg"
nms = make.unique(nms, sep="_")
for (i in seq_along(argls))
{
arg = argls[[i]]
arg.nm = nms[i]
argcall = arg.nm
arg.values = NULL
arg.weights = NULL
arg.attrs = NULL
sel.attrs = FALSE
vfacts = character(0)
if (is(arg, 'geva.promise.read.path'))
{
fnm = arg$path
read.args = arg$read.args
vprint("Reading '%s' ...", basename(fnm))
basenm = sub('(.*)\\..*$', '\\1', basename(fnm))
rdargs = list.merge(read.args, list(file = fnm, row.names=1, check.names = FALSE))
arg = do.call(read.delim, rdargs)
argcall = basename(fnm)
}
if (is.matrix(arg))
arg = as.data.frame(arg)
else if (is.vector(arg) && length(arg) > 1L && (if (is.null(df)) length(arg) else nrow(df)) == length(arg))
{
if (is.numeric(arg))
{
arg = data.frame(v=arg, w=rep(0, length(arg)), row.names = names(arg))
colnames(arg) = c(c(col.values, 'v')[1], c(col.pvals, 'w')[1])
}
}
if (is.data.frame(arg))
{
sel.vals = if (use.regex) {rowAnys(vapply(col.values, grepl, logical(nrow(arg)), x=colnames(arg), perl=TRUE))} else {colnames(arg) %in% col.values}
if (!any(sel.vals))
{
warning(sprintf("No column %s '%s' was found in %s",
ifelse(use.regex, "matching", "named"),
strconjunct(col.values, strvec = "', '", conj="' or '"), argcall))
next
}
sel.ws = if (use.regex) {rowAnys(vapply(col.pvals, grepl, logical(nrow(arg)), x=colnames(arg), perl=TRUE))} else {colnames(arg) %in% col.pvals}
if (!any(sel.ws))
{
warning(sprintf("No column %s '%s' was found in %s. Using %s as replacement for weights",
ifelse(use.regex, "matching", "named"),
strconjunct(col.values, strvec = "', '", conj="' or '"), argcall, na.val))
}
else if (sum(sel.ws) != sum(sel.vals))
{
sel.vals = seq(1L, length.out = length(sel.vals)) %in% which(sel.vals)[1]
sel.ws = seq(1L, length.out = length(sel.ws)) %in% which(sel.ws)[1]
}
sel.attrs = if (use.regex) {rowAnys(vapply(col.other, grepl, logical(nrow(arg)), x=colnames(arg), perl=TRUE))} else {colnames(arg) %in% col.other}
if (!is.null(da))
sel.attrs[colnames(arg) %in% colnames(da)] = FALSE
arg.values = arg[, sel.vals, drop=FALSE]
if (!(is.numeric(arg.values) || all(vapply(arg.values, is.numeric, FALSE))))
{
warning(sprintf("Input '%s' is not numeric. Argument ignored", argcall))
next
}
if (ncol(arg.values) == 1L)
colnames(arg.values) = arg.nm
if (any(sel.ws))
arg.weights = 1 - arg[, sel.ws, drop=FALSE]
if (any(sel.attrs))
arg.attrs = arg[, sel.attrs, drop=FALSE]
}
else if (hasMethod('inputdata', class(arg)))
{
arg = inputdata(arg)
if (inherits(arg, 'GEVAInput'))
{
arg.values = inputvalues(arg)
arg.weights = inputweights(arg)
vfacts = factors(arg)
}
}
if (is.null(arg.values))
next
if (is.null(arg.weights))
arg.weights = (arg.values * 0) + na.val
if (is.null(df))
{
df = data.frame(row.names = rownames(arg.values))
dw = data.frame(row.names = rownames(arg.values))
da = data.frame(row.names = rownames(arg.values))
}
else
{
ind.matchrows = match(rownames(df), rownames(arg.values))
if (anyNA(ind.matchrows))
{
sel.valids = !is.na(ind.matchrows)
if (any(sel.valids))
{
df = df[sel.valids,,drop=FALSE]
dw = dw[sel.valids,,drop=FALSE]
da = da[sel.valids,,drop=FALSE]
ind.matchrows = ind.matchrows[sel.valids]
vprint("Removed %s rows upon merging with '%s'", sum(!sel.valids), argcall)
}
else
{
if (nrow(df) == nrow(arg.values))
ind.matchrows = 1L:nrow(arg.values)
else
{
warning(sprintf("'%s' ignored: no matching rows or dimensions (%s != %s)", argcall, nrow(arg.values), nrow(df)))
next
}
}
}
arg.values = arg.values[ind.matchrows,,drop=FALSE]
arg.weights = arg.weights[ind.matchrows,,drop=FALSE]
if (!is.null(arg.attrs))
arg.attrs = arg.attrs[ind.matchrows,,drop=FALSE]
}
if (length(vfacts) != ncol(arg.values))
vfacts = rep(NA_character_, ncol(arg.values))
lsfacts[[length(lsfacts)+1L]] = as.factor(vfacts)
colnms = setdiff(make.unique(c(colnames(df), colnames(arg.values)), sep = '_'), colnames(df))
df[, colnms] = arg.values
dw[, colnms] = arg.weights
if (!is.null(arg.attrs) && any(sel.attrs))
da[,colnames(arg.attrs)] = arg.attrs
}
facts = as.factor(unlist(lsfacts))
info = list(values.column = col.values, pvalues.column = col.pvals)
ginput = new('GEVAInput', values=as.matrix(df), weights=as.matrix(dw), ftable = da, info = info, factors=facts)
ginput
}
# Processes data for GEVA using table files as input
#'
#'
#' @details The function `geva.merge.input` reads multiple tab-delimited text files containing, extracts the *logFC* columns from each table and merges into a single [`GEVAInput-class`] dataset.
#'
#' @param filenames `character` vector with two or more file paths
#' @param dirname single `character`, base directory containing the input files. Ignored if `filenames` is specified
#' @param files.pattern single `character`, pattern used to filter the files inside `dirname`. Ignored if `filenames` is specified
#' @param p.value.cutoff `numeric` (`0` to `1`), initial p-value threshold. Rows entirely composed by p-values above this cutoff (*i.e.*, no significant *logFC*) are removed after the final merge. Ignored if `NA` or `NULL`
#' @param read.args `list` of additional arguments passed to [utils::read.table]
#'
#' @aliases geva.read.tables
#' @rdname geva.merge.input
#' @export
geva.read.tables <- function(filenames=NULL, dirname=".", col.values="logFC", col.pvals="adj.P.Val", col.other=NULL, ..., files.pattern = "\\.txt$", p.value.cutoff=0.05, read.args = list())
{
if (is.null(p.value.cutoff) || is.na(p.value.cutoff[1]))
p.value.cutoff = 1
assert.operator(p.value.cutoff, `>=` = 0, `<=` = 1)
if (length(filenames) == 0L || dir.exists(filenames[1]) ) filenames = list.files(dirname, full.names = TRUE, all.files = FALSE, recursive = FALSE, include.dirs = FALSE, pattern = files.pattern)
assert.notempty(filenames)
verbose = ...arg(verbose, TRUE)
fcount = length(filenames)
if (fcount <= 1L) stop("GEVA requires at least two tables of comparison results")
lsinput = lapply(setNames(filenames, basename_noext(filenames)), function(fnm)
{
fls = list(path=fnm, read.args=read.args)
class(fls) = 'geva.promise.read.path'
fls
})
lsinput$col.values = col.values
lsinput$col.pvals = col.pvals
lsinput$col.other = col.other
ginput = do.call(geva.merge.input, lsinput)
vprint("Read %d columns with %d probes", ncol(ginput), nrow(ginput))
if (p.value.cutoff < 1)
ginput = geva.input.filter(ginput, p.value.cutoff=p.value.cutoff, ...)
ginput
}
# Corrects the input data, removing rows containing invalid values (NA, Inf)
#' @title GEVA Input Post-processing
#'
#' @description Helper functions used to edit the contents from a [`GEVAInput-class`].
#'
#' @param ginput A [`GEVAInput-class`] object
#' @param na.rm `logical`; if `TRUE`, removes all rows containing `NA`
#' @param inf.rm `logical`; if `TRUE`, removes all rows containing infinite values (`Inf` or `-Inf`)
#' @param invalid.col.rm `logical`; if `TRUE`, searches for any column that is entirely composed by invalid values (according to the other arguments) and removes it before checking the rows
#' @param ... optional arguments. Accepts `verbose` (`logical`, default is `TRUE`) to enable or disable printing the progress
#'
#' @return A modified [`GEVAInput-class`] object
#'
#' @details `geva.input.correct` corrects the numeric input data (values and weights), removing rows that include invalid values such as NA or infinite.
#'
#' @examples
#' ## geva.input.correct example
#' colexample1 <- runif(1000, -1, 1) # Random column 1
#' colexample2 <- runif(1000, -1, 1) # Random column 2
#' colexample3 <- runif(1000, -1, 1) # Random column 3
#' colexample3[runif(1000, -1, 1) < 0] = NA # Random NA's
#' ginput = geva.merge.input(col1=colexample1,
#' col2=colexample2,
#' col3=colexample3)
#' # Before the correction:
#' print(nrow(ginput)) # Returns 1000
#' # Applies the correction (removes rows with NA's)
#' ginput <- geva.input.correct(ginput)
#' # After the correction:
#' print(nrow(ginput)) # Returns less than 1000
#'
#' @rdname geva.input.correct
#' @export
geva.input.correct <- function(ginput, na.rm=TRUE, inf.rm=TRUE, invalid.col.rm=TRUE)
{
mv = inputvalues(ginput)
mw = inputweights(ginput)
sel.valid.cols = rep(TRUE, ncol(mv))
if (invalid.col.rm)
{
if (na.rm)
{
sel.valid.cols = sel.valid.cols & !(all.apply(mv, is.na, 2)) & !(all.apply(mw, is.na, 2))
}
if (inf.rm)
{
sel.valid.cols = sel.valid.cols & !(all.apply(mv, is.infinite, 2)) & !(all.apply(mw, is.infinite, 2))
}
if (all(!sel.valid.cols)) stop("All columns are invalid. Perhaps the source data was corrupted?")
mv = mv[,sel.valid.cols,drop=FALSE]
mw = mw[,sel.valid.cols,drop=FALSE]
}
sel.valid.rows = rep(TRUE, nrow(mv))
if (na.rm)
{
sel.valid.rows = sel.valid.rows & !(apply(mv, 1, anyNA)) & !(apply(mw, 1, anyNA))
}
if (inf.rm)
{
sel.valid.rows = sel.valid.rows & !(any.apply(mv, is.infinite)) & !(any.apply(mw, is.infinite))
}
ginput = ginput[sel.valid.rows,sel.valid.cols,drop=FALSE]
ginput
}
# Filters the input data, removing rows containing p.values (1 - weights) below a threshold
#'
#' @param p.value.cutoff `numeric` (`0` to `1`), the p-value cutoff. Rows containing values above this threshold are removed
#' @param by.any `logical`, set to `TRUE` to delete the rows with at least one occurrence above the cutoff; or `FALSE` to delete only those rows in which all values are above the specified threshold
#' @param na.val `numeric`, the replacement for `NA` values
#'
#' @details `geva.input.filter` attempts to select the most relevant part of the input data, removing rows containing p.values (1 - weights) above a specific threshold.
#'
#' @examples
#' ## ---
#' ## geva.input.filter example
#' ginput <- geva.ideal.example(1000) # Generates a random input
#' # Before the filter:
#' print(nrow(ginput)) # Returns 1000
#' # Applies the filter
#' ginput <- geva.input.filter(ginput)
#' # After the filter:
#' print(nrow(ginput)) # Returns less than 1000
#'
#' @aliases geva.input.filter
#' @rdname geva.input.correct
#' @export
geva.input.filter <- function(ginput, p.value.cutoff=0.05, by.any=FALSE, na.val=0, ...)
{
# Removes the rows where all p-values are above the cutoff
dw = inputweights(ginput)
verbose = ...arg(verbose, TRUE)
sel.remove = 1L:nrow(ginput) %in% which.rows.outside.cutoff(1 - dw,
cutoff=1-p.value.cutoff,
na.val=na.val,
by.any=identical(by.any, TRUE))
remcount = sum(sel.remove)
if (remcount != 0L)
{
ginput = ginput[!sel.remove,,drop=FALSE]
vprint("Removed %s rows where all p-values were above the cutoff (%s)", remcount, as.character(p.value.cutoff))
}
ginput
}
# Replaces the row.names from a GEVAInput based on a attribute column or a character vector
#' @param attr.column `character`, target column with the values that will replace the current row names
#' @param dupl.rm.method `character`, method to remove duplicate names. The possible options are:
#' \itemize{
#' \item{`"least.p.vals"` : Keeps the duplicate that contains the least sum of p-values}
#' \item{`"order"` : Keeps the first occurrence of the duplicate in the current row order}
#' }
#'
#' @details `geva.input.rename.rows` replaces the row names with a column from the feature table (see [`GEVAInput-class`]). The column name specified for `attr.column` must be included in the `names(featureTable(ginput))`. Any duplicates are removed according to the `dupl.rm.method`, and the selected duplicates are stored as a new column named `"renamed_id"` inside the feature table from the returned object.
#'
#' @examples
#' ## ---
#' ## geva.input.rename.rows example
#' ginput <- geva.ideal.example() # Generates a random input
#' # Renames to 'Symbol'
#' ginput <- geva.input.rename.rows(ginput,
#' attr.column = "Symbol")
#' print(head(ginput)) # The row names are set now as the gene symbols
#'
#' @aliases geva.input.rename.rows
#' @rdname geva.input.correct
#' @export
geva.input.rename.rows <- function(ginput, attr.column, dupl.rm.method = c("least.p.vals", "order") )
{
dupl.rm.method = match.arg(dupl.rm.method)
errmsg = sprintf("%s must be a vector (length %d) or a valid attribute column name", "%s", nrow(ginput))
assert.notempty(attr.column, .posmsg = errmsg)
if (is.character(attr.column) && length(attr.column) == 1L)
{
if (attr.column %in% colnames(featureTable(ginput)))
{
attr.column = featureTable(ginput)[,attr.column]
}
else stop(sprintf("could not find an attribute column named '%s'", attr.column))
}
if (!is.character(attr.column))
{
if (is.vector(attr.column) || is.factor(attr.column))
attr.column = as.character(attr.column)
else stop(sprintf(errmsg, as.character(quote(attr.column))))
}
assert.dim(attr.column, length=nrow(ginput))
attr.column[is.na(attr.column)] = ''
sel.empty.vals = nchar(attr.column) == 0L
if (all(sel.empty.vals)) stop("The selected column must contain at least one non-empty character")
inds.match = match(attr.column, rownames(ginput))
if (!anyNA(inds.match)) return(ginput[inds.match,,drop=FALSE])
sel.dupls = duplicated(attr.column)
if (any(sel.dupls) && dupl.rm.method == "least.p.vals")
{
# Removes the duplicated rows where the sum of p.values is
sel.dupls = sel.dupls | duplicated(attr.column, fromLast = TRUE)
vdinds = which(sel.dupls)
fdgroups = as.factor(attr.column[vdinds])
mw = inputweights(ginput)
vsumpvals = rowSums(t(apply(mw[vdinds,,drop=FALSE], 1, restore.weights.pvals)))
sel.dupls[vdinds] = gtapply(vsumpvals, fdgroups, function(x) seq_along(x) != which.min(x))
}
sel.valid = !sel.dupls & (nchar(attr.column) != 0L)
ginput = ginput[sel.valid,,drop=FALSE]
featureTable(ginput)[,'renamed_id'] = rownames(ginput)
rownames(ginput) = attr.column[sel.valid]
catline("Row names renamed")
if (any(!sel.valid))
catline("Removed %d duplicated lines", sum(!sel.valid))
ginput
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.