#' Test pathways with Supervised PCA
#'
#' @description Given a supervised \code{OmicsPath} object (one of
#' \code{OmicsSurv}, \code{OmicsReg}, or \code{OmicsCateg}), extract the
#' first \eqn{k} principal components (PCs) from each pathway-subset of the
#' -Omics assay design matrix, test their association with the response
#' matrix, and return a data frame of the adjusted \eqn{p}-values for each
#' pathway.
#'
#' @param object An object of superclass \code{OmicsPathway} with a response
#' matrix or vector.
#' @param n.threshold The number of bins into which to split the feature scores
#' in the fit object returned internally by the \code{\link{superpc.train}}
#' function to the \code{\link{pathway_tScores}} and
#' \code{\link{pathway_tControl}} functions. Defaults to 20. Smaller values
#' may result in less accurate pathway \eqn{p}-values while larger values
#' increase computation time.
#' @param numPCs The number of PCs to extract from each pathway. Defaults to 1.
#' @param parallel Should the computation be completed in parallel? Defaults to
#' \code{FALSE}.
#' @param numCores If \code{parallel = TRUE}, how many cores should be used for
#' computation? Internally defaults to the number of available cores minus 1.
#' @param adjustpValues Should you adjust the \eqn{p}-values for multiple
#' comparisons? Defaults to TRUE.
#' @param adjustment Character vector of procedures. The returned data frame
#' will be sorted in ascending order by the first procedure in this vector,
#' with ties broken by the unadjusted \eqn{p}-value. If only one procedure is
#' selected, then it is necessarily the first procedure. See the documentation
#' for the \code{\link{ControlFDR}} function for the adjustment procedure
#' definitions and citations.
#' @param ... Dots for additional internal arguments.
#'
#' @return A data frame with columns:
#' \itemize{
#' \item{\code{pathways} : }{The names of the pathways in the \code{Omics*}}
#' object (given in \code{object@@trimPathwayCollection$pathways}.)
#' \item{\code{setsize} : }{The number of genes in each of the original
#' pathways (given in the \code{object@@trimPathwayCollection$setsize}
#' object).}
#' \item{\code{terms} : }{The pathway description, as given in the
#' \code{object@@trimPathwayCollection$TERMS} object.}
#' \item{\code{rawp} : }{The unadjusted \eqn{p}-values of each pathway.}
#' \item{\code{...} : }{Additional columns as specified through the
#' \code{adjustment} argument.}
#' }
#'
#' The data frame will be sorted in ascending order by the method specified
#' first in the \code{adjustment} argument. If \code{adjustpValues = FALSE},
#' then the data frame will be sorted by the raw \eqn{p}-values. If you have
#' the suggested \code{tidyverse} package suite loaded, then this data frame
#' will print as a \code{\link[tibble]{tibble}}. Otherwise, it will print as
#' a data frame.
#'
#' @details This is a wrapper function for the \code{\link{pathway_tScores}},
#' \code{\link{pathway_tControl}}, \code{\link{OptimGumbelMixParams}},
#' \code{\link{GumbelMixpValues}}, and \code{\link{TabulatepValues}}
#' functions.
#'
#' Please see our Quickstart Guide for this package:
#' \url{https://gabrielodom.github.io/pathwayPCA/articles/Supplement1-Quickstart_Guide.html}
#'
#' @seealso \code{\link{CreateOmics}}; \code{\link{TabulatepValues}};
#' \code{\link{pathway_tScores}}; \code{\link{pathway_tControl}};
#' \code{\link{OptimGumbelMixParams}}; \code{\link{GumbelMixpValues}};
#' \code{\link[parallel]{clusterApply}}
#'
#' @export
#'
#' @include createClass_validOmics.R
#' @include createClass_OmicsPath.R
#' @include createClass_OmicsSurv.R
#' @include createClass_OmicsReg.R
#' @include createClass_OmicsCateg.R
#'
#' @importFrom methods setGeneric
#' @importFrom methods is
#'
#' @examples
#' ### Load the Example Data ###
#' data("colonSurv_df")
#' data("colon_pathwayCollection")
#'
#' ### Create an OmicsSurv Object ###
#' colon_OmicsSurv <- CreateOmics(
#' assayData_df = colonSurv_df[, -(2:3)],
#' pathwayCollection_ls = colon_pathwayCollection,
#' response = colonSurv_df[, 1:3],
#' respType = "surv"
#' )
#'
#' ### Calculate Pathway p-Values ###
#' colonSurv_superpc <- SuperPCA_pVals(
#' object = colon_OmicsSurv,
#' parallel = TRUE,
#' numCores = 2,
#' adjustpValues = TRUE,
#' adjustment = c("Hoch", "SidakSD")
#' )
#'
#' @rdname SuperPCA_pVals
setGeneric("SuperPCA_pVals",
function(object,
n.threshold = 20,
numPCs = 1,
parallel = FALSE,
numCores = NULL,
adjustpValues = TRUE,
adjustment = c("Bonferroni",
"Holm",
"Hochberg",
"SidakSS",
"SidakSD",
"BH",
"BY",
"ABH",
"TSBH"),
...){
standardGeneric("SuperPCA_pVals")
}
)
#' @importFrom parallel clusterEvalQ
#' @importFrom parallel clusterExport
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel parLapply
#' @importFrom parallel stopCluster
#'
#' @rdname SuperPCA_pVals
setMethod(f = "SuperPCA_pVals", signature = "OmicsPathway",
definition = function(object,
n.threshold = 20,
numPCs = 1,
parallel = FALSE,
numCores = NULL,
adjustpValues = TRUE,
adjustment = c("Bonferroni",
"Holm",
"Hochberg",
"SidakSS",
"SidakSD",
"BH",
"BY",
"ABH",
"TSBH"),
...){
# browser()
### Extract Information from S4 Object ###
geneArray_df <- t(object@assayData_df)
obj_class <- class(object)
switch (obj_class,
OmicsSurv = {
if(anyNA(object@eventTime) ||
anyNA(object@eventObserved)){
stop("\n Missing values are not permitted in the response for Supervised PCA.")
}
eventTime <- matrix(object@eventTime, ncol = 1)
eventObserved <- matrix(object@eventObserved, ncol = 1)
response_mat <- cbind(eventTime, eventObserved)
responseType <- "survival"
},
OmicsReg = {
if(anyNA(object@response)){
stop("\n Missing values are not permitted in the response for Supervised PCA.")
}
response_mat <- matrix(object@response, ncol = 1)
responseType <- "regression"
},
OmicsCateg = {
if(anyNA(object@response)){
stop("\n Missing values are not permitted in the response for Supervised PCA.")
}
# Matrices in R cannot be factors, so I'm going to try something
# very hacky: a matrix in R is any atomic vector with a dim
# attribute. This even works for ordered factors.
response_mat <- object@response
dim(response_mat) <- c(length(response_mat), 1)
responseType <- "categorical"
}
)
# responseType <- match.arg(responseType)
if(adjustpValues){
adjustment <- match.arg(adjustment, several.ok = TRUE)
}
pathwayGeneSets_ls <- object@trimPathwayCollection
paths_ls <- pathwayGeneSets_ls$pathways
minFeats <- attr(paths_ls, "minFeatures")
if(parallel){
### Parallel Computing Setup ###
message("Initializing Computing Cluster: ", appendLF = FALSE)
numCores <- ifelse(is.null(numCores), detectCores() - 1, numCores)
clust <- makeCluster(numCores)
clustVars_vec <- c(deparse(quote(paths_ls)),
deparse(quote(geneArray_df)),
deparse(quote(response_mat)))
clusterExport(cl = clust,
varlist = clustVars_vec,
envir = environment())
invisible(clusterEvalQ(cl = clust, library(pathwayPCA)))
message("DONE")
# browser()
### Matrix of Student's t Scores and Controls ###
message("Calculating Pathway Test Statistics in Parallel: ",
appendLF = FALSE)
tScores_ls <- parLapply(
cl = clust,
paths_ls,
pathway_tScores,
geneArray_df = geneArray_df,
response_mat = response_mat,
responseType = responseType,
n.threshold = n.threshold,
numPCs = numPCs,
min.features = minFeats
)
message("DONE")
message("Calculating Pathway Critical Values in Parallel: ",
appendLF = FALSE)
tControl_ls <- parLapply(
cl = clust,
paths_ls,
pathway_tControl,
geneArray_df = geneArray_df,
response_mat = response_mat,
responseType = responseType,
n.threshold = n.threshold,
numPCs = numPCs,
min.features = minFeats
)
message("DONE")
stopCluster(clust)
} else {
### Matrix of Student's t Scores and Controls ###
message("Calculating Pathway Test Statistics Serially: ",
appendLF = FALSE)
# browser()
tScores_ls <- lapply(
paths_ls,
pathway_tScores,
geneArray_df = geneArray_df,
response_mat = response_mat,
responseType = responseType,
n.threshold = n.threshold,
numPCs = numPCs,
min.features = minFeats
)
message("DONE")
message("Calculating Pathway Critical Values Serially: ",
appendLF = FALSE)
tControl_ls <- lapply(
paths_ls,
pathway_tControl,
geneArray_df = geneArray_df,
response_mat = response_mat,
responseType = responseType,
n.threshold = n.threshold,
numPCs = numPCs,
min.features = minFeats
)
message("DONE")
}
# browser()
### Maximum t-Score for each Gene Pathway ###
absMax <- function(vec){
vec[which.max(abs(vec))]
}
# Lily told me that we only care about the t-scores from the first
# PC, so we will only extract the absolute maxima from the first
# row of the t-value matrices
tScoreMax_num <- vapply(
tScores_ls,
function(ls) absMax(ls$tscor[1, ]) ,
FUN.VALUE = numeric(1)
)
tControlMax_num <- vapply(
tControl_ls,
function(x) absMax(x[1, ]) ,
FUN.VALUE = numeric(1)
)
### Calculate Raw Pathway p-Values ###
message("Calculating Pathway p-Values: ", appendLF = FALSE)
genesPerPathway_vec <- unlist(pathwayGeneSets_ls$setsize)
genesPerPathway_vec <- genesPerPathway_vec[names(paths_ls)]
# Attempt to Fit the Distribution
optParams_vec <- try(
OptimGumbelMixParams(
max_tControl_vec = tControlMax_num,
pathwaySize_vec = genesPerPathway_vec,
...
),
silent = TRUE
)
if(is(optParams_vec, "try-error")){
warning("Gumbel Mixture Distribution parameter estimation failed. Re-call SuperPCA_pVals()
as soon as possible.", immediate. = TRUE)
pvalues_vec <- rep(1, length(tScoreMax_num))
adjustpValues <- FALSE
} else {
pvalues_vec <- GumbelMixpValues(
tScore_vec = tScoreMax_num,
pathwaySize_vec = genesPerPathway_vec,
optimParams_vec = optParams_vec
)
}
message("DONE")
if(adjustpValues){
message("Adjusting p-Values and Sorting Pathway p-Value Data Frame: ",
appendLF = FALSE)
} else {
message("Sorting Pathway p-Value Data Frame: ", appendLF = FALSE)
}
out_df <- TabulatepValues(
pVals_vec = pvalues_vec,
genesets_ls = pathwayGeneSets_ls,
adjust = adjustpValues,
proc_vec = adjustment,
...
)
message("DONE")
### Wrangle PCA Output ###
sortedScores_ls <- tScores_ls[out_df$pathways]
PCs_ls <- lapply(sortedScores_ls, `[[`, "PCs_mat")
PCs_ls <- lapply(PCs_ls, as.data.frame)
attr(PCs_ls, "sampleIDs") <- object@sampleIDs_char
loadings_ls <- lapply(sortedScores_ls, `[[`, "loadings")
loadings_ls <- lapply(loadings_ls, t)
### Return ###
out_ls <- list(
pVals_df = out_df,
PCs_ls = PCs_ls,
loadings_ls = loadings_ls
)
class(out_ls) <- c("superpcOut", "pathwayPCA", "list")
out_ls
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.