#' Add a dataset to a SPEAR object.
#' @param mae MultiAssayExperiment with multi-omics data. Columns of ExperimentList must match (correspond to sample identifiers) and link to rows of colData (with response/responses).
#' @param response Column name(s) of colData from MultiAssayExperiment that will be the response(s). More than one response should be inputted as a vector.
#' @param name Name for the stored dataset in the SPEAR object. Access via `SPEAR$get.data(__name__)`
#' @examples
#' SPEAR <- SPEAR$new(...)
#'
#' SPEAR$add.data(MultiAssayExperiment(...), response = "death", name = "dataset1")
#'
#'@export
add.data = function(data, response = NULL, name = NULL){
# Set name:
if(is.null(name)){
d.tmp = 1
name = paste0("dataset", d.tmp)
while(name %in% names(self$data)){
d.tmp = d.tmp + 1
name = paste0("dataset", d.tmp)
}
} else if(name %in% names(self$data)){
stop("ERROR: name provided already exists in names(SPEAR$data). Please remove this dataset manually via SPEAR$remove.data(...) and then try updating again.\n")
}
if(name != "train"){
# X...
# Check that length is the same
if(length(data) != length(self$data$train)){
stop("ERROR: Number of datasets provided (", length(data), ") is not equal to the number of datasets used to train the SPEAR object (", length(self$data$train), "). SPEAR requires new samples to have the same assays used for training.")
}
# Check that number of analytes are the same
if(any(sapply(1:length(data), function(d){return(nrow(data[[d]]) != nrow(self$data$train[[d]]))}))){
stop("ERROR: One or more datasets provided has a different number of analytes than those in SPEAR$data$train. SPEAR requires all new samples have the same analytes used for training.")
}
}
# Check that columns all match columns
if(name != "train"){
if(any(sapply(1:length(data), function(d){return(any(rownames(data[[d]]) != rownames(SPEAR$data$train[[d]])))}))){
stop("ERROR: One or more analyte (row) names in mae experiments provided were not found in the data used to train this SPEAR object (SPEAR$data$train). All analytes must match to add a dataset.")
}
}
# response...
if(!is.null(response)){
c.data <- MultiAssayExperiment::colData(data)
if(!all(response %in% colnames(c.data))){
stop("ERROR: `response` provided was not found in the MultiAssayExperiment::colData(data). Needs to be a matching column(s) from the data.frame...")
}
} else if(name == "train"){
stop("ERROR: response is a required parameter for the train (initial) dataset. Please choose a response from colnames(MultiAssayExperiment::colData(data))")
}
# name and store:
self$data[[name]] <- data
# FUTURE: use ids in case of names not matching...
# if train: generate SPEARids
#if(name == "train"){
# private$generate.spear.ids()
#}
if(!self$options$quiet & name != "train"){cat("Saved dataset to $data$", name, "\n\n", sep = "")}
return(invisible(self))
}
#' Remove a dataset from a SPEAR object.
#' @param name Name for the stored MultiAssayExperiment dataset in the SPEAR object.
#' @examples
#' SPEAR <- SPEAR$new(...)
#'
#' # add a dataset under SPEAR$data$dataset1:
#' SPEAR$add.data(MultiAssayExperiment(...), response = "death", name = "dataset1")
#'
#' # remove the dataset stored under SPEAR$data$dataset1:
#' SPEAR$remove.data("dataset1")
#'
#'@export
remove.data = function(name = NULL){
if(is.null(name)){
stop("ERROR: no name provided...")
} else if(!name %in% names(self$data)){
stop("ERROR: name `", name, "` not found amongst possible data names. Use `names(SPEAR$data)` to see datasets.")
}
# Remove dataset:
self$data[[name]] <- NULL
if(!self$options$quiet & name != "train"){cat("Removed dataset from $data$", name, "\n\n", sep = "")}
return(invisible(self))
}
#' Extract a dataset from a SPEAR object.
#' @param name Name for the stored MultiAssayExperiment dataset in the SPEAR object.
#' @examples
#' SPEAR <- SPEAR$new(...)
#'
#' # add a dataset under SPEAR$data$dataset1:
#' SPEAR$add.data(MultiAssayExperiment(...), response = "death", name = "dataset1")
#'
#' # extract the dataset stored under SPEAR$data$dataset1:
#' data <- SPEAR$get.data("dataset1")
#'
#'@export
get.data = function(name = NULL){
if(is.null(name)){
stop("ERROR: no name provided...")
} else if(!name %in% names(self$data)){
stop("ERROR: name `", name, "` not found amongst possible data names. Use `names(SPEAR$data)` to see datasets.")
}
return(self$data[[name]])
}
#' Set the colors for a SPEAR object, to be used in plotting functions
#' @param colors.x Which colors to use for assays (X)? Must be a vector of length SPEAR$data$train If not named, will assume order matches.
#' @param colors.y Which colors to use for the response(s) (Y)? Must be a vector of length ncol(SPEARobj$data$train$Y). If not named, will assume order matches.
#' @examples
#' SPEARobj <- make.SPEARobject(...)
#'
#' # Option 1: set to defaults:
#' SPEARobj$set.color.scheme()
#'
#' # Option 2: provide colors:
#' colors.x <- c("red", "blue", "green", ...) # one for each dataset in SPEARobj$data$train$Xlist
#' colors.y <- c("yellow", ...) # one for each column in SPEARobj$data$train$Y
#' SPEARobj$set.color.scheme(colors.x = colors.x, colors.y = colors.y)
#'
#' # Option 3: provide colors as named list:
#' colors.x <- c("red", "blue", "green", ...) # one for each dataset in SPEARobj$data$train$Xlist
#' names(colors.x) <- names(SPEARobj$data$train$Xlist)
#' colors.y <- c("yellow", ...) # one for each column in SPEARobj$data$train$Y
#' names(colors.y) <- colnames(SPEARobj$data$train$Y)
#' SPEARobj$set.color.scheme(colors.x = colors.x, colors.y = colors.y)
#'
set.color.scheme = function(colors.x = NULL, colors.y = NULL){
colorlist <- list()
# X:
num.assays <- length(self$data$train)
colorvec.assays <- RColorBrewer::brewer.pal(n = 9, name = "Set1")[1:num.assays]
names(colorvec.assays) <- names(self$data$train)
# Y:
if(self$params$family == "gaussian"){
num.responses <- length(self$params$response)
if(length(self$params$response) < 3){
colorvec.responses <- RColorBrewer::brewer.pal(n = 3, name = "Reds")[1:num.responses]
} else {
colorvec.responses <- RColorBrewer::brewer.pal(n = num.responses, name = "Reds")
}
names(colorvec.responses) <- self$params$response
} else {
Y <- private$get.Y()
num.responses <- length(self$params$response)
colorvec.responses <- list()
palettes <- c("Reds", "Blues", "Purples", "Greens") # Up to 4 currently...
for(r in 1:length(self$params$response)){
num.responses.tmp <- length(levels(MultiAssayExperiment::colData(self$data$train)[[self$params$response[r]]]))
if(num.responses.tmp < 3){
colorvec.responses[[self$params$response[r]]] <- RColorBrewer::brewer.pal(n = 3, name = palettes[r])[c(1,num.responses.tmp)]
} else {
colorvec.responses[[self$params$response[r]]] <- RColorBrewer::brewer.pal(n = num.responses.tmp, name = palettes[r])
}
names(colorvec.responses[[self$params$response[r]]]) <- levels(MultiAssayExperiment::colData(self$data$train)[[self$params$response[r]]])
}
}
# Save:
colorlist$X <- colorvec.assays
colorlist$Y <- colorvec.responses
self$options$color.scheme <- colorlist
return(invisible(self))
}
#' Print out a variety of summary information about a SPEARobject
#' @param type Which type of summary to print? Can be "data". Defaults to NULL.
#' @param remove.formatting Remove text color/bold font face. Defaults to FALSE.
#' @param quiet Do not print anything. Defaults to FALSE.
#'@export
print.out = function(type = NULL, remove.formatting = FALSE, quiet = FALSE){
if(quiet){
return(NULL)
}
if(is.null(type)){
cat("")
}
# ---------------------------------
if(type == "data"){
cat(paste0("Detected ", length(self$data$train), " datasets:\n"))
for(i in 1:length(self$data$train)){
cat(names(self$data$train)[i], "\tSamples: ", ncol(self$data$train[[i]]), "\tAnalytes: ", nrow(self$data$train[[i]]), "\n")
}
cat(paste0("Detected ", length(self$params$response), " response ", ifelse(length(self$params$response) == 1, "variable", "variables"), ":\n"))
for(i in 1:length(self$params$response)){
cat(self$params$response[i], "\tSamples: ", nrow(MultiAssayExperiment::colData(self$data$train)), "\tType: ", self$params$family, "\n")
}
}
# ---------------------------------
if(type == "help"){
cat('For assistance, browse the SPEAR vignettes with any of these commands...\n> SPEAR$SPEAR.help()\n> SPEAR::SPEAR.help()\n> browseVignettes("SPEAR")\n\n')
}
}
#' One-hot encode a matrix or data.frame (Y)
#'
#'@export
one.hot.encode.multinomial <- function(){
# get levels for factor:
vals <- levels(MultiAssayExperiment::colData(self$data$train)[[self$params$response]])
Y.multi <- matrix(0, nrow = nrow(MultiAssayExperiment::colData(self$data$train)), ncol = length(vals))
for(i in 1:nrow(Y.multi)){
Y.multi[i, which(vals == MultiAssayExperiment::colData(self$data$train)[i,self$params$response])] <- 1
}
colnames(Y.multi) <- vals
rownames(Y.multi) <- rownames(MultiAssayExperiment::colData(self$data$train))
# return
return(Y.multi)
}
#' Encode a matrix or data.frame (Y) into ordinal (0, 1, 2, ...)
#'
#'@export
encode.ordinal <- function(){
# get levels for factor:
vals <- levels(MultiAssayExperiment::colData(self$data$train)[,self$params$response])
encoded <- 0:(length(vals)-1)
names(encoded) <- vals
Y.ord <- matrix(0, nrow = nrow(MultiAssayExperiment::colData(self$data$train)), ncol = 1)
for(i in 1:nrow(Y.ord)){
Y.ord[i,1] <- encoded[MultiAssayExperiment::colData(self$data$train)[i,self$params$response]]
}
colnames(Y.ord) <- self$params$response
rownames(Y.ord) <- rownames(MultiAssayExperiment::colData(self$data$train))
# return
return(Y.ord)
}
#' Update the dimension names for all SPEARobject matrices. Used internally.
#'@export
update.dimnames = function(){
Y <- private$get.Y()
dimnames(self$fit$regression.coefs) = list(colnames(self$data$train$X), paste0("Factor", 1:self$params$num.factors), paste0("w.idx.", 1:nrow(self$params$weights)))
dimnames(self$fit$projection.coefs.x) = list(colnames(self$data$train$X), paste0("Factor", 1:self$params$num.factors), paste0("w.idx.", 1:nrow(self$params$weights)))
dimnames(self$fit$projection.coefs.y) = list(paste0("Factor", 1:self$params$num.factors), colnames(Y), paste0("w.idx.", 1:nrow(self$params$weights)))
dimnames(self$fit$nonzero.probs) = list(colnames(self$data$train$X), paste0("Factor", 1:self$params$num.factors), paste0("w.idx.", 1:nrow(self$params$weights)))
dimnames(self$fit$projection.probs) = list(colnames(self$data$train$X), paste0("Factor", 1:self$params$num.factors), paste0("w.idx.", 1:nrow(self$params$weights)))
dimnames(self$fit$marginal.probs) = list(colnames(self$data$train$X), paste0("Factor", 1:self$params$num.factors), paste0("w.idx.", 1:nrow(self$params$weights)))
dimnames(self$fit$joint.probs) = list(colnames(self$data$train$X), paste0("Factor", 1:self$params$num.factors), paste0("w.idx.", 1:nrow(self$params$weights)))
if(self$fit$type == "cv"){
dimnames(self$fit$regression.coefs.cv) = list(colnames(self$data$train$X), paste0("Factor", 1:self$params$num.factors), paste0("Fold", 1:self$params$num.folds), paste0("w.idx.", 1:nrow(self$params$weights)))
dimnames(self$fit$projection.coefs.cv) = list(paste0("Factor", 1:self$params$num.factors), colnames(Y), paste0("Fold", 1:self$params$num.folds), paste0("w.idx.", 1:nrow(self$params$weights)))
}
}
#' Create the parameters for SPEAR. Private method.
#'@export
create.params = function(px, py, pz, n, num_factors, n_weights, npath){
params = list()
params$post_mu = array(0, dim = c(px,num_factors,n_weights))
params$post_sigma2 = array(0, dim = c(px,num_factors,n_weights))
params$post_pi = array(0, dim = c(px,num_factors,n_weights))
params$post_tmuX = array(0, dim = c(px,num_factors,n_weights))
params$post_tsigma2X = array(0, dim = c(px,num_factors,n_weights))
params$post_tpiX = array(0, dim = c(px,num_factors,n_weights))
params$post_tpiX_marginal = array(0, dim = c(px,num_factors,n_weights))
params$post_tmuY = array(0, dim = c(py,num_factors,n_weights))
params$post_tsigma2Y = array(0, dim = c(py,num_factors,n_weights))
params$post_tpiY = array(0, dim = c(py,num_factors,n_weights))
params$tauY = array(0, dim = c(py,num_factors,n_weights))
params$tauZ = array(0, dim = c(pz,num_factors,n_weights))
params$log_pi = array(0, dim = c(pz,num_factors,n_weights))
params$log_minus_pi= array(0, dim = c(pz,num_factors,n_weights))
params$nuYmat = array(0, dim = c(n, py,n_weights))
params$nuXmat = array(0, dim = c(n, px,n_weights))
params$meanFactors = array(0, dim = c(n,num_factors,n_weights))
params$post_a0 = array(0, dim = c(npath,num_factors,n_weights))
params$post_a1 = array(0, dim = c(npath,num_factors,n_weights))
params$post_b0 = array(0, dim = c(npath,num_factors,n_weights))
params$post_b1 = array(0, dim = c(npath,num_factors,n_weights))
params$post_a2x = array(0, dim = c(px,n_weights))
params$post_b2x = array(0, dim = c(px,n_weights))
params$post_a2y = array(0, dim = c(py,n_weights))
params$post_b2y = array(0, dim = c(py,n_weights))
params$ordering = array(0, dim = c(num_factors,n_weights))
return(params)
}
#' Check if a fit object is generated and if a weight has been set...
#'@export
check.fit = function(){
# Check if a fit object exists:
if(is.null(self$fit)){
stop("ERROR: $fit is NULL. Run $run.cv.spear(...) or $run.spear() to generate the $fit object.")
}
if(is.null(self$options$current.weight.idx)){
stop("ERROR: $options$current.weight.idx is NULL. Set this parameter manually [1 through nrow($params$weights)] or use $set.weights()")
}
}
#' Confirm fold.ids will work in cv.spear
#' @param fold.ids Fold ids
#'@export
check.fold.ids = function(fold.ids){
Y <- private$get.Y()
return(any(sapply(unique(fold.ids), function(k){
for(j in 1:ncol(Y)){
subset = Y[which(fold.ids != k),j]
if(self$params$family != "gaussian" & any(table(subset) < 2)){
return(TRUE)
}
}
return(FALSE)
})))
}
#' Get factor scores from a SPEAR object for a particular dataset `data`.
#' @param data Which dataset to use? Can be any dataset listed under `$data$____`. Defaults to `"train"`.
#' @param cv If `data = "train"`, get factor scores generated from cross validation? If `$run.spear` was used or if `data != "train"` this parameter is ignored. Defaults to `FALSE`.
#'@export
get.factor.scores = function(data = "train", cv = FALSE){
private$check.fit()
X <- private$get.concatenated.X(data = data)
if(data == "train" & self$fit$type == "cv" & cv){
factor.scores.cv <- matrix(NA, nrow = nrow(X), ncol = self$params$num.factors)
for(i in 1:length(self$params$fold.ids)){
factor.scores.cv[i,] <- X[i,] %*% self$fit$regression.coefs.cv[,, self$params$fold.ids[i], self$options$current.weight.idx]
}
rownames(factor.scores.cv) <- rownames(X)
colnames(factor.scores.cv) <- paste0("Factor", 1:ncol(factor.scores.cv))
return(factor.scores.cv)
} else if(cv == TRUE){
warning("Warning: Can't return CV factor scores (data != 'train'). Returning factor scores from all data.")
}
factor.scores <- X %*% self$fit$regression.coefs[,,self$options$current.weight.idx]
rownames(factor.scores) <- rownames(X)
colnames(factor.scores) <- paste0("Factor", 1:ncol(factor.scores))
return(factor.scores)
}
#' Get features from a SPEAR object.
#' @param rank How to rank features? Defaults to `"probability"` (joint.probability) followed by projection.coefficient. Can also be "`coefficient`" (only projection.coefficient magnitude).
#' @param factors Which factors to return features for? Accepts integers (i.e. `factors = c(1,2,4)` or `factors = 1`) Defaults to `NULL` (all).
#' @param assays Which assays? Check names via `names(SPEAR$data$train)`. Defaults to `NULL` (all)
#' @param coefficient.cutoff What projection.coefficient value to use as the cutoff? Defaults to 0
#' @param probability.cutoff What joint.probability value to use as the cutoff? Defaults to .95
#' @param correlation Return columns with factor score correlations? Accepts `"spearman"`, `"pearson"`, `"kendall"`, or "`none`" (default)
#' @param simplify Simplify data.frame output to only include Factor, Analyte, Assay, joint.probability and projection.coefficient? Defaults to `TRUE`
#'@export
get.analyte.scores = function(rank = "probability", factors = NULL, assays = NULL, coefficient.cutoff = 0, probability.cutoff = .95, correlation = "none", simplify = TRUE){
private$check.fit()
w.idx <- self$options$current.weight.idx
if(is.null(factors)){
factors <- 1:self$params$num.factors
}
factors.string <- paste0("Factor", factors)
if(is.null(assays)){
assays <- names(self$data$train)
} else {
if(any(!assays %in% names(self$data$train))){
stop("ERROR: assays provided not recognized. Need to be names found in names(SPEAR$data$train)")
}
}
X <- private$get.concatenated.X()
features <- list()
w.joint.probabilities = self$fit$joint.probs[,,w.idx]
w.marginal.probabilities = self$fit$marginal.probs[,,w.idx]
w.projection.probabilities = self$fit$projection.probs[,,w.idx]
w.nonzero.probabilities = self$fit$nonzero.probs[,,w.idx]
w.regression.coefficients = self$fit$regression.coefs[,,w.idx]
w.projection.coefficients = self$fit$projection.coefs.x[,,w.idx]
colnames(w.joint.probabilities) <- paste0("Factor", 1:ncol(w.joint.probabilities))
rownames(w.joint.probabilities) <- colnames(X)
colnames(w.marginal.probabilities) <- paste0("Factor", 1:ncol(w.marginal.probabilities))
rownames(w.marginal.probabilities) <- colnames(X)
colnames(w.projection.probabilities) <- paste0("Factor", 1:ncol(w.projection.probabilities))
rownames(w.projection.probabilities) <- colnames(X)
colnames(w.nonzero.probabilities) <- paste0("Factor", 1:ncol(w.nonzero.probabilities))
rownames(w.nonzero.probabilities) <- colnames(X)
colnames(w.regression.coefficients) <- paste0("Factor", 1:ncol(w.joint.probabilities))
rownames(w.regression.coefficients) <- colnames(X)
colnames(w.projection.coefficients) <- paste0("Factor", 1:ncol(w.joint.probabilities))
rownames(w.projection.coefficients) <- colnames(X)
# by factor
num.omics <- length(self$data$train)
factor.features <- list()
for(f in 1:self$params$num.factors){
temp <- list()
factor <- paste0("Factor", f)
s <- 1
for(o in 1:num.omics){
omic.features <- list()
w.joint.probabilities.current <- w.joint.probabilities[s:(s + nrow(self$data$train[[o]]) - 1),factor]
w.marginal.probabilities.current <- w.marginal.probabilities[s:(s + nrow(self$data$train[[o]]) - 1),factor]
w.projection.probabilities.current <- w.projection.probabilities[s:(s + nrow(self$data$train[[o]]) - 1),factor]
w.nonzero.probabilities.current <- w.nonzero.probabilities[s:(s + nrow(self$data$train[[o]]) - 1),factor]
w.regression.coefficients.current <- w.regression.coefficients[s:(s + nrow(self$data$train[[o]]) - 1),factor]
w.projection.coefficients.current <- w.projection.coefficients[s:(s + nrow(self$data$train[[o]]) - 1),factor]
s <- s + nrow(self$data$train[[o]])
# Round loading probabilities to 7 decimals (to rank 1's):
#w.joint.probabilities.current <- round(w.joint.probabilities.current, 7)
#w.marginal.probabilities.current <- round(w.marginal.probabilities.current, 7)
#w.projection.probabilities.current <- round(w.projection.probabilities.current, 7)
feat.order <- order(w.joint.probabilities.current, abs(w.projection.coefficients.current), decreasing = TRUE)
w.regression.coefficients.current <- w.regression.coefficients.current[feat.order]
w.projection.coefficients.current <- w.projection.coefficients.current[feat.order]
w.joint.probabilities.current <- w.joint.probabilities.current[feat.order]
w.marginal.probabilities.current <- w.marginal.probabilities.current[feat.order]
w.projection.probabilities.current <- w.projection.probabilities.current[feat.order]
w.nonzero.probabilities.current <- w.nonzero.probabilities.current[feat.order]
temp[[names(self$data$train)[o]]]$features <- names(w.joint.probabilities.current)
temp[[names(self$data$train)[o]]]$joint.probabilities <- w.joint.probabilities.current
temp[[names(self$data$train)[o]]]$marginal.probabilities <- w.marginal.probabilities.current
temp[[names(self$data$train)[o]]]$projection.probabilities <- w.projection.probabilities.current
temp[[names(self$data$train)[o]]]$nonzero.probabilities <- w.nonzero.probabilities.current
temp[[names(self$data$train)[o]]]$projection.coefficients <- w.projection.coefficients.current
temp[[names(self$data$train)[o]]]$regression.coefficients <- w.regression.coefficients.current
}
features[[factor]] <- temp
}
fs <- self$get.factor.scores(cv = FALSE)
fs.cv <- self$get.factor.scores(cv = TRUE)
results.list <- list()
for(f in factors.string){
for(o in assays){
feature.vec <- features[[f]][[o]]
passed.coeff.cutoff <- abs(feature.vec$projection.coefficients) >= coefficient.cutoff
passed.prob.cutoff <- feature.vec$joint.probabilities >= probability.cutoff
passed.total.cutoff <- passed.coeff.cutoff & passed.prob.cutoff
if(sum(passed.total.cutoff) > 0){
res <- as.data.frame(feature.vec)[passed.coeff.cutoff & passed.prob.cutoff,]
res$omic <- o
res$factor <- f
res$cor.in.sample <- NA
res$cor.cv <- NA
res$pval.in.sample <- NA
res$pval.cv <- NA
# Correlations:
if(correlation != "none"){
for(r in 1:nrow(res)){
vals <- self$data$train$Xlist[[o]][,which(colnames(self$data$train$Xlist[[o]]) == res$features[r])]
res$cor.in.sample[r] <- cor(vals, fs[,f], method = correlation)
res$cor.cv[r] <- cor(vals, fs.cv[,f], method = correlation)
res$pval.in.sample[r] <- suppressWarnings(cor.test(vals, fs[,f], method = correlation)$p.value)
res$pval.cv[r] <- suppressWarnings(cor.test(vals, fs.cv[,f], method = correlation)$p.value)
}
colnames(res) <- c("Analyte", "joint.probability", "marginal.probability", "projection.probability", "nonzero.probability", "projection.coefficient", "regression.coefficient", "Assay", "Factor", "factor.correlation", "factor.cv.correlation", "factor.cor.pvalue", "factor.cv.cor.pvalue")
rownames(res) <- NULL
res <- dplyr::select(res,
Factor,
Analyte,
Assay,
regression.coefficient,
projection.coefficient,
joint.probability,
marginal.probability,
projection.probability,
nonzero.probability,
factor.correlation,
factor.cor.pvalue,
factor.cv.correlation,
factor.cv.cor.pvalue)
} else {
colnames(res) <- c("Analyte", "joint.probability", "marginal.probability", "projection.probability", "nonzero.probability", "projection.coefficient", "regression.coefficient", "Assay", "Factor")
rownames(res) <- NULL
res <- dplyr::select(res,
Factor,
Analyte,
Assay,
regression.coefficient,
projection.coefficient,
joint.probability,
marginal.probability,
projection.probability,
nonzero.probability)
}
results.list[[length(results.list) + 1]] <- res
}
}
}
if(length(results.list) == 0){
warning("*** Warning: No features found within cutoffs for chosen Factors/assays Returning data.frame with 0 rows. Try broadening the criteria.")
total.results <- data.frame( Factor = character(0),
Analyte = character(0),
Assay = character(0),
regression.coefficient = numeric(0),
projection.coefficient = numeric(0),
joint.probability = numeric(0),
marginal.probability = numeric(0),
projection.probability = numeric(0),
nonzero.probability = numeric(0))
} else {
total.results <- do.call("rbind", results.list)
total.results <- dplyr::arrange(total.results, Factor, -joint.probability, -abs(projection.coefficient))
}
# Return simplified matrix:
if(simplify){
total.results <- dplyr::select(total.results, Factor, Analyte, Assay, joint.probability, projection.coefficient)
}
return(total.results)
}
#' Get features from a SPEARobject.
#' @param rank How to rank features? Defaults to `"probability"` (joint.probability) followed by projection.coefficient. Can also be "`coefficient`" (only projection.coefficient magnitude).
#' @param factors Which factors to return signatures for? Accepts integers (i.e. `factors = c(1,2,4)` or `factors = 1`) Defaults to `NULL` (all).
#' @param assays Which assays? Check names via `names(SPEAR$data$train)`. Defaults to `NULL` (all)
#' @param coefficient.cutoff What projection.coefficient value to use as the cutoff? Defaults to 0
#' @param probability.cutoff What joint.probability value to use as the cutoff? Defaults to .95
#' @param correlation Return columns with factor score correlations? Accepts `"spearman"`, `"pearson"`, `"kendall"`, or "`none`" (default)
#'@export
get.signatures = function(rank = "probability", factors = NULL, assays = NULL, coefficient.cutoff = 0, probability.cutoff = .95, correlation = "none"){
total.results <- self$get.analyte.scores(rank = rank, factors = factors, assays = assays, coefficient.cutoff = coefficient.cutoff, probability.cutoff = probability.cutoff, correlation = correlation)
if(nrow(total.results) == 0){
stop("ERROR: No analytes found for the criteria selected. Try to loosen the cutoffs or select a different factor.")
}
if(is.null(factors)){
factors <- 1:self$params$num.factors
}
factor.vec <- paste0("Factor", factors)
assay.vec <- names(self$data$train)
# Return list of signatures per assay:
sigs <- list()
for(f in 1:length(factor.vec)){
tmp <- list()
for(a in 1:length(assay.vec)){
filt <- dplyr::filter(total.results, Factor == factor.vec[f], Assay == assay.vec[a])
if(nrow(filt) > 0){
tmp[[assay.vec[a]]] <- filt$Analyte
} else {
tmp[[assay.vec[a]]] <- character(0)
}
}
sigs[[factor.vec[f]]] <- tmp
}
return(sigs)
}
#' Randomly generate identifiers for analytes (to prevent misidentification)
#'@export
generate.spear.ids = function(){
Xlist <- as.list(MultiAssayExperiment::experiments(self$data$train))
assays = c()
assay.ids = c()
analytes = c()
spear.ids <- c()
for(a in 1:length(Xlist)){
tmp <- Xlist[[a]]
assays <- c(assays, rep(names(Xlist)[a], nrow(tmp)))
assay.ids <- c(assay.ids, rep(a, nrow(tmp)))
analytes <- c(analytes, rownames(tmp))
spear.ids <- c(spear.ids, paste0("sid-", sprintf("%02d", a), "-", sprintf("%06d", 1:nrow(tmp))))
}
df <- data.frame(
Assay = assays,
Assay.id = assay.ids,
Analyte = analytes,
SPEAR.id = spear.ids
)
self$data$spear.ids <- df
return(invisible(self))
}
#' Randomly generate identifiers for analytes (to prevent misidentification)
#' @param data Name of MAE stored under $data (defaults to "train")
#'@export
get.concatenated.X = function(data = "train"){
return(X <- t(do.call("rbind", as.list(MultiAssayExperiment::experiments(self$data[[data]])))))
}
#' Randomly generate identifiers for analytes (to prevent misidentification)
#' @param data Name of MAE stored under $data (defaults to "train")
#'@export
get.Y = function(data = "train"){
if(self$params$family == "multinomial"){
Y <- as.matrix(private$one.hot.encode.multinomial())
} else if(self$params$family == "ordinal" | self$params$family == "binomial"){
Y <- as.matrix(private$encode.ordinal())
} else { # gaussian
Y <- as.matrix(MultiAssayExperiment::colData(self$data[[data]])[,which(colnames(MultiAssayExperiment::colData(self$data[[data]])) %in% self$params$response), drop = FALSE])
}
return(Y)
}
#' Randomly generate fold.ids to be used for reproducibility in `$run.cv.spear(...)`. See `method` for different ways to generate fold.ids (for cross-validation).
#' @param method How to generate the fold ids? `method = "balanced"` (default) will try to achieve maximum balance amongst classes in `Y` (when `family = "ordinal"` or `family = "multinomial"`). `method = "random"` will randomly assign a number 1-`num.folds`.
#'@export
generate.fold.ids = function(method = "balanced"){
if(self$params$family == "gaussian"){
method = "random"
}
num.folds <- self$params$num.folds
Y <- private$get.Y()
if(method == "balanced"){
if(self$params$family == "multinomial"){
Y <- apply(Y, 1, which.max)
} else {
if(ncol(Y) > 1){
warning("More than one column in Y provided. Balancing fold.ids with respect to $options$current.response.idx (", self$options$current.response.idx, " | ", self$params$response[self$options$current.response.idx], ")")
}
Y <- Y[,self$options$current.response.idx]
}
df <- as.data.frame(Y)
nfolds = self$params$num.folds
N <- nrow(Y)
df$foldids = NA
for(i in sort(unique(Y))){
N.temp <- sum(Y == i)
loc <- which(Y == i)
vals <- sample(x = rep(1:nfolds, ceiling(N.temp/nfolds)), size = N.temp, replace = F)
df$foldids[loc] <- vals
}
output <- df$foldids
names(output) <- rownames(Y)
return(output)
} else if(method == "random"){
nfolds = self$params$num.folds
N <- nrow(Y)
fold.ids <- sample(x = rep(1:nfolds, ceiling(N/nfolds)), size = N, replace = F)
names(fold.ids) <- rownames(Y)
return(fold.ids)
} else {
stop("ERROR: method not recognized. Can be 'balanced' or 'random'")
}
}
#' Get predictions from a SPEARobject and a dataset `data`.
#' @param data Which dataset to use? Can be any dataset listed under `$data$____`. Defaults to `"train"`.
#' @param cv If `data = "train"`, get factor scores generated from `$run.cv.spear`? If `$run.spear` was used or if `data != "train"` this parameter is ignored. Defaults to `TRUE`.
#'@export
get.predictions = function(data = "train", cv = TRUE){
X <- private$get.concatenated.X(data = data)
Y <- private$get.Y(data = data)
if(data == "train" & self$fit$type == "cv" & cv){
# Get CV Predictions:
factor.scores <- self$get.factor.scores(data = data, cv = TRUE)
preds = matrix(NA, nrow = nrow(X), ncol = ncol(Y))
for(i in 1:nrow(preds)){
preds[i,] <- factor.scores[i,] %*% self$fit$projection.coefs.y.cv.scaled[,,self$params$fold.ids[i],self$options$current.weight.idx]
}
rownames(preds) <- rownames(X)
colnames(preds) <- colnames(Y)
} else {
# Get in.sample Predictions:
factor.scores <- self$get.factor.scores(data = data, cv = FALSE)
preds = matrix(NA, nrow = nrow(X), ncol = ncol(Y))
for(i in 1:nrow(preds)){
preds[i,] <- factor.scores[i,] %*% self$fit$projection.coefs.y.scaled[,,self$options$current.weight.idx]
}
rownames(preds) <- rownames(X)
colnames(preds) <- colnames(Y)
}
# Return signal if gaussian, otherwise use signal with cutoffs
if(self$params$family == "gaussian"){
return(preds)
} else if(self$params$family == "multinomial"){
signal <- preds
intercept = sapply(self$fit$intercepts.scaled, function(z) z[self$options$current.weight.idx])
Pmat = t(apply(signal, 1, function(z) exp(z+intercept)/sum(exp(z+intercept))))
predictions <- matrix(apply(Pmat, 1, which.max) - 1, ncol=1)
predictions.encoded = matrix(0, ncol = ncol(Pmat), nrow = nrow(Pmat))
for(i in 1:nrow(predictions.encoded)){
predictions.encoded[i,predictions[i]] <- 1
}
rownames(predictions) <- rownames(preds)
colnames(predictions) <- "pred.class"
rownames(predictions.encoded) <- rownames(preds)
colnames(predictions.encoded) <- colnames(Y)
results.combined <- list(class.predictions = predictions,
encoded.predictions = predictions.encoded,
class.probabilities = Pmat,
signal = signal)
return(results.combined)
}
# If family == "binomial" or "ordinal"
results = list()
# Go column by column (once for each response variable...)
for(j in 1:ncol(preds)){
resp.name <- colnames(preds)[j]
# Get signal (predictions) for current response...
signal <- matrix(preds[,j], ncol = 1)
rownames(signal) <- rownames(preds)
colnames(signal) <- c(resp.name)
# Get intercepts (scaled)
intercept = self$fit$intercepts.scaled[[j]][self$options$current.weight.idx,]
# Set up Probability Matrices:
Pmat0 = matrix(0, ncol = length(intercept), nrow = length(signal))
Pmat = matrix(0, ncol = length(intercept) + 1, nrow = length(signal))
# Use signal and intercept to calculate probabilities:
for(k in 1:ncol(Pmat0)){
Pmat0[,k] = 1.0/(1+exp(-signal - intercept[k]))
}
for(k in 1:ncol(Pmat)){
if(k == 1){
Pmat[,1] = 1-Pmat0[,1]
}else if(k == ncol(Pmat)){
Pmat[,k] = Pmat0[,k-1]
}else{
Pmat[,k] = Pmat0[,k-1] - Pmat0[,k]
}
}
# Format Pmat to get class.predictions
class.predictions <- matrix(apply(Pmat, 1, which.max) - 1, ncol=1)
rownames(class.predictions) <- rownames(preds)
colnames(class.predictions) <- c(resp.name)
rownames(Pmat) <- rownames(preds)
colnames(Pmat) <- paste0("Class", 0:(ncol(Pmat)-1))
rownames(Pmat0) <- rownames(preds)
# Return class.predictions, Pmat, and signal
results[[resp.name]] <- list(class.predictions = class.predictions,
class.separations = Pmat0,
class.probabilities = Pmat,
signal = signal)
}
return(results)
}
#' Assess the predictions from the SPEAR model using a variety of metrics. Utilizes the 'yardstick' package under the 'tidymodels' umbrella.
#' @param data Which dataset to use? Can be any dataset listed under `$data$____`. Defaults to `"train"`.
#' @param cv If `data = "train"`, get factor scores generated from `$run.cv.spear`? If `$run.spear` was used or if `data != "train"` this parameter is ignored. Defaults to `TRUE`.
#'@export
assess <- function(data = "train", cv = TRUE){
family <- self$params$family
Y <- private$get.Y(data = data)
if(family == "gaussian"){
# Make data.frame of true x estimates:
preds <- self$get.predictions(data = data, cv = cv)
for(p in 1:ncol(Y)){
resp <- colnames(Y)[p]
vals <- unname(unlist(Y))
estimates <- preds[,p]
tidy.pred.df <- data.frame(
truth = vals,
estimate = estimates,
Response = resp
)
tidy.pred.df <- dplyr::group_by(tidy.pred.df, Response)
}
} else if(family == "multinomial"){
preds <- self$get.predictions(data = data, cv = cv)
true <- apply(Y, 1, which.max)-1
estimates <- preds$class.predictions
# make factor:
true <- factor(true)
estimates <- factor(estimates, levels = levels(true))
tidy.pred.df <- data.frame(
truth = true,
estimate = estimates
)
tidy.prob.df <- preds$class.probabilities
tidy.pred.df <- cbind(tidy.pred.df, tidy.prob.df)
colnames(tidy.pred.df) <- c("truth", "estimate", levels(true))
} else {
# Ordinal or Binomial
all.pred.df <- data.frame()
for(resp in colnames(Y)){
preds <- self$get.predictions(data = data, cv = cv)[[resp]]
true <- Y
estimates <- preds$class.predictions
# make factor:
true <- factor(true)
estimates <- factor(estimates, levels = levels(true))
tmp.df <- data.frame(
truth = true,
estimate = estimates
)
tidy.prob.df <- preds$class.probabilities
tmp.df <- cbind(tmp.df, tidy.prob.df, resp)
colnames(tmp.df) <- c("truth", "estimate", levels(true), "Response")
all.pred.df <- rbind(all.pred.df, tmp.df)
}
all.pred.df <- dplyr::group_by(all.pred.df, Response)
tidy.pred.df <- all.pred.df
}
return(tidy.pred.df)
}
#' Choose a combination of weights for a SPEAR object. Print `SPEAR$params$weights` to see all combinations trained.
#' @param w.idx Use a manual weight index (row from `SPEAR$params$weights`)? Defaults to `NULL`
#' @param method Which method to use? Needs to be from a `$train.spear(cv = TRUE)` object. Can be `"min"` (overall lowest mean cross validated loss), or `"sd"` (take highest weight within 1 sd cross validated loss).
#' @examples
#' SPEARobj <- SPEAR$new(...)
#'
#' SPEARobj$train.spear(cv = TRUE)
#'
#' # Set the weights
#' SPEARobj$set.weights()
#' SPEARobj$set.weights(w.x = 0.1)
#' SPEARobj$set.weights(w.idx = 1)
#' SPEARobj$set.weights(method = "sd")
#'@export
set.weights = function(w.idx = NULL, w.x = NULL, w.y = NULL, method = "sd"){
private$check.fit()
if(!is.null(w.idx)){
if(!w.idx %in% 1:nrow(self$params$weights)){
stop("ERROR: w.idx supplied not valid (must be between 1 and ", nrow(self$params$weights), "). Try SPEAR$params$weights to see all combinations (rows).")
} else {
if(!self$options$quiet){
cat("Setting current weight index to ", w.idx, "\n",
"w.x: ", self$params$weights[w.idx,1],
sep = "")
}
self$options$current.weight.idx = w.idx
}
}else if(!is.null(w.x) | !is.null(w.y)){
if(is.null(w.x)){w.x <- 1}
if(is.null(w.y)){w.y <- 1}
sums <- sapply(1:nrow(self$params$weights), function(i){return(c(self$params$weights[i,1] - w.x, self$params$weights[i,2] - w.y))})
if(!any(colSums(sums) == 0)){
warning("*** Warning: w.x = ", w.x, " | w.y = ", w.y, " not found among possible weight combinations (SPEARobj$params$weights). Using closest SPEAR weight...\n")
}
w.idx = which.min(abs(colSums(sums)))
if(!self$options$quiet){
cat("Setting current weight index to ", w.idx, "\n",
"w.x: ", self$params$weights[w.idx,1],
sep = "")
}
self$options$current.weight.idx = w.idx
} else {
# method:
w.idx <- private$get.widx.from.method(method = method)
if(!self$options$quiet){
cat("Setting current weight index to ", w.idx, "\n",
"w.x: ", self$params$weights[w.idx,1], "\n",
"w.y: ", self$params$weights[w.idx,2], "\n",
sep = "")
}
self$options$current.weight.idx = w.idx
}
if(!self$options$quiet){cat("\n")}
return(invisible(self))
}
#' Get a w.idx from a method (private function, can't be called publicly, use `set.weights` instead)
#' @param method Which method to use?
#'@export
get.widx.from.method <- function(method){
if(self$fit$type != "cv"){
stop("ERROR: $fit needs to come from $train.spear(cv = TRUE), not (cv = FALSE) to use a w.idx selection method.")
} else if(is.null(self$fit$loss)){
stop("ERROR: $cv.evaluate needs to be run to calculate the loss.")
}
# min -----------
if(method == "min"){
if(ncol(self$fit$loss$cvm) > 1){
overall.cvm <- apply(self$fit$loss$cvm, 1, rowSums)
} else {
overall.cvm <- self$fit$loss$cvm
}
w.idx <- which.min(overall.cvm)
} else if(method == "sd"){
if(ncol(self$fit$loss$cvm) > 1){
overall.cvm <- rowSums(self$fit$loss$cvm)
overall.cvsd <- rowSums(self$fit$loss$cvsd)
} else {
overall.cvm <- self$fit$loss$cvm
overall.cvsd <- self$fit$loss$cvsd
}
tmp.w.idx <- which.min(overall.cvm)
tmp.cvsd <- overall.cvsd[tmp.w.idx]
w.idx <- min(which(overall.cvm <= overall.cvm[tmp.w.idx] + tmp.cvsd))
} else {
stop("ERROR: method not recognized. See documentation for available methods to select a weight index.")
}
return(w.idx)
}
#' Plot factor scores from a SPEARobject for a particular dataset `data`.
#' @param data Which dataset to use? Can be any dataset listed under `$data$____`. Defaults to `"train"`.
#' @param cv If `data = "train"`, get factor scores generated from `$run.cv.spear`? If `$run.spear` was used or if `data != "train"` this parameter is ignored. Defaults to `FALSE`. NOTE: There is an element of randomness if the factor scores are not correlated with the response, so it is recomended to view the factor scores with `cv = FALSE`
#' @param group What to plot on the x.axis? Defaults to "response" (indicated by $options$current.response.idx), but can be any feature name found in colnames(SPEARobj$data$train$X)
#' @param factors Which factors to show? Defaults to `NULL` (all factors). Use a vector of integers (i.e. `c(1, 2, 4)`) to specify fewer factors.
#' @param plot.type Which type of plot (for `ordinal`, `binomial`, and `multinomial` families)? Can be `"violin"` (default) or `"boxplot"`. Ignored if `family == "gaussian"`.
#' @param show.legend Show the legend of the plot (for `group`)? Defaults to `FALSE`
#'@export
plot.factor.scores = function(data = "train", cv = FALSE, group = "response", factors = NULL, plot.type = "violin", show.legend = FALSE, scale = TRUE){
fs <- self$get.factor.scores(data = data, cv = cv)
if(scale){
fs <- apply(fs, 2, scale)
}
Y <- private$get.Y(data = data)
if(is.null(factors)){
factors <- 1:ncol(fs)
} else if(any(!factors %in% 1:ncol(fs))){
stop("ERROR: factors argument incorrect. Must be a single integer or vector of number spanning 1 to ", ncol(fs))
}
fs <- fs[,factors, drop = FALSE]
df <- as.data.frame(fs)
if(group == "response"){
# Gaussian:
if(self$params$family == "gaussian"){
df$group <- Y[,self$options$current.response.idx]
group.label <- self$params$response[self$options$current.response.idx]
}
# Multinomial or Ordinal
else {
df$group <- MultiAssayExperiment::colData(self$data[[data]])[,self$params$response[self$options$current.response.idx]]
group.label <- names(self$options$color.scheme$Y)[[self$options$current.response.idx]]
}
}
# TODO: add support to plot w.r.t. an analyte...
# else if(!group %in% colnames(self$data[[data]]$X)){
# stop("ERROR: group parameter not recognized. Can be 'response' for the current response index, or any feature name found in self$data$train$X.")
#} else {
# df$group <- self$data[[data]]$X[,group]
# group.label <- group
#}
# Plot df:
df$id <- rownames(df)
df.melt <- reshape2::melt(df, id.vars = c("id", "group"))
g <- ggplot2::ggplot(df.melt)
# Gaussian:
if(self$params$family == "gaussian"){
g <- g + ggplot2::geom_point(ggplot2::aes(x = group, y = value)) +
ggplot2::geom_smooth(ggplot2::aes(x = group, y = value), formula = y ~ x, color = "red", method = "lm")
} else {
if(plot.type == "boxplot"){
g <- g + ggplot2::geom_jitter(ggplot2::aes(x = group, y = value, fill = group), shape = 21, stroke = .3, size = 2) +
ggplot2::geom_boxplot(ggplot2::aes(x = group, y = value, fill = group), alpha = .2, outlier.shape = NA)
} else if(plot.type == "violin"){
total.mean.df <- data.frame()
for(f in unique(df.melt$variable)){
means <- sapply(sort(unique(df.melt$group)), function(val){
return(mean(dplyr::filter(df.melt, variable == f, group == val)$value))
})
num.groups <- length(unique(df.melt$group))
mean.df <- data.frame(x = 1:num.groups - .5,
xend = 1:num.groups + .5,
y = means,
yend = means,
variable = f)
total.mean.df <- rbind(total.mean.df, mean.df)
}
g <- g +
ggplot2::geom_jitter(ggplot2::aes(x = group, y = value, fill = group), shape = 21, stroke = .3, size = 2, width = .15) +
ggplot2::geom_violin(ggplot2::aes(x = group, y = value, group = group, fill = group), alpha = .2) +
ggplot2::geom_segment(data = total.mean.df, ggplot2::aes(x = x, xend = xend, y = y, yend = yend), size = .85)
} else {
stop("ERROR: plot.type provided not recognized. Can be 'boxplot' or 'violin' (default).")
}
if(group == "response" & (self$params$family != "gaussian")){
colors <- self$options$color.scheme$Y[[self$options$current.response.idx]]
g <- g + ggplot2::scale_color_manual(values = colors) + ggplot2::scale_fill_manual(values = colors)# +
#ggplot2::scale_x_discrete(breaks = 0:(length(colors)-1), labels = colnames(self$data$train$Y))
}
}
g <- g + ggplot2::facet_wrap(ggplot2::vars(factor(variable, levels = paste0("Factor", factors))), scales = "free") +
ggplot2::ylab("Factor Score") +
ggplot2::xlab(group.label) +
ggplot2::labs(color = group.label, fill = group.label) +
ggplot2::theme_classic() +
ggplot2::theme(strip.background = ggplot2::element_rect(size = .3, fill = "grey90"),
plot.title = ggplot2::element_text(hjust = .5, size = 10),
axis.line = ggplot2::element_blank(),
axis.text.x = ggplot2::element_text(angle = 90, hjust = 1, vjust = .5),
panel.border = ggplot2::element_rect(colour = "black", fill=NA, size=.5),
legend.position = "bottom")
if(!show.legend){
g <- g + ggplot2::guides(color ="none", fill = "none")
}
return(g)
}
#' Plot variance explained for a SPEAR object.
#' @param data Which dataset to use? Can be any dataset listed under `$data$____`. Defaults to `"train"`.
#' @param min.val Minimum percentage to be plotted. Must be a positive numeric value. Defaults to `.01`.
#'@export
plot.variance.explained = function(data = "train", min.val = .01){
if(min.val < 0){
stop("ERROR: min.val must be >= 0...")
}
contributions <- self$get.variance.explained(data = data)
if(all(contributions < min.val)){
stop("ERROR: No assays found to explain sufficient variance for min.val. Try min.val = 0.")
}
projection.coefs.melt <- reshape2::melt(contributions)
g <- ggplot2::ggplot(projection.coefs.melt) +
ggplot2::geom_tile(ggplot2::aes(x = Var2, y = factor(Var1, levels = paste0("Factor", self$params$num.factors:1)), fill = ifelse(value >= min.val, value, NA)), color = "grey50") +
ggplot2::geom_text(data = dplyr::filter(projection.coefs.melt, value >= min.val),
ggplot2::aes(x = Var2, y = Var1, label = round(value, 2)), color = "black") +
ggplot2::scale_fill_gradient2(low = "white", high = "#6831A3", na.value = "white") +
ggplot2::labs(fill = "Var. Exp.") +
ggplot2::theme_void() + ggplot2::theme(axis.text = ggplot2::element_text())
# return plot, suppressing warnings for NA values
return(suppressWarnings(g))
}
#' Plot variance explained for a SPEAR object.
#' @param data Which dataset to use? Can be any dataset listed under `$data$____`. Defaults to `"train"`.
#' @param min.val Minimum value for a coefficient to be plotted. Must be a positive numeric value. Defaults to `.001`.
#'@export
plot.factor.contributions = function(min.val = .001){
if(min.val < 0){
stop("ERROR: min.val must be >= 0...")
}
contributions <- self$get.factor.contributions()
if(all(contributions < min.val)){
stop("ERROR: No factor coefficients found within cutoff of min.val. Try min.val = 0.")
}
projection.coefs.melt <- reshape2::melt(contributions)
g <- ggplot2::ggplot(projection.coefs.melt) +
ggplot2::geom_tile(ggplot2::aes(x = Var2, y = factor(Var1, levels = paste0("Factor", self$params$num.factors:1)), fill = ifelse(abs(value) >= min.val, value, NA)), color = "grey50") +
ggplot2::geom_text(data = dplyr::filter(projection.coefs.melt, abs(value) >= min.val),
ggplot2::aes(x = Var2, y = Var1, label = round(value, 3)), color = "black") +
ggplot2::scale_fill_gradient2(low = "#316CA3", high = "darkred", na.value = "white") +
ggplot2::labs(fill = "Coef.") +
ggplot2::theme_void() + ggplot2::theme(axis.text = ggplot2::element_text())
# return plot, suppressing warnings for NA values
return(g)
}
#' Get factor contributions to response prediction from a SPEAR object.
#'@export
get.factor.contributions = function(){
return(self$fit$projection.coefs.y.scaled[,,self$options$current.weight.idx, drop = FALSE])
}
#' Get variance explained per assay from a SPEAR object.
#' @param data Which dataset to use? Can be any dataset listed under `$data$____`. Defaults to `"train"`.
#'@export
get.variance.explained = function(data = "train"){
w.idx <- self$options$current.weight.idx
# X:
W.to_X <- self$fit$projection.coefs.x[,,w.idx]
Uhat <- self$get.factor.scores(data = data, cv = FALSE)
W.to_X.list <- list()
ind <- 1
for(d in 1:length(self$data[[data]])){
W.to_X.list[[d]] <- W.to_X[ind:(ind - 1 + nrow(self$data[[data]][[d]])),]
ind <- ind + nrow(self$data[[data]][[d]])
}
factor_contributions = array(NA,dim = c(self$params$num.factors, length(self$data[[data]])))
for(k in 1:self$params$num.factors){
for(o in 1:length(self$data[[data]])){
factor.contributions <- c()
X.tilde <- Uhat[,k] %*% t(W.to_X.list[[o]][,k])
# Get var.explained per feature:
X <- t(self$data[[data]][[o]])
X.tilde <- X.tilde + apply(X, 2, function(col){return(rnorm(n = length(col), mean = 0, sd = sqrt(var(col))/nrow(X)))})
factor_contributions[k,o] <- mean(sapply(1:ncol(X), function(cid){
tmp <- cor(X[,cid], X.tilde[,cid], method = "pearson")
if(is.na(tmp)){return(0)};
if(tmp < 0){return(0)};
return(tmp^2)
}))
}
}
colnames(factor_contributions) <- names(self$data[[data]])
rownames(factor_contributions) <- paste0("Factor", 1:nrow(factor_contributions))
return(factor_contributions)
}
#' Estimate the number of factors needed for the SPEAR model.
#' @param simplify Only estimate the minimum and maximum number of factors (don't perform iterative factor construction)? Defaults to `FALSE`.
#' @param assign Should the SPEAR model parameter `$params$num.factors` be updated by the recommendation? Defaults to `TRUE`
#' @param num.iterations How many iterations to run to get average number of factors? Defaults to `10`.
#' @param max.num.factors Maximum number of factors? If `NULL` (default), will return `floor(N/10)` where `N` = number of samples in `X`
#' @param min.num.factors Minimum number of factors? If `NULL` (default), will calculate from SVD.
#' @param seed An integer provided for reproducibility, defaults to `NULL` (will not set seed)
#' @param SNR Signal-to-noise ratio parameter used for factor estimation. Defaults to `.1`.
#'@export
estimate.num.factors = function(assign = TRUE, num.iterations = 10, max.num.factors = NULL,
min.num.factors = NULL, SNR = 0.1, quiet = FALSE, simplify = FALSE){
# Combine datasets if a list is provided:
X <- private$get.concatenated.X()
# Get N (# samples) and P (total # features)
p = ncol(X)
n = nrow(X)
# Run SVD on X
if(!quiet){cat("--------------------\n")}
if(is.null(min.num.factors)){
if(!self$options$quiet){cat("Running initial SVD for min:\n")}
tmp1 = svd(X)$d
sum0 = cumsum(tmp1^2)
sum0 = sum0/sum0[length(sum0)]
min.num.factors = which(sum0>0.5)[1]
min.method <- paste0("SVD")
} else {
min.method <- "fixed"
}
if(is.null(max.num.factors)){
max.num.factors = floor(n/10)
max.method <- paste0("floor(", n, "/10)")
} else {
max.method <- "fixed"
}
# if simplify, just return min number of factors calculated
if(simplify){
if(min.method == "fixed"){stop("ERROR: Can't supply min.num.factors while simplify is TRUE. Needs to be NULL.")}
if(!self$options$quiet){cat("min: ", min.num.factors," factors [", min.method, "]\n", sep = "")}
if(assign){
if(!self$options$quiet){cat("Assigning: $params$num.factors = ", min.num.factors, sep = "")}
self$params$num.factors <- min.num.factors
}
return(invisible(self))
}
# Otherwise, run through iterative procedure, first check for min < max:
if(!self$options$quiet){cat("min: ", min.num.factors," factors [", min.method, "]\n", sep = "")}
if(!self$options$quiet){cat("max: ", max.num.factors," factors [", max.method, "]\n", sep = "")}
if(!self$options$quiet){cat("--------------------\n")}
# Max is greater than min. Return min:
if(max.num.factors <= min.num.factors){
cat("max factors (", max.num.factors, ") <= min factors (", min.num.factors, ")\n", sep = "")
cat("Suggested: ", min.num.factors, " factors (min)\n", sep = "")
if(assign){
if(!self$options$quiet){cat("Assigning: $params$num.factors = ", min.num.factors, sep = "")}
self$params$num.factors <- min.num.factors
}
return(invisible(self))
}else{
# Test iterations:
residuals = matrix(1, ncol =max.num.factors, nrow = num.iterations)
for(k in 1:num.iterations){
if(!quiet){cat("Iter ", k, ":", num.iterations, "\t", sep = "")}
E = matrix(rnorm(n*p), nrow = n, ncol = p)
X1fake = X+E*sqrt(0.01*(1-SNR))
X2fake = X-E*sqrt(100*(1-SNR))
tmp2 = svd(X1fake)
R = X2fake
for(i in 1:max.num.factors){
R = R - tmp2$u[,i]%*%t(tmp2$v[,i])*tmp2$d[i]
residuals[k,i] = mean(R^2)
}
if(!self$options$quiet){cat("Best = ", which.min(residuals[k,]), " factors\n", sep = "")}
}
residuals_mean = apply(residuals,2,mean)
id_select = which.min(residuals_mean)
if(!self$options$quiet){cat("----------------\n")}
if(!self$options$quiet){cat("min: ", min.num.factors, " factors\n", sep = "")}
if(!self$options$quiet){cat("avg: ", id_select, " factors\n", sep = "")}
if(!self$options$quiet){cat("max: ", max.num.factors, " factors\n", sep = "")}
if(!self$options$quiet){cat("----------------\n")}
if(id_select < min.num.factors){
if(!self$options$quiet){cat("Suggested: ", min.num.factors, " factors (min)\n", sep = "")}
id_select = min.num.factors
} else if(id_select > max.num.factors){
if(!self$options$quiet){cat("Suggested: ", max.num.factors, " factors (max)\n", sep = "")}
id_select = min.num.factors
} else {
if(!self$options$quiet){cat("Suggested: ", id_select, " factors (avg)\n", sep = "")}
}
if(assign){
if(!self$options$quiet){cat("Assigning: $params$num.factors = ", id_select, sep = "")}
self$params$num.factors <- id_select
}
return(invisible(self))
}
}
#' Get the loss from the cross validation. Must be called from a SPEARobject trained with `$run.cv.spear()` then `$cv.evaluate()`.
#'@export
get.cv.loss <- function(){
if(self$fit$type != "cv"){
stop("ERROR: Needs to be a cv.spear object (run SPEARobj$run.cv.spear()...)")
}
loss.output <- self$fit$loss
# set column end for multinomial:
response.labels <- self$params$response
# set initial dfs
loss.output$min <- data.frame()
loss.output$sd <- data.frame()
# Get best w.idxs
min.widx = private$get.widx.from.method(method = "min")
sd.widx = private$get.widx.from.method(method = "sd")
for(j in 1:length(response.labels)){
response = response.labels[j]
# min
cvm = loss.output$cvm[min.widx,j]
cvsd = loss.output$cvsd[min.widx,j]
loss.temp <- data.frame(response = response,
widx = min.widx,
cvm = cvm,
cvsd = cvsd)
loss.output$min <- rbind(loss.output$min, loss.temp)
# sd
cvm = loss.output$cvm[sd.widx,j]
cvsd = loss.output$cvsd[sd.widx,j]
loss.temp <- data.frame(response = response,
widx = sd.widx,
cvm = cvm,
cvsd = cvsd)
loss.output$sd <- rbind(loss.output$sd, loss.temp)
}
rownames(loss.output$min) <- NULL
rownames(loss.output$sd) <- NULL
return(loss.output)
}
#' Get the loss from the cross validation. Must be called from a SPEARobject trained with `$run.cv.spear()` then `$cv.evaluate()`.
#'@export
plot.cv.loss <- function(show.sd = TRUE, show.min = TRUE){
if(self$fit$type != "cv"){
stop("ERROR: Needs to be a cv.spear object (run SPEARobj$run.cv.spear()...)")
}
# CVM
cvm.df <- self$fit$loss$cvm
cvm.df.melt <- reshape2::melt(cvm.df)
colnames(cvm.df.melt)[which(colnames(cvm.df.melt) == "value")] <- "cvm"
# CVSD
cvsd.df <- self$fit$loss$cvsd
cvsd.df.melt <- reshape2::melt(cvsd.df)
colnames(cvsd.df.melt)[which(colnames(cvsd.df.melt) == "value")] <- "cvsd"
# Join together CVM and CVSD:
tmp <- suppressMessages(dplyr::inner_join(cvm.df.melt, cvsd.df.melt,
id.vars = c("Var1", "Var2")))
# Generate labels for x:
x.labs <- sapply(1:nrow(cvm.df), function(i){
return(paste0("w.x = ", self$params$weights[i,1], "\n", "idx = ", i))
})
# Get sd / min idx:
sd.idx <- private$get.widx.from.method(method = "sd")
min.idx <- private$get.widx.from.method(method = "min")
# Generate plot:
plt <- ggplot2::ggplot(tmp) +
ggplot2::geom_point(ggplot2::aes(x = Var1, y = cvm), size = 3) +
ggplot2::geom_errorbar(ggplot2::aes(x = Var1, ymin = cvm - cvsd, ymax = cvm + cvsd), size = .3, width = .2) +
ggplot2::scale_x_discrete(labels = x.labs) +
ggplot2::theme_void() + ggplot2::theme(axis.text = ggplot2::element_text(),
panel.grid.major.y = ggplot2::element_line(size = .2, color = "grey50"),
panel.border = ggplot2::element_rect(fill = NA),
strip.text = ggplot2::element_text(vjust = .5, size = 10)) +
ggplot2::facet_wrap(ggplot2::vars(Var2))
if(show.sd){
plt <- plt + ggplot2::geom_point(data = dplyr::filter(tmp, Var1 == paste0("widx", sd.idx)), ggplot2::aes(x = Var1, y = cvm), size = 6, color = "blue")
}
if(show.min){
plt <- plt + ggplot2::geom_point(data = dplyr::filter(tmp, Var1 == paste0("widx", min.idx)), ggplot2::aes(x = Var1, y = cvm), size = 4.5, color = "red")
}
return(plt)
}
#' Choose a response to be preferred for plotting functions. Will set `SPEARobj$options$current.response.idx` (which can be set manually as well)
#' @param response Which response to use? Can be an integer (which column number/index?) or string (which column name)? If string, must be found in `colnames(SPEARobj$data$train$Y)`. Defaults to NULL (`1`)
#' @examples
#' # Use the column number / index
#' SPEARobj$set.response(1)
#'
#' # use the column name instead
#' SPEARobj$set.response("response_name")
#'@export
set.response = function(response = NULL){
private$check.fit()
if(self$params$family == "multinomial"){
response.idx = 1
} else {
if(is.null(response)){
response.idx = 1
} else if(is.character(response)){
if(!response %in% self$params$response){
stop("ERROR: response not found in $params$response. Please check...")
}
response.idx = which(response %in% self$params$response)
} else if(is.numeric(response)){
if(!response %in% 1:length(self$params$response)){
stop("ERROR: response number not within the indices of 1:", length(self$params$response), ". Please check...")
}
response.idx = response
} else {
stop("ERROR: response provided not recognized... Must be an integer (i.e. 1) or character (i.e. column name).")
}
}
if(!self$options$quiet){
cat("Setting current response index to ", response.idx, "\n",
"response: ", self$params$response[response.idx], "\n\n",
sep = "")
}
self$options$current.response.idx = response.idx
return(invisible(self))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.