SPEAR: SPEAR - Sparse Supervised Bayesian Factor Model for Multi-omic Analyses

Documented in add.data assess check.fit check.fold.ids create.params encode.ordinal estimate.num.factors generate.fold.ids generate.spear.ids get.analyte.scores get.concatenated.X get.cv.loss get.data get.factor.contributions get.factor.scores get.predictions get.signatures get.variance.explained get.widx.from.method get.Y one.hot.encode.multinomial plot.cv.loss plot.factor.contributions plot.factor.scores plot.variance.explained print.out remove.data set.color.scheme set.response set.weights update.dimnames

#' Add a dataset to a SPEAR object.
#' @param mae MultiAssayExperiment with multi-omics data. Columns of ExperimentList must match (correspond to sample identifiers) and link to rows of colData (with response/responses).
#' @param response Column name(s) of colData from MultiAssayExperiment that will be the response(s). More than one response should be inputted as a vector.
#' @param name Name for the stored dataset in the SPEAR object. Access via `SPEAR$get.data(__name__)`
#' @examples
#' SPEAR <- SPEAR$new(...)
#' 
#' SPEAR$add.data(MultiAssayExperiment(...), response = "death", name = "dataset1")
#' 
#'@export
add.data = function(data, response = NULL, name = NULL){
  
  # Set name:
  if(is.null(name)){
    d.tmp = 1
    name = paste0("dataset", d.tmp)
    while(name %in% names(self$data)){
      d.tmp = d.tmp + 1
      name = paste0("dataset", d.tmp)
    }
  } else if(name %in% names(self$data)){
    stop("ERROR: name provided already exists in names(SPEAR$data). Please remove this dataset manually via SPEAR$remove.data(...) and then try updating again.\n")
  }
  
  if(name != "train"){
    # X...
    # Check that length is the same
    if(length(data) != length(self$data$train)){
      stop("ERROR: Number of datasets provided (", length(data), ") is not equal to the number of datasets used to train the SPEAR object (", length(self$data$train), "). SPEAR requires new samples to have the same assays used for training.")
    }
    # Check that number of analytes are the same
    if(any(sapply(1:length(data), function(d){return(nrow(data[[d]]) != nrow(self$data$train[[d]]))}))){
      stop("ERROR: One or more datasets provided has a different number of analytes than those in SPEAR$data$train. SPEAR requires all new samples have the same analytes used for training.")
    }
  }
  # Check that columns all match columns
  if(name != "train"){
    if(any(sapply(1:length(data), function(d){return(any(rownames(data[[d]]) != rownames(SPEAR$data$train[[d]])))}))){
      stop("ERROR: One or more analyte (row) names in mae experiments provided were not found in the data used to train this SPEAR object (SPEAR$data$train). All analytes must match to add a dataset.")
    }
  }
  # response...
  if(!is.null(response)){
    c.data <- MultiAssayExperiment::colData(data)
    if(!all(response %in% colnames(c.data))){
      stop("ERROR: `response` provided was not found in the MultiAssayExperiment::colData(data). Needs to be a matching column(s) from the data.frame...")
    }
  } else if(name == "train"){
    stop("ERROR: response is a required parameter for the train (initial) dataset. Please choose a response from colnames(MultiAssayExperiment::colData(data))")
  }
  
    # name and store:
  self$data[[name]] <- data
  
  # FUTURE: use ids in case of names not matching...
  # if train: generate SPEARids
  #if(name == "train"){
  #  private$generate.spear.ids()
  #}
  
  if(!self$options$quiet & name != "train"){cat("Saved dataset to $data$", name, "\n\n", sep = "")}
  return(invisible(self))
}

#' Remove a dataset from a SPEAR object.
#' @param name Name for the stored MultiAssayExperiment dataset in the SPEAR object.
#' @examples
#' SPEAR <- SPEAR$new(...)
#' 
#' # add a dataset under SPEAR$data$dataset1:
#' SPEAR$add.data(MultiAssayExperiment(...), response = "death", name = "dataset1")
#' 
#' # remove the dataset stored under SPEAR$data$dataset1:
#' SPEAR$remove.data("dataset1")
#' 
#'@export
remove.data = function(name = NULL){
  if(is.null(name)){
    stop("ERROR: no name provided...")
  } else if(!name %in% names(self$data)){
    stop("ERROR: name `", name, "` not found amongst possible data names. Use `names(SPEAR$data)` to see datasets.")
  }
  
  # Remove dataset:
  self$data[[name]] <- NULL
  
  if(!self$options$quiet & name != "train"){cat("Removed dataset from $data$", name, "\n\n", sep = "")}
  return(invisible(self))
}

#' Extract a dataset from a SPEAR object.
#' @param name Name for the stored MultiAssayExperiment dataset in the SPEAR object.
#' @examples
#' SPEAR <- SPEAR$new(...)
#' 
#' # add a dataset under SPEAR$data$dataset1:
#' SPEAR$add.data(MultiAssayExperiment(...), response = "death", name = "dataset1")
#' 
#' # extract the dataset stored under SPEAR$data$dataset1:
#' data <- SPEAR$get.data("dataset1")
#' 
#'@export
get.data = function(name = NULL){
  if(is.null(name)){
    stop("ERROR: no name provided...")
  } else if(!name %in% names(self$data)){
    stop("ERROR: name `", name, "` not found amongst possible data names. Use `names(SPEAR$data)` to see datasets.")
  }
  return(self$data[[name]])
}

#' Set the colors for a SPEAR object, to be used in plotting functions
#' @param colors.x Which colors to use for assays (X)? Must be a vector of length SPEAR$data$train If not named, will assume order matches.
#' @param colors.y Which colors to use for the response(s) (Y)? Must be a vector of length ncol(SPEARobj$data$train$Y). If not named, will assume order matches.
#' @examples
#' SPEARobj <- make.SPEARobject(...)
#' 
#' # Option 1: set to defaults:
#' SPEARobj$set.color.scheme()
#' 
#' # Option 2: provide colors:
#' colors.x <- c("red", "blue", "green", ...) # one for each dataset in SPEARobj$data$train$Xlist
#' colors.y <- c("yellow", ...) # one for each column in SPEARobj$data$train$Y
#' SPEARobj$set.color.scheme(colors.x = colors.x, colors.y = colors.y)
#' 
#' # Option 3: provide colors as named list:
#' colors.x <- c("red", "blue", "green", ...) # one for each dataset in SPEARobj$data$train$Xlist
#' names(colors.x) <- names(SPEARobj$data$train$Xlist)
#' colors.y <- c("yellow", ...) # one for each column in SPEARobj$data$train$Y
#' names(colors.y) <- colnames(SPEARobj$data$train$Y)
#' SPEARobj$set.color.scheme(colors.x = colors.x, colors.y = colors.y)
#' 
set.color.scheme = function(colors.x = NULL, colors.y = NULL){
  
  colorlist <- list()
  
  # X:
  num.assays <- length(self$data$train)
  colorvec.assays <- RColorBrewer::brewer.pal(n = 9, name = "Set1")[1:num.assays]
  names(colorvec.assays) <- names(self$data$train)
  
  # Y:
  if(self$params$family == "gaussian"){
    num.responses <- length(self$params$response)
    if(length(self$params$response) < 3){
      colorvec.responses <- RColorBrewer::brewer.pal(n = 3, name = "Reds")[1:num.responses]
    } else {
      colorvec.responses <- RColorBrewer::brewer.pal(n = num.responses, name = "Reds")
    }
    names(colorvec.responses) <- self$params$response
  } else {
    Y <- private$get.Y()
    num.responses <- length(self$params$response)
    colorvec.responses <- list()
    palettes <- c("Reds", "Blues", "Purples", "Greens") # Up to 4 currently...
    for(r in 1:length(self$params$response)){
      num.responses.tmp <- length(levels(MultiAssayExperiment::colData(self$data$train)[[self$params$response[r]]]))
      if(num.responses.tmp < 3){
        colorvec.responses[[self$params$response[r]]] <- RColorBrewer::brewer.pal(n = 3, name = palettes[r])[c(1,num.responses.tmp)]
      } else {
        colorvec.responses[[self$params$response[r]]] <- RColorBrewer::brewer.pal(n = num.responses.tmp, name = palettes[r])
      }
      names(colorvec.responses[[self$params$response[r]]]) <- levels(MultiAssayExperiment::colData(self$data$train)[[self$params$response[r]]])
    }
  }
  # Save:
  colorlist$X <- colorvec.assays
  colorlist$Y <- colorvec.responses
  self$options$color.scheme <- colorlist
  return(invisible(self))
}


#' Print out a variety of summary information about a SPEARobject
#' @param type Which type of summary to print? Can be "data". Defaults to NULL.
#' @param remove.formatting Remove text color/bold font face. Defaults to FALSE.
#' @param quiet Do not print anything. Defaults to FALSE. 
#'@export
print.out = function(type = NULL, remove.formatting = FALSE, quiet = FALSE){
  if(quiet){
    return(NULL)
  }

  if(is.null(type)){
    cat("")
  }
  # ---------------------------------
  if(type == "data"){
    cat(paste0("Detected ", length(self$data$train), " datasets:\n"))
    for(i in 1:length(self$data$train)){
      cat(names(self$data$train)[i], "\tSamples: ", ncol(self$data$train[[i]]), "\tAnalytes: ", nrow(self$data$train[[i]]), "\n")
    }
    cat(paste0("Detected ", length(self$params$response), " response ", ifelse(length(self$params$response) == 1, "variable", "variables"), ":\n"))
    for(i in 1:length(self$params$response)){
      cat(self$params$response[i], "\tSamples:  ", nrow(MultiAssayExperiment::colData(self$data$train)), "\tType: ", self$params$family, "\n")
    }
  }
  # ---------------------------------
  if(type == "help"){
    cat('For assistance, browse the SPEAR vignettes with any of these commands...\n> SPEAR$SPEAR.help()\n> SPEAR::SPEAR.help()\n> browseVignettes("SPEAR")\n\n')
  }
  
}


#' One-hot encode a matrix or data.frame (Y)
#' 
#'@export
one.hot.encode.multinomial <- function(){
  # get levels for factor:
  vals <- levels(MultiAssayExperiment::colData(self$data$train)[[self$params$response]])

  Y.multi <- matrix(0, nrow = nrow(MultiAssayExperiment::colData(self$data$train)), ncol = length(vals))
  for(i in 1:nrow(Y.multi)){
    Y.multi[i, which(vals == MultiAssayExperiment::colData(self$data$train)[i,self$params$response])] <- 1
  }
  colnames(Y.multi) <- vals
  rownames(Y.multi) <- rownames(MultiAssayExperiment::colData(self$data$train))
  # return
  return(Y.multi)
}


#' Encode a matrix or data.frame (Y) into ordinal (0, 1, 2, ...)
#' 
#'@export
encode.ordinal <- function(){
  # get levels for factor:
  vals <- levels(MultiAssayExperiment::colData(self$data$train)[,self$params$response])
  encoded <- 0:(length(vals)-1)
  names(encoded) <- vals
  
  Y.ord <- matrix(0, nrow = nrow(MultiAssayExperiment::colData(self$data$train)), ncol = 1)
  for(i in 1:nrow(Y.ord)){
    Y.ord[i,1] <- encoded[MultiAssayExperiment::colData(self$data$train)[i,self$params$response]]
  }
  
  colnames(Y.ord) <- self$params$response
  rownames(Y.ord) <- rownames(MultiAssayExperiment::colData(self$data$train))
  # return
  return(Y.ord)
}


#' Update the dimension names for all SPEARobject matrices. Used internally.
#'@export
update.dimnames = function(){
  Y <- private$get.Y()
  dimnames(self$fit$regression.coefs) = list(colnames(self$data$train$X), paste0("Factor", 1:self$params$num.factors), paste0("w.idx.", 1:nrow(self$params$weights)))
  dimnames(self$fit$projection.coefs.x) = list(colnames(self$data$train$X), paste0("Factor", 1:self$params$num.factors), paste0("w.idx.", 1:nrow(self$params$weights))) 
  dimnames(self$fit$projection.coefs.y) = list(paste0("Factor", 1:self$params$num.factors), colnames(Y), paste0("w.idx.", 1:nrow(self$params$weights))) 
  dimnames(self$fit$nonzero.probs) = list(colnames(self$data$train$X), paste0("Factor", 1:self$params$num.factors), paste0("w.idx.", 1:nrow(self$params$weights))) 
  dimnames(self$fit$projection.probs) = list(colnames(self$data$train$X), paste0("Factor", 1:self$params$num.factors), paste0("w.idx.", 1:nrow(self$params$weights))) 
  dimnames(self$fit$marginal.probs) = list(colnames(self$data$train$X), paste0("Factor", 1:self$params$num.factors), paste0("w.idx.", 1:nrow(self$params$weights))) 
  dimnames(self$fit$joint.probs) = list(colnames(self$data$train$X), paste0("Factor", 1:self$params$num.factors), paste0("w.idx.", 1:nrow(self$params$weights)))
  if(self$fit$type == "cv"){
    dimnames(self$fit$regression.coefs.cv) = list(colnames(self$data$train$X), paste0("Factor", 1:self$params$num.factors), paste0("Fold", 1:self$params$num.folds), paste0("w.idx.", 1:nrow(self$params$weights)))
    dimnames(self$fit$projection.coefs.cv) = list(paste0("Factor", 1:self$params$num.factors), colnames(Y), paste0("Fold", 1:self$params$num.folds), paste0("w.idx.", 1:nrow(self$params$weights)))
  }
}


#' Create the parameters for SPEAR. Private method.
#'@export
create.params = function(px, py, pz, n, num_factors, n_weights, npath){
  params = list()
  params$post_mu = array(0, dim = c(px,num_factors,n_weights))
  params$post_sigma2 = array(0, dim = c(px,num_factors,n_weights))
  params$post_pi = array(0, dim = c(px,num_factors,n_weights))
  params$post_tmuX = array(0, dim = c(px,num_factors,n_weights))
  params$post_tsigma2X = array(0, dim = c(px,num_factors,n_weights))
  params$post_tpiX = array(0, dim = c(px,num_factors,n_weights))
  params$post_tpiX_marginal = array(0, dim = c(px,num_factors,n_weights))
  params$post_tmuY = array(0, dim = c(py,num_factors,n_weights))
  params$post_tsigma2Y = array(0, dim = c(py,num_factors,n_weights))
  params$post_tpiY = array(0, dim = c(py,num_factors,n_weights))
  params$tauY = array(0, dim = c(py,num_factors,n_weights))
  params$tauZ = array(0, dim = c(pz,num_factors,n_weights))
  params$log_pi = array(0, dim = c(pz,num_factors,n_weights))
  params$log_minus_pi= array(0, dim = c(pz,num_factors,n_weights))
  params$nuYmat = array(0, dim = c(n, py,n_weights))
  params$nuXmat = array(0, dim = c(n, px,n_weights))
  params$meanFactors = array(0, dim = c(n,num_factors,n_weights))
  params$post_a0  = array(0, dim = c(npath,num_factors,n_weights))
  params$post_a1 = array(0, dim = c(npath,num_factors,n_weights))
  params$post_b0 = array(0, dim = c(npath,num_factors,n_weights))
  params$post_b1 = array(0, dim = c(npath,num_factors,n_weights))
  params$post_a2x  = array(0, dim = c(px,n_weights))
  params$post_b2x = array(0, dim = c(px,n_weights))
  params$post_a2y  = array(0, dim = c(py,n_weights))
  params$post_b2y = array(0, dim = c(py,n_weights))
  params$ordering = array(0, dim = c(num_factors,n_weights))
  return(params)
}



#' Check if a fit object is generated and if a weight has been set...
#'@export
check.fit = function(){
  # Check if a fit object exists:
  if(is.null(self$fit)){
    stop("ERROR: $fit is NULL. Run $run.cv.spear(...) or $run.spear() to generate the $fit object.")
  }
  if(is.null(self$options$current.weight.idx)){
    stop("ERROR: $options$current.weight.idx is NULL. Set this parameter manually [1 through nrow($params$weights)] or use $set.weights()")
  }
}

#' Confirm fold.ids will work in cv.spear 
#' @param fold.ids Fold ids
#'@export
check.fold.ids = function(fold.ids){
  Y <- private$get.Y()
  return(any(sapply(unique(fold.ids), function(k){
    for(j in 1:ncol(Y)){
      subset = Y[which(fold.ids != k),j]
      if(self$params$family != "gaussian" & any(table(subset) < 2)){
        return(TRUE)
      }
    }
    return(FALSE)
  })))
}



#' Get factor scores from a SPEAR object for a particular dataset `data`.
#' @param data Which dataset to use? Can be any dataset listed under `$data$____`. Defaults to `"train"`.
#' @param cv If `data = "train"`, get factor scores generated from cross validation? If `$run.spear` was used or if `data != "train"` this parameter is ignored. Defaults to `FALSE`.
#'@export
get.factor.scores = function(data = "train", cv = FALSE){
  private$check.fit()
  X <- private$get.concatenated.X(data = data)
  if(data == "train" & self$fit$type == "cv" & cv){
    factor.scores.cv <- matrix(NA, nrow = nrow(X), ncol = self$params$num.factors)
    for(i in 1:length(self$params$fold.ids)){
      factor.scores.cv[i,] <- X[i,] %*% self$fit$regression.coefs.cv[,, self$params$fold.ids[i], self$options$current.weight.idx]
    }
    rownames(factor.scores.cv) <- rownames(X)
    colnames(factor.scores.cv) <- paste0("Factor", 1:ncol(factor.scores.cv))
    return(factor.scores.cv)
  } else if(cv == TRUE){
    warning("Warning: Can't return CV factor scores (data != 'train'). Returning factor scores from all data.")
  }
  factor.scores <- X %*% self$fit$regression.coefs[,,self$options$current.weight.idx]
  rownames(factor.scores) <- rownames(X)
  colnames(factor.scores) <- paste0("Factor", 1:ncol(factor.scores))
  return(factor.scores)
}


#' Get features from a SPEAR object.
#' @param rank How to rank features? Defaults to `"probability"` (joint.probability) followed by projection.coefficient. Can also be "`coefficient`" (only projection.coefficient magnitude).
#' @param factors Which factors to return features for? Accepts integers (i.e. `factors = c(1,2,4)` or `factors = 1`) Defaults to `NULL` (all).
#' @param assays Which assays? Check names via `names(SPEAR$data$train)`. Defaults to `NULL` (all)
#' @param coefficient.cutoff What projection.coefficient value to use as the cutoff? Defaults to 0
#' @param probability.cutoff What joint.probability value to use as the cutoff? Defaults to .95
#' @param correlation Return columns with factor score correlations? Accepts `"spearman"`, `"pearson"`, `"kendall"`, or "`none`" (default)
#' @param simplify Simplify data.frame output to only include Factor, Analyte, Assay, joint.probability and projection.coefficient? Defaults to `TRUE`
#'@export
get.analyte.scores = function(rank = "probability", factors = NULL, assays = NULL, coefficient.cutoff = 0, probability.cutoff = .95, correlation = "none", simplify = TRUE){
  private$check.fit()
  w.idx <- self$options$current.weight.idx
  if(is.null(factors)){
    factors <- 1:self$params$num.factors
  }
  factors.string <- paste0("Factor", factors)
  if(is.null(assays)){
    assays <- names(self$data$train)
  } else {
    if(any(!assays %in% names(self$data$train))){
      stop("ERROR: assays provided not recognized. Need to be names found in names(SPEAR$data$train)")
    }
  }
  X <- private$get.concatenated.X()
  features <- list()
  w.joint.probabilities = self$fit$joint.probs[,,w.idx]
  w.marginal.probabilities = self$fit$marginal.probs[,,w.idx]
  w.projection.probabilities = self$fit$projection.probs[,,w.idx]
  w.nonzero.probabilities = self$fit$nonzero.probs[,,w.idx]
  w.regression.coefficients = self$fit$regression.coefs[,,w.idx]
  w.projection.coefficients = self$fit$projection.coefs.x[,,w.idx]
  colnames(w.joint.probabilities) <- paste0("Factor", 1:ncol(w.joint.probabilities))
  rownames(w.joint.probabilities) <- colnames(X)
  colnames(w.marginal.probabilities) <- paste0("Factor", 1:ncol(w.marginal.probabilities))
  rownames(w.marginal.probabilities) <- colnames(X)
  colnames(w.projection.probabilities) <- paste0("Factor", 1:ncol(w.projection.probabilities))
  rownames(w.projection.probabilities) <- colnames(X)
  colnames(w.nonzero.probabilities) <- paste0("Factor", 1:ncol(w.nonzero.probabilities))
  rownames(w.nonzero.probabilities) <- colnames(X)
  colnames(w.regression.coefficients) <- paste0("Factor", 1:ncol(w.joint.probabilities))
  rownames(w.regression.coefficients) <- colnames(X)
  colnames(w.projection.coefficients) <- paste0("Factor", 1:ncol(w.joint.probabilities))
  rownames(w.projection.coefficients) <- colnames(X)

  # by factor
  num.omics <- length(self$data$train)
  factor.features <- list()
  for(f in 1:self$params$num.factors){
    temp <- list()
    factor <- paste0("Factor", f)
    s <- 1
    for(o in 1:num.omics){
      omic.features <- list()
      w.joint.probabilities.current <- w.joint.probabilities[s:(s + nrow(self$data$train[[o]]) - 1),factor]
      w.marginal.probabilities.current <- w.marginal.probabilities[s:(s + nrow(self$data$train[[o]]) - 1),factor]
      w.projection.probabilities.current <- w.projection.probabilities[s:(s + nrow(self$data$train[[o]]) - 1),factor]
      w.nonzero.probabilities.current <- w.nonzero.probabilities[s:(s + nrow(self$data$train[[o]]) - 1),factor]
      w.regression.coefficients.current <- w.regression.coefficients[s:(s + nrow(self$data$train[[o]]) - 1),factor]
      w.projection.coefficients.current <- w.projection.coefficients[s:(s + nrow(self$data$train[[o]]) - 1),factor]
      s <- s + nrow(self$data$train[[o]])
      # Round loading probabilities to 7 decimals (to rank 1's):
      #w.joint.probabilities.current <- round(w.joint.probabilities.current, 7)
      #w.marginal.probabilities.current <- round(w.marginal.probabilities.current, 7)
      #w.projection.probabilities.current <- round(w.projection.probabilities.current, 7)
      feat.order <- order(w.joint.probabilities.current, abs(w.projection.coefficients.current), decreasing = TRUE)
      w.regression.coefficients.current <- w.regression.coefficients.current[feat.order]
      w.projection.coefficients.current <- w.projection.coefficients.current[feat.order]
      w.joint.probabilities.current <- w.joint.probabilities.current[feat.order]
      w.marginal.probabilities.current <- w.marginal.probabilities.current[feat.order]
      w.projection.probabilities.current <- w.projection.probabilities.current[feat.order]
      w.nonzero.probabilities.current <- w.nonzero.probabilities.current[feat.order]
      temp[[names(self$data$train)[o]]]$features <- names(w.joint.probabilities.current)
      temp[[names(self$data$train)[o]]]$joint.probabilities <- w.joint.probabilities.current
      temp[[names(self$data$train)[o]]]$marginal.probabilities <- w.marginal.probabilities.current
      temp[[names(self$data$train)[o]]]$projection.probabilities <- w.projection.probabilities.current
      temp[[names(self$data$train)[o]]]$nonzero.probabilities <- w.nonzero.probabilities.current
      temp[[names(self$data$train)[o]]]$projection.coefficients <- w.projection.coefficients.current
      temp[[names(self$data$train)[o]]]$regression.coefficients <- w.regression.coefficients.current
    }
    features[[factor]] <- temp
  }
  fs <- self$get.factor.scores(cv = FALSE)
  fs.cv <- self$get.factor.scores(cv = TRUE)
  results.list <- list()
  for(f in factors.string){
    for(o in assays){
      feature.vec <- features[[f]][[o]]
      passed.coeff.cutoff <- abs(feature.vec$projection.coefficients) >= coefficient.cutoff
      passed.prob.cutoff <- feature.vec$joint.probabilities >= probability.cutoff
      passed.total.cutoff <- passed.coeff.cutoff & passed.prob.cutoff
      if(sum(passed.total.cutoff) > 0){
        res <- as.data.frame(feature.vec)[passed.coeff.cutoff & passed.prob.cutoff,]
        res$omic <- o
        res$factor <- f
        res$cor.in.sample <- NA
        res$cor.cv <- NA
        res$pval.in.sample <- NA
        res$pval.cv <- NA
        # Correlations:
        if(correlation != "none"){
          for(r in 1:nrow(res)){
            vals <- self$data$train$Xlist[[o]][,which(colnames(self$data$train$Xlist[[o]]) == res$features[r])]
            res$cor.in.sample[r] <- cor(vals, fs[,f], method = correlation)
            res$cor.cv[r] <- cor(vals, fs.cv[,f], method = correlation)
            res$pval.in.sample[r] <- suppressWarnings(cor.test(vals, fs[,f], method = correlation)$p.value)
            res$pval.cv[r] <- suppressWarnings(cor.test(vals, fs.cv[,f], method = correlation)$p.value)
          }
          colnames(res) <- c("Analyte", "joint.probability", "marginal.probability", "projection.probability", "nonzero.probability", "projection.coefficient", "regression.coefficient", "Assay", "Factor", "factor.correlation", "factor.cv.correlation", "factor.cor.pvalue", "factor.cv.cor.pvalue")
          rownames(res) <- NULL
          res <- dplyr::select(res, 
                               Factor,
                               Analyte, 
                               Assay, 
                               regression.coefficient,
                               projection.coefficient, 
                               joint.probability,
                               marginal.probability,
                               projection.probability,
                               nonzero.probability,
                               factor.correlation, 
                               factor.cor.pvalue, 
                               factor.cv.correlation, 
                               factor.cv.cor.pvalue)
        } else {
          colnames(res) <- c("Analyte", "joint.probability", "marginal.probability", "projection.probability", "nonzero.probability", "projection.coefficient", "regression.coefficient", "Assay", "Factor")
          rownames(res) <- NULL
          res <- dplyr::select(res, 
                               Factor,
                               Analyte, 
                               Assay, 
                               regression.coefficient,
                               projection.coefficient, 
                               joint.probability,
                               marginal.probability,
                               projection.probability,
                               nonzero.probability)
        }
        results.list[[length(results.list) + 1]] <- res
      }
    }
  }
  if(length(results.list) == 0){
    warning("*** Warning: No features found within cutoffs for chosen Factors/assays Returning data.frame with 0 rows. Try broadening the criteria.")
    total.results <- data.frame( Factor = character(0),
                                 Analyte = character(0), 
                                 Assay = character(0), 
                                 regression.coefficient = numeric(0),
                                 projection.coefficient = numeric(0), 
                                 joint.probability = numeric(0),
                                 marginal.probability = numeric(0),
                                 projection.probability = numeric(0),
                                 nonzero.probability = numeric(0))
  } else {
    total.results <- do.call("rbind", results.list)
    total.results <- dplyr::arrange(total.results, Factor, -joint.probability, -abs(projection.coefficient))
  }
  
  # Return simplified matrix:
  if(simplify){
    total.results <- dplyr::select(total.results, Factor, Analyte, Assay, joint.probability, projection.coefficient)
  }
  return(total.results)
}




#' Get features from a SPEARobject.
#' @param rank How to rank features? Defaults to `"probability"` (joint.probability) followed by projection.coefficient. Can also be "`coefficient`" (only projection.coefficient magnitude).
#' @param factors Which factors to return signatures for? Accepts integers (i.e. `factors = c(1,2,4)` or `factors = 1`) Defaults to `NULL` (all).
#' @param assays Which assays? Check names via `names(SPEAR$data$train)`. Defaults to `NULL` (all)
#' @param coefficient.cutoff What projection.coefficient value to use as the cutoff? Defaults to 0
#' @param probability.cutoff What joint.probability value to use as the cutoff? Defaults to .95
#' @param correlation Return columns with factor score correlations? Accepts `"spearman"`, `"pearson"`, `"kendall"`, or "`none`" (default)
#'@export
get.signatures = function(rank = "probability", factors = NULL, assays = NULL, coefficient.cutoff = 0, probability.cutoff = .95, correlation = "none"){
  total.results <- self$get.analyte.scores(rank = rank, factors = factors, assays = assays, coefficient.cutoff = coefficient.cutoff, probability.cutoff = probability.cutoff, correlation = correlation)
  if(nrow(total.results) == 0){
    stop("ERROR: No analytes found for the criteria selected. Try to loosen the cutoffs or select a different factor.")
  }
  if(is.null(factors)){
    factors <- 1:self$params$num.factors
  }
  factor.vec <- paste0("Factor", factors)
  
  assay.vec <- names(self$data$train)
  
  # Return list of signatures per assay:
  sigs <- list()
  for(f in 1:length(factor.vec)){
    tmp <- list()
    for(a in 1:length(assay.vec)){
      filt <- dplyr::filter(total.results, Factor == factor.vec[f], Assay == assay.vec[a])
      if(nrow(filt) > 0){
        tmp[[assay.vec[a]]] <- filt$Analyte
      } else {
        tmp[[assay.vec[a]]] <- character(0)
      }
    }
    sigs[[factor.vec[f]]] <- tmp
  }
  
  return(sigs)
}



#' Randomly generate identifiers for analytes (to prevent misidentification)
#'@export
generate.spear.ids = function(){
  Xlist <- as.list(MultiAssayExperiment::experiments(self$data$train))
  assays = c()
  assay.ids = c()
  analytes = c()
  spear.ids <- c()
  for(a in 1:length(Xlist)){
    tmp <- Xlist[[a]]
    assays <- c(assays, rep(names(Xlist)[a], nrow(tmp)))
    assay.ids <- c(assay.ids, rep(a, nrow(tmp)))
    analytes <- c(analytes, rownames(tmp))
    spear.ids <- c(spear.ids, paste0("sid-", sprintf("%02d", a), "-", sprintf("%06d", 1:nrow(tmp))))
  }

  df <- data.frame(
    Assay = assays,
    Assay.id = assay.ids,
    Analyte = analytes,
    SPEAR.id = spear.ids
  )
  
  self$data$spear.ids <- df
  
  return(invisible(self))
}


#' Randomly generate identifiers for analytes (to prevent misidentification)
#' @param data Name of MAE stored under $data (defaults to "train")
#'@export
get.concatenated.X = function(data = "train"){
  return(X <- t(do.call("rbind", as.list(MultiAssayExperiment::experiments(self$data[[data]])))))
}

#' Randomly generate identifiers for analytes (to prevent misidentification)
#' @param data Name of MAE stored under $data (defaults to "train")
#'@export
get.Y = function(data = "train"){
  if(self$params$family == "multinomial"){
    Y <- as.matrix(private$one.hot.encode.multinomial())
  } else if(self$params$family == "ordinal" | self$params$family == "binomial"){
    Y <- as.matrix(private$encode.ordinal())
  } else { # gaussian
    Y <- as.matrix(MultiAssayExperiment::colData(self$data[[data]])[,which(colnames(MultiAssayExperiment::colData(self$data[[data]])) %in% self$params$response), drop = FALSE])
  }
  return(Y)
}



#' Randomly generate fold.ids to be used for reproducibility in `$run.cv.spear(...)`. See `method` for different ways to generate fold.ids (for cross-validation).
#' @param method How to generate the fold ids? `method = "balanced"` (default) will try to achieve maximum balance amongst classes in `Y` (when `family = "ordinal"` or `family = "multinomial"`). `method = "random"` will randomly assign a number 1-`num.folds`.
#'@export
generate.fold.ids = function(method = "balanced"){
  
  if(self$params$family == "gaussian"){
    method = "random"
  }
  
  num.folds <- self$params$num.folds
  
  Y <- private$get.Y()
  
  if(method == "balanced"){
    if(self$params$family == "multinomial"){
      Y <- apply(Y, 1, which.max)
    } else {
      if(ncol(Y) > 1){
        warning("More than one column in Y provided. Balancing fold.ids with respect to $options$current.response.idx (", self$options$current.response.idx, " | ", self$params$response[self$options$current.response.idx], ")")
      }
      Y <- Y[,self$options$current.response.idx]
    }
    df <- as.data.frame(Y)
    nfolds = self$params$num.folds
    N <- nrow(Y)
    df$foldids = NA
    for(i in sort(unique(Y))){
      N.temp <- sum(Y == i)
      loc <- which(Y == i)
      vals <- sample(x = rep(1:nfolds, ceiling(N.temp/nfolds)), size = N.temp, replace = F)
      df$foldids[loc] <- vals
    }
    output <- df$foldids
    names(output) <- rownames(Y)
    return(output)
    
  } else if(method == "random"){
    
    nfolds = self$params$num.folds
    N <- nrow(Y)
    fold.ids <- sample(x = rep(1:nfolds, ceiling(N/nfolds)), size = N, replace = F)
    names(fold.ids) <- rownames(Y)
    return(fold.ids)
    
  } else {
    stop("ERROR: method not recognized. Can be 'balanced' or 'random'")
  }
}








#' Get predictions from a SPEARobject and a dataset `data`.
#' @param data Which dataset to use? Can be any dataset listed under `$data$____`. Defaults to `"train"`.
#' @param cv If `data = "train"`, get factor scores generated from `$run.cv.spear`? If `$run.spear` was used or if `data != "train"` this parameter is ignored. Defaults to `TRUE`.
#'@export
get.predictions = function(data = "train", cv = TRUE){
  X <- private$get.concatenated.X(data = data)
  Y <- private$get.Y(data = data)
  if(data == "train" & self$fit$type == "cv" & cv){
    # Get CV Predictions:
    factor.scores <- self$get.factor.scores(data = data, cv = TRUE)
    preds = matrix(NA, nrow = nrow(X), ncol = ncol(Y))
    for(i in 1:nrow(preds)){
      preds[i,] <- factor.scores[i,] %*% self$fit$projection.coefs.y.cv.scaled[,,self$params$fold.ids[i],self$options$current.weight.idx]
    }
    rownames(preds) <- rownames(X)
    colnames(preds) <- colnames(Y)
  } else {
    # Get in.sample Predictions:
    factor.scores <- self$get.factor.scores(data = data, cv = FALSE)
    preds = matrix(NA, nrow = nrow(X), ncol = ncol(Y))
    for(i in 1:nrow(preds)){
      preds[i,] <- factor.scores[i,] %*% self$fit$projection.coefs.y.scaled[,,self$options$current.weight.idx]
    }
    rownames(preds) <- rownames(X)
    colnames(preds) <- colnames(Y)
  }
  
  # Return signal if gaussian, otherwise use signal with cutoffs
  if(self$params$family == "gaussian"){
    return(preds)
  } else if(self$params$family == "multinomial"){
    signal <- preds
    intercept = sapply(self$fit$intercepts.scaled, function(z) z[self$options$current.weight.idx])
    Pmat = t(apply(signal, 1, function(z) exp(z+intercept)/sum(exp(z+intercept))))
    predictions <- matrix(apply(Pmat, 1, which.max) - 1, ncol=1)
    predictions.encoded = matrix(0, ncol = ncol(Pmat), nrow = nrow(Pmat))
    for(i in 1:nrow(predictions.encoded)){
      predictions.encoded[i,predictions[i]] <- 1
    }
    rownames(predictions) <- rownames(preds)
    colnames(predictions) <- "pred.class"
    rownames(predictions.encoded) <- rownames(preds)
    colnames(predictions.encoded) <- colnames(Y)
    results.combined <- list(class.predictions = predictions,
                             encoded.predictions = predictions.encoded,
                             class.probabilities = Pmat,
                             signal = signal)
    return(results.combined)
  }
  
  # If family == "binomial" or "ordinal"
  results = list()
  # Go column by column (once for each response variable...)
  for(j in 1:ncol(preds)){
    resp.name <- colnames(preds)[j]
    
    # Get signal (predictions) for current response...
    signal <- matrix(preds[,j], ncol = 1)
    rownames(signal) <- rownames(preds)
    colnames(signal) <- c(resp.name)
    
    # Get intercepts (scaled)
    intercept = self$fit$intercepts.scaled[[j]][self$options$current.weight.idx,]
    
    # Set up Probability Matrices:
    Pmat0 = matrix(0, ncol = length(intercept), nrow = length(signal))
    Pmat = matrix(0, ncol = length(intercept) + 1, nrow = length(signal))
    
    # Use signal and intercept to calculate probabilities:
    for(k in 1:ncol(Pmat0)){
      Pmat0[,k] = 1.0/(1+exp(-signal - intercept[k]))
    }
    for(k in 1:ncol(Pmat)){
      if(k == 1){
        Pmat[,1] = 1-Pmat0[,1]
      }else if(k == ncol(Pmat)){
        Pmat[,k] = Pmat0[,k-1]
      }else{
        Pmat[,k] = Pmat0[,k-1] - Pmat0[,k]
      }
    }
    
    # Format Pmat to get class.predictions
    class.predictions <- matrix(apply(Pmat, 1, which.max) - 1, ncol=1)
    rownames(class.predictions) <- rownames(preds)
    colnames(class.predictions) <- c(resp.name)
    rownames(Pmat) <- rownames(preds)
    colnames(Pmat) <- paste0("Class", 0:(ncol(Pmat)-1))
    rownames(Pmat0) <- rownames(preds)
    
    # Return class.predictions, Pmat, and signal
    results[[resp.name]] <- list(class.predictions = class.predictions,
                                 class.separations = Pmat0,
                                 class.probabilities = Pmat,
                                 signal = signal)
  }
  return(results)
}






#' Assess the predictions from the SPEAR model using a variety of metrics. Utilizes the 'yardstick' package under the 'tidymodels' umbrella.
#' @param data Which dataset to use? Can be any dataset listed under `$data$____`. Defaults to `"train"`.
#' @param cv If `data = "train"`, get factor scores generated from `$run.cv.spear`? If `$run.spear` was used or if `data != "train"` this parameter is ignored. Defaults to `TRUE`.
#'@export
assess <- function(data = "train", cv = TRUE){
  
  family <- self$params$family
  Y <- private$get.Y(data = data)
  if(family == "gaussian"){
    # Make data.frame of true x estimates:
    preds <- self$get.predictions(data = data, cv = cv)
    for(p in 1:ncol(Y)){
      resp <- colnames(Y)[p]
      vals <- unname(unlist(Y))
      estimates <- preds[,p]
      tidy.pred.df <- data.frame(
        truth = vals,
        estimate = estimates,
        Response = resp
      )
      tidy.pred.df <- dplyr::group_by(tidy.pred.df, Response)
    }
    
  } else if(family == "multinomial"){
    preds <- self$get.predictions(data = data, cv = cv)
    true <- apply(Y, 1, which.max)-1
    estimates <- preds$class.predictions
    # make factor:
    true <- factor(true)
    estimates <- factor(estimates, levels = levels(true))
    tidy.pred.df <- data.frame(
      truth = true,
      estimate = estimates
    )
    tidy.prob.df <- preds$class.probabilities
    tidy.pred.df <- cbind(tidy.pred.df, tidy.prob.df)
    colnames(tidy.pred.df) <- c("truth", "estimate", levels(true))
  } else {
    # Ordinal or Binomial
    all.pred.df <- data.frame()
    for(resp in colnames(Y)){
      preds <- self$get.predictions(data = data, cv = cv)[[resp]]
      true <- Y
      estimates <- preds$class.predictions
      # make factor:
      true <- factor(true)
      estimates <- factor(estimates, levels = levels(true))
      tmp.df <- data.frame(
        truth = true,
        estimate = estimates
      )
      tidy.prob.df <- preds$class.probabilities
      tmp.df <- cbind(tmp.df, tidy.prob.df, resp)
      colnames(tmp.df) <- c("truth", "estimate", levels(true), "Response")
      all.pred.df <- rbind(all.pred.df, tmp.df)
    }
    all.pred.df <- dplyr::group_by(all.pred.df, Response)
    tidy.pred.df <- all.pred.df
  }
  return(tidy.pred.df)
}




#' Choose a combination of weights for a SPEAR object. Print `SPEAR$params$weights` to see all combinations trained.
#' @param w.idx Use a manual weight index (row from `SPEAR$params$weights`)? Defaults to `NULL`
#' @param method Which method to use? Needs to be from a `$train.spear(cv = TRUE)` object. Can be `"min"` (overall lowest mean cross validated loss), or `"sd"` (take highest weight within 1 sd cross validated loss).
#' @examples
#' SPEARobj <- SPEAR$new(...)
#' 
#' SPEARobj$train.spear(cv = TRUE)
#' 
#' # Set the weights
#' SPEARobj$set.weights()
#' SPEARobj$set.weights(w.x = 0.1)
#' SPEARobj$set.weights(w.idx = 1)
#' SPEARobj$set.weights(method = "sd")
#'@export
set.weights = function(w.idx = NULL, w.x = NULL, w.y = NULL, method = "sd"){
  private$check.fit()

  if(!is.null(w.idx)){
    if(!w.idx %in% 1:nrow(self$params$weights)){
      stop("ERROR: w.idx supplied not valid (must be between 1 and ", nrow(self$params$weights), "). Try SPEAR$params$weights to see all combinations (rows).")
    } else {
      if(!self$options$quiet){
        cat("Setting current weight index to ", w.idx, "\n", 
            "w.x: ", self$params$weights[w.idx,1],
            sep = "")
      }
      self$options$current.weight.idx = w.idx
    }
    
  }else if(!is.null(w.x) | !is.null(w.y)){
    if(is.null(w.x)){w.x <- 1}
    if(is.null(w.y)){w.y <- 1}
    
    sums <- sapply(1:nrow(self$params$weights), function(i){return(c(self$params$weights[i,1] - w.x, self$params$weights[i,2] - w.y))})
    
    if(!any(colSums(sums) == 0)){
      warning("*** Warning: w.x = ", w.x, " | w.y = ", w.y, " not found among possible weight combinations (SPEARobj$params$weights). Using closest SPEAR weight...\n")
    }
    w.idx = which.min(abs(colSums(sums)))
    
    if(!self$options$quiet){
      cat("Setting current weight index to ", w.idx, "\n", 
          "w.x: ", self$params$weights[w.idx,1],
          sep = "")
    }
    self$options$current.weight.idx = w.idx
    
  } else {
    # method:
    w.idx <- private$get.widx.from.method(method = method)
    if(!self$options$quiet){
      cat("Setting current weight index to ", w.idx, "\n", 
          "w.x: ", self$params$weights[w.idx,1], "\n",
         "w.y: ", self$params$weights[w.idx,2], "\n",
          sep = "")
    }
    self$options$current.weight.idx = w.idx
  }
  if(!self$options$quiet){cat("\n")}
  return(invisible(self))
}

#' Get a w.idx from a method (private function, can't be called publicly, use `set.weights` instead)
#' @param method Which method to use?
#'@export
get.widx.from.method <- function(method){
  if(self$fit$type != "cv"){
    stop("ERROR: $fit needs to come from $train.spear(cv = TRUE), not (cv = FALSE) to use a w.idx selection method.")
  } else if(is.null(self$fit$loss)){
    stop("ERROR: $cv.evaluate needs to be run to calculate the loss.")
  }
  
  # min -----------
  if(method == "min"){
    if(ncol(self$fit$loss$cvm) > 1){
      overall.cvm <- apply(self$fit$loss$cvm, 1, rowSums)
    } else {
      overall.cvm <- self$fit$loss$cvm
    }
    w.idx <- which.min(overall.cvm)
    
  } else if(method == "sd"){
    if(ncol(self$fit$loss$cvm) > 1){
      overall.cvm <- rowSums(self$fit$loss$cvm)
      overall.cvsd <- rowSums(self$fit$loss$cvsd)
    } else {
      overall.cvm <- self$fit$loss$cvm
      overall.cvsd <- self$fit$loss$cvsd
    }
    tmp.w.idx <- which.min(overall.cvm)
    tmp.cvsd <- overall.cvsd[tmp.w.idx]
    w.idx <- min(which(overall.cvm <= overall.cvm[tmp.w.idx] + tmp.cvsd))
    
  } else {
    stop("ERROR: method not recognized. See documentation for available methods to select a weight index.")
  }
  
  return(w.idx)
}



#' Plot factor scores from a SPEARobject for a particular dataset `data`.
#' @param data Which dataset to use? Can be any dataset listed under `$data$____`. Defaults to `"train"`.
#' @param cv If `data = "train"`, get factor scores generated from `$run.cv.spear`? If `$run.spear` was used or if `data != "train"` this parameter is ignored. Defaults to `FALSE`. NOTE: There is an element of randomness if the factor scores are not correlated with the response, so it is recomended to view the factor scores with `cv = FALSE`
#' @param group What to plot on the x.axis? Defaults to "response" (indicated by $options$current.response.idx), but can be any feature name found in colnames(SPEARobj$data$train$X)
#' @param factors Which factors to show? Defaults to `NULL` (all factors). Use a vector of integers (i.e. `c(1, 2, 4)`) to specify fewer factors.
#' @param plot.type Which type of plot (for `ordinal`, `binomial`, and `multinomial` families)? Can be `"violin"` (default) or `"boxplot"`. Ignored if `family == "gaussian"`.
#' @param show.legend Show the legend of the plot (for `group`)? Defaults to `FALSE`
#'@export
plot.factor.scores = function(data = "train", cv = FALSE, group = "response", factors = NULL, plot.type = "violin", show.legend = FALSE, scale = TRUE){
  fs <- self$get.factor.scores(data = data, cv = cv)
  if(scale){
    fs <- apply(fs, 2, scale)
  }
  Y <- private$get.Y(data = data)
  if(is.null(factors)){
    factors <- 1:ncol(fs)
  } else if(any(!factors %in% 1:ncol(fs))){
    stop("ERROR: factors argument incorrect. Must be a single integer or vector of number spanning 1 to ", ncol(fs)) 
  }
  fs <- fs[,factors, drop = FALSE]
  
  df <- as.data.frame(fs)
  
  if(group == "response"){
    # Gaussian:
    if(self$params$family == "gaussian"){
      df$group <- Y[,self$options$current.response.idx]
      group.label <- self$params$response[self$options$current.response.idx]
    }
    # Multinomial or Ordinal
    else {
      df$group <- MultiAssayExperiment::colData(self$data[[data]])[,self$params$response[self$options$current.response.idx]]
      group.label <- names(self$options$color.scheme$Y)[[self$options$current.response.idx]]
    }
    
  }
  # TODO: add support to plot w.r.t. an analyte...
  # else if(!group %in% colnames(self$data[[data]]$X)){
  #  stop("ERROR: group parameter not recognized. Can be 'response' for the current response index, or any feature name found in self$data$train$X.")
  #} else {
  #  df$group <- self$data[[data]]$X[,group]
  #  group.label <- group
  #}
  
  # Plot df:
  df$id <- rownames(df)
  df.melt <- reshape2::melt(df, id.vars = c("id", "group"))
  
  g <- ggplot2::ggplot(df.melt)
  
  # Gaussian:
  if(self$params$family == "gaussian"){
    g <- g + ggplot2::geom_point(ggplot2::aes(x = group, y = value)) +
      ggplot2::geom_smooth(ggplot2::aes(x = group, y = value), formula = y ~ x, color = "red", method = "lm")
  } else {
    if(plot.type == "boxplot"){
      g <- g + ggplot2::geom_jitter(ggplot2::aes(x = group, y = value, fill = group), shape = 21, stroke = .3, size = 2) +
        ggplot2::geom_boxplot(ggplot2::aes(x = group, y = value, fill = group), alpha = .2, outlier.shape = NA)
    } else if(plot.type == "violin"){
      total.mean.df <- data.frame()
      for(f in unique(df.melt$variable)){
        means <- sapply(sort(unique(df.melt$group)), function(val){
          return(mean(dplyr::filter(df.melt, variable == f, group == val)$value))
        })
        num.groups <- length(unique(df.melt$group))
        
        mean.df <- data.frame(x = 1:num.groups - .5,
                              xend = 1:num.groups + .5,
                              y = means,
                              yend = means,
                              variable = f)
        total.mean.df <- rbind(total.mean.df, mean.df)
      }
      
      g <- g + 
        ggplot2::geom_jitter(ggplot2::aes(x = group, y = value, fill = group), shape = 21, stroke = .3, size = 2, width = .15) +
        ggplot2::geom_violin(ggplot2::aes(x = group, y = value, group = group, fill = group), alpha = .2) +
        ggplot2::geom_segment(data = total.mean.df, ggplot2::aes(x = x, xend = xend, y = y, yend = yend), size = .85)
      
    } else {
      stop("ERROR: plot.type provided not recognized. Can be 'boxplot' or 'violin' (default).")
    }
    if(group == "response" & (self$params$family != "gaussian")){
      colors <- self$options$color.scheme$Y[[self$options$current.response.idx]]
      g <- g + ggplot2::scale_color_manual(values = colors) + ggplot2::scale_fill_manual(values = colors)# +
        #ggplot2::scale_x_discrete(breaks = 0:(length(colors)-1), labels = colnames(self$data$train$Y))
    }
  }
  
  
  g <- g + ggplot2::facet_wrap(ggplot2::vars(factor(variable, levels = paste0("Factor", factors))), scales = "free") +
    ggplot2::ylab("Factor Score") +
    ggplot2::xlab(group.label) +
    ggplot2::labs(color = group.label, fill = group.label) +
    ggplot2::theme_classic() +
    ggplot2::theme(strip.background = ggplot2::element_rect(size = .3, fill = "grey90"),
                   plot.title = ggplot2::element_text(hjust = .5, size = 10),
                   axis.line = ggplot2::element_blank(),
                   axis.text.x = ggplot2::element_text(angle = 90, hjust = 1, vjust = .5),
                   panel.border = ggplot2::element_rect(colour = "black", fill=NA, size=.5),
                   legend.position = "bottom")
  
  if(!show.legend){
    g <- g + ggplot2::guides(color ="none", fill = "none")
  }
  
  return(g)
}





#' Plot variance explained for a SPEAR object.
#' @param data Which dataset to use? Can be any dataset listed under `$data$____`. Defaults to `"train"`.
#' @param min.val Minimum percentage to be plotted. Must be a positive numeric value. Defaults to `.01`.
#'@export
plot.variance.explained = function(data = "train", min.val = .01){
  if(min.val < 0){
    stop("ERROR: min.val must be >= 0...")
  }
  contributions <- self$get.variance.explained(data = data)
  if(all(contributions < min.val)){
    stop("ERROR: No assays found to explain sufficient variance for min.val. Try min.val = 0.")
  }
  projection.coefs.melt <- reshape2::melt(contributions)
  g <- ggplot2::ggplot(projection.coefs.melt) +
    ggplot2::geom_tile(ggplot2::aes(x = Var2, y = factor(Var1, levels = paste0("Factor", self$params$num.factors:1)), fill = ifelse(value >= min.val, value, NA)), color = "grey50") +
    ggplot2::geom_text(data = dplyr::filter(projection.coefs.melt, value >= min.val),
                       ggplot2::aes(x = Var2, y = Var1, label = round(value, 2)), color = "black") +
    ggplot2::scale_fill_gradient2(low = "white", high = "#6831A3", na.value = "white") +
    ggplot2::labs(fill = "Var. Exp.") +
    ggplot2::theme_void() + ggplot2::theme(axis.text = ggplot2::element_text())
  # return plot, suppressing warnings for NA values
  return(suppressWarnings(g))
}



#' Plot variance explained for a SPEAR object.
#' @param data Which dataset to use? Can be any dataset listed under `$data$____`. Defaults to `"train"`.
#' @param min.val Minimum value for a coefficient to be plotted. Must be a positive numeric value. Defaults to `.001`.
#'@export
plot.factor.contributions = function(min.val = .001){
  if(min.val < 0){
    stop("ERROR: min.val must be >= 0...")
  }
  contributions <- self$get.factor.contributions()
  if(all(contributions < min.val)){
    stop("ERROR: No factor coefficients found within cutoff of min.val. Try min.val = 0.")
  }
  projection.coefs.melt <- reshape2::melt(contributions)
  g <- ggplot2::ggplot(projection.coefs.melt) +
    ggplot2::geom_tile(ggplot2::aes(x = Var2, y = factor(Var1, levels = paste0("Factor", self$params$num.factors:1)), fill = ifelse(abs(value) >= min.val, value, NA)), color = "grey50") +
    ggplot2::geom_text(data = dplyr::filter(projection.coefs.melt, abs(value) >= min.val),
                       ggplot2::aes(x = Var2, y = Var1, label = round(value, 3)), color = "black") +
    ggplot2::scale_fill_gradient2(low = "#316CA3", high = "darkred", na.value = "white") +
    ggplot2::labs(fill = "Coef.") +
    ggplot2::theme_void() + ggplot2::theme(axis.text = ggplot2::element_text())
  # return plot, suppressing warnings for NA values
  return(g)
}





#' Get factor contributions to response prediction from a SPEAR object.
#'@export
get.factor.contributions = function(){
  return(self$fit$projection.coefs.y.scaled[,,self$options$current.weight.idx, drop = FALSE])
}




#' Get variance explained per assay from a SPEAR object.
#' @param data Which dataset to use? Can be any dataset listed under `$data$____`. Defaults to `"train"`.
#'@export
get.variance.explained = function(data = "train"){
  w.idx <- self$options$current.weight.idx

  # X:
  W.to_X <- self$fit$projection.coefs.x[,,w.idx]
  Uhat <- self$get.factor.scores(data = data, cv = FALSE)
  W.to_X.list <- list()
  ind <- 1
  for(d in 1:length(self$data[[data]])){
    W.to_X.list[[d]] <- W.to_X[ind:(ind - 1 + nrow(self$data[[data]][[d]])),]
    ind <- ind + nrow(self$data[[data]][[d]])
  }
  factor_contributions = array(NA,dim = c(self$params$num.factors, length(self$data[[data]])))
  for(k in 1:self$params$num.factors){
    for(o in 1:length(self$data[[data]])){
      factor.contributions <- c()
      X.tilde <- Uhat[,k] %*% t(W.to_X.list[[o]][,k])
      # Get var.explained per feature:
      X <- t(self$data[[data]][[o]])
      X.tilde <- X.tilde + apply(X, 2, function(col){return(rnorm(n = length(col), mean = 0, sd = sqrt(var(col))/nrow(X)))})
      factor_contributions[k,o] <- mean(sapply(1:ncol(X), function(cid){
        tmp <- cor(X[,cid], X.tilde[,cid], method = "pearson")
        if(is.na(tmp)){return(0)};
        if(tmp < 0){return(0)};
        return(tmp^2)
      }))
    }
  }
  colnames(factor_contributions) <- names(self$data[[data]])
  rownames(factor_contributions) <- paste0("Factor", 1:nrow(factor_contributions))
  return(factor_contributions)
}




#' Estimate the number of factors needed for the SPEAR model.
#' @param simplify Only estimate the minimum and maximum number of factors (don't perform iterative factor construction)? Defaults to `FALSE`.
#' @param assign Should the SPEAR model parameter `$params$num.factors` be updated by the recommendation? Defaults to `TRUE`
#' @param num.iterations How many iterations to run to get average number of factors? Defaults to `10`.
#' @param max.num.factors Maximum number of factors? If `NULL` (default), will return `floor(N/10)` where `N` = number of samples in `X`
#' @param min.num.factors Minimum number of factors? If `NULL` (default), will calculate from SVD.
#' @param seed An integer provided for reproducibility, defaults to `NULL` (will not set seed)
#' @param SNR Signal-to-noise ratio parameter used for factor estimation. Defaults to `.1`.
#'@export
estimate.num.factors = function(assign = TRUE, num.iterations = 10, max.num.factors = NULL,
                                min.num.factors = NULL, SNR = 0.1, quiet = FALSE, simplify = FALSE){
  
  # Combine datasets if a list is provided:
  X <- private$get.concatenated.X()
  
  # Get N (# samples) and P (total # features)
  p = ncol(X)
  n = nrow(X)
  
  # Run SVD on X
  if(!quiet){cat("--------------------\n")}
  if(is.null(min.num.factors)){
    if(!self$options$quiet){cat("Running initial SVD for min:\n")}
    tmp1 = svd(X)$d
    sum0 = cumsum(tmp1^2)
    sum0 = sum0/sum0[length(sum0)]
    min.num.factors = which(sum0>0.5)[1]
    min.method <- paste0("SVD")
  } else {
    min.method <- "fixed"
  }
  if(is.null(max.num.factors)){
    max.num.factors = floor(n/10)
    max.method <- paste0("floor(", n, "/10)")
  } else {
    max.method <- "fixed"
  }
  
  # if simplify, just return min number of factors calculated
  if(simplify){
    if(min.method == "fixed"){stop("ERROR: Can't supply min.num.factors while simplify is TRUE. Needs to be NULL.")}
    if(!self$options$quiet){cat("min: ", min.num.factors," factors [", min.method, "]\n", sep = "")}
    if(assign){
      if(!self$options$quiet){cat("Assigning: $params$num.factors = ", min.num.factors, sep = "")}
      self$params$num.factors <- min.num.factors
    }
    return(invisible(self))
  }
  
  # Otherwise, run through iterative procedure, first check for min < max:
  if(!self$options$quiet){cat("min: ", min.num.factors," factors [", min.method, "]\n", sep = "")}
  if(!self$options$quiet){cat("max: ", max.num.factors," factors [", max.method, "]\n", sep = "")}
  if(!self$options$quiet){cat("--------------------\n")}
  
  # Max is greater than min. Return min:
  if(max.num.factors <= min.num.factors){
    cat("max factors (", max.num.factors, ") <= min factors (", min.num.factors, ")\n", sep = "")
    cat("Suggested: ", min.num.factors, " factors (min)\n", sep = "")
    if(assign){
      if(!self$options$quiet){cat("Assigning: $params$num.factors = ", min.num.factors, sep = "")}
      self$params$num.factors <- min.num.factors
    }
    return(invisible(self))
  }else{
    # Test iterations:
    residuals = matrix(1, ncol =max.num.factors, nrow = num.iterations)
    for(k in 1:num.iterations){
      if(!quiet){cat("Iter ", k, ":", num.iterations, "\t", sep = "")}
      E = matrix(rnorm(n*p), nrow = n, ncol = p)
      X1fake = X+E*sqrt(0.01*(1-SNR))
      X2fake = X-E*sqrt(100*(1-SNR))
      tmp2 = svd(X1fake)
      R = X2fake
      for(i in 1:max.num.factors){
        R = R - tmp2$u[,i]%*%t(tmp2$v[,i])*tmp2$d[i]
        residuals[k,i] = mean(R^2)
      }
      if(!self$options$quiet){cat("Best = ", which.min(residuals[k,]), " factors\n", sep = "")}
    }
    residuals_mean = apply(residuals,2,mean)
    id_select = which.min(residuals_mean)
    if(!self$options$quiet){cat("----------------\n")}
    if(!self$options$quiet){cat("min: ", min.num.factors, " factors\n", sep = "")}
    if(!self$options$quiet){cat("avg: ", id_select, " factors\n", sep = "")}
    if(!self$options$quiet){cat("max: ", max.num.factors, " factors\n", sep = "")}
    if(!self$options$quiet){cat("----------------\n")}
    if(id_select < min.num.factors){
      if(!self$options$quiet){cat("Suggested: ", min.num.factors, " factors (min)\n", sep = "")}
      id_select = min.num.factors
    } else if(id_select > max.num.factors){
      if(!self$options$quiet){cat("Suggested: ", max.num.factors, " factors (max)\n", sep = "")}
      id_select = min.num.factors
    } else {
      if(!self$options$quiet){cat("Suggested: ", id_select, " factors (avg)\n", sep = "")}
    }
    if(assign){
      if(!self$options$quiet){cat("Assigning: $params$num.factors = ", id_select, sep = "")}
      self$params$num.factors <- id_select
    }
    return(invisible(self))
  }
}




#' Get the loss from the cross validation. Must be called from a SPEARobject trained with `$run.cv.spear()` then `$cv.evaluate()`.
#'@export
get.cv.loss <- function(){
  if(self$fit$type != "cv"){
    stop("ERROR: Needs to be a cv.spear object (run SPEARobj$run.cv.spear()...)")
  }
  loss.output <- self$fit$loss
  
  # set column end for multinomial:
  response.labels <- self$params$response
  
  # set initial dfs
  loss.output$min <- data.frame()
  loss.output$sd <- data.frame()
  
  # Get best w.idxs
  min.widx = private$get.widx.from.method(method = "min")
  sd.widx = private$get.widx.from.method(method = "sd")
  
  for(j in 1:length(response.labels)){
    response = response.labels[j]
    # min
    cvm = loss.output$cvm[min.widx,j]
    cvsd = loss.output$cvsd[min.widx,j]
    loss.temp <- data.frame(response = response,
                            widx = min.widx,
                            cvm = cvm,
                            cvsd = cvsd)
    loss.output$min <- rbind(loss.output$min, loss.temp)
    # sd
    cvm = loss.output$cvm[sd.widx,j]
    cvsd = loss.output$cvsd[sd.widx,j]
    loss.temp <- data.frame(response = response,
                            widx = sd.widx,
                            cvm = cvm,
                            cvsd = cvsd)
    loss.output$sd <- rbind(loss.output$sd, loss.temp)
  }
  rownames(loss.output$min) <- NULL
  rownames(loss.output$sd) <- NULL
  
  return(loss.output)
}



#' Get the loss from the cross validation. Must be called from a SPEARobject trained with `$run.cv.spear()` then `$cv.evaluate()`.
#'@export
plot.cv.loss <- function(show.sd = TRUE, show.min = TRUE){
  if(self$fit$type != "cv"){
    stop("ERROR: Needs to be a cv.spear object (run SPEARobj$run.cv.spear()...)")
  }
  # CVM
  cvm.df <- self$fit$loss$cvm
  cvm.df.melt <- reshape2::melt(cvm.df)
  colnames(cvm.df.melt)[which(colnames(cvm.df.melt) == "value")] <- "cvm"
  # CVSD
  cvsd.df <- self$fit$loss$cvsd
  cvsd.df.melt <- reshape2::melt(cvsd.df)
  colnames(cvsd.df.melt)[which(colnames(cvsd.df.melt) == "value")] <- "cvsd"
  
  # Join together CVM and CVSD:
  tmp <- suppressMessages(dplyr::inner_join(cvm.df.melt, cvsd.df.melt,
                                            id.vars = c("Var1", "Var2")))
  
  # Generate labels for x:
  x.labs <- sapply(1:nrow(cvm.df), function(i){
    return(paste0("w.x = ", self$params$weights[i,1], "\n", "idx = ", i))
  })
  
  # Get sd / min idx:
  sd.idx <- private$get.widx.from.method(method = "sd")
  min.idx <- private$get.widx.from.method(method = "min")
  
  # Generate plot:
  plt <- ggplot2::ggplot(tmp) +
    ggplot2::geom_point(ggplot2::aes(x = Var1, y = cvm), size = 3) +
    ggplot2::geom_errorbar(ggplot2::aes(x = Var1, ymin = cvm - cvsd, ymax = cvm + cvsd), size = .3, width = .2) +
    ggplot2::scale_x_discrete(labels = x.labs) +
    ggplot2::theme_void() + ggplot2::theme(axis.text = ggplot2::element_text(),
                                           panel.grid.major.y = ggplot2::element_line(size = .2, color = "grey50"),
                                           panel.border = ggplot2::element_rect(fill = NA),
                                           strip.text = ggplot2::element_text(vjust = .5, size = 10)) +
    ggplot2::facet_wrap(ggplot2::vars(Var2))
  if(show.sd){
    plt <- plt + ggplot2::geom_point(data = dplyr::filter(tmp, Var1 == paste0("widx", sd.idx)), ggplot2::aes(x = Var1, y = cvm), size = 6, color = "blue")
  }
  if(show.min){
    plt <- plt + ggplot2::geom_point(data = dplyr::filter(tmp, Var1 == paste0("widx", min.idx)), ggplot2::aes(x = Var1, y = cvm), size = 4.5, color = "red")
  }
  return(plt)
}


#' Choose a response to be preferred for plotting functions. Will set `SPEARobj$options$current.response.idx` (which can be set manually as well)
#' @param response Which response to use? Can be an integer (which column number/index?) or string (which column name)? If string, must be found in `colnames(SPEARobj$data$train$Y)`. Defaults to NULL (`1`)
#' @examples
#' # Use the column number / index
#' SPEARobj$set.response(1)
#' 
#' # use the column name instead
#' SPEARobj$set.response("response_name")
#'@export
set.response = function(response = NULL){
  private$check.fit()
  
  if(self$params$family == "multinomial"){
    response.idx = 1
  } else {
    if(is.null(response)){
      response.idx = 1
    } else if(is.character(response)){
      if(!response %in% self$params$response){
        stop("ERROR: response not found in $params$response. Please check...")
      }
      response.idx = which(response %in% self$params$response)
    } else if(is.numeric(response)){
      if(!response %in% 1:length(self$params$response)){
        stop("ERROR: response number not within the indices of 1:", length(self$params$response), ". Please check...")
      }
      response.idx = response
    } else {
      stop("ERROR: response provided not recognized... Must be an integer (i.e. 1) or character (i.e. column name).")
    }
  }
  
  if(!self$options$quiet){
    cat("Setting current response index to ", response.idx, "\n", 
        "response: ", self$params$response[response.idx], "\n\n",
        sep = "")
  }
  
  self$options$current.response.idx = response.idx
  
  return(invisible(self))
}
jgygi/SPEAR documentation built on July 5, 2023, 5:35 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
jgygi/SPEAR
SPEAR - Sparse Supervised Bayesian Factor Model for Multi-omic Analyses

R/2_helpers.R
In jgygi/SPEAR: SPEAR - Sparse Supervised Bayesian Factor Model for Multi-omic Analyses

R Package Documentation

Browse R Packages

We want your feedback!

jgygi/SPEAR SPEAR - Sparse Supervised Bayesian Factor Model for Multi-omic Analyses

R/2_helpers.R In jgygi/SPEAR: SPEAR - Sparse Supervised Bayesian Factor Model for Multi-omic Analyses

R Package Documentation

Browse R Packages

We want your feedback!

jgygi/SPEAR
SPEAR - Sparse Supervised Bayesian Factor Model for Multi-omic Analyses

R/2_helpers.R
In jgygi/SPEAR: SPEAR - Sparse Supervised Bayesian Factor Model for Multi-omic Analyses