R/spiec-easi.R

Defines functions spiec.easi.list multi.spiec.easi spiec.easi.default .check_pulsar_params .spiec.easi.norm spiec.easi.otu_table spiec.easi.phyloseq .data.checks .phy2mat spiec.easi

Documented in multi.spiec.easi spiec.easi spiec.easi.default spiec.easi.list spiec.easi.otu_table spiec.easi.phyloseq

#' SPIEC-EASI pipeline
#'
#' Run the whole SPIEC-EASI pipeline, from data transformation, sparse inverse covariance estimation and model selection.
#' Inputs are a non-normalized OTU table and pipeline options.
#' @export
#' @importFrom pulsar pulsar batch.pulsar getMaxCov getLamPath
spiec.easi <- function(data, ...) {
  UseMethod('spiec.easi', data)
}

.phy2mat <- function(OTU) {
  if (inherits(OTU, 'phyloseq'))
    OTU <- OTU@otu_table
  if (inherits(OTU, 'otu_table')) {
    if (OTU@taxa_are_rows) OTU <- t(OTU@.Data)
    else OTU <- OTU@.Data
  }
  return(OTU)
}

.data.checks <- function(data) {
  ## data checks ##
  if (inherits(data, 'list')) {
    sink <- lapply(data, .data.checks)
    return(NULL)
  }
  if (isTRUE(all.equal(rowSums(data), rep(1L, nrow(data))))) {
    warning('Data is normalized, but raw counts are expected')
  }

  if (any(data<0)) {
    warning('Negative values detected, but raw counts are expected')
  }
  return(NULL)
}

#' @method spiec.easi phyloseq
#' @rdname spiec.easi
#' @export
spiec.easi.phyloseq <- function(data, ...) {
  if (!requireNamespace('phyloseq', quietly=TRUE)) {
    stop('\'Phyloseq\' package is not installed. See doi.org/doi:10.18129/B9.bioc.phyloseq')
  }
  spiec.easi(.phy2mat(data), ...)
}

#' @method spiec.easi otu_table
#' @rdname spiec.easi
#' @export
spiec.easi.otu_table <- function(data, ...) {
  if (!requireNamespace('phyloseq', quietly=TRUE)) {
    stop('\'Phyloseq\' package is not installed. See doi.org/doi:10.18129/B9.bioc.phyloseq')
  }
  spiec.easi(.phy2mat(data), ...)

}


#' @noRd
.spiec.easi.norm <- function(data) {
# internal function to normalize a data matrix
  if (inherits(data, 'matrix')) {
    ## standard data pipeline
    return(t(clr(data+1, 1)))
  } else if (inherits(data, 'list')) {
    ## multi domain spiec.easi, data must be list of numeric matrices
    return(do.call('cbind', lapply(data, .spiec.easi.norm)))
  } else {
    stop('input data must be a numeric matrix')
  }
}


#' @name pulsar.params
#' @title pulsar params
#' @description The values to the \code{pulsar.params}/\code{icov.select.params} argument in the \code{\link{spiec.easi}} function must be a list with values from pulsar and/or batch.pulsar. See the pulsar docs for detailed instructions.
#'
#' List of arguments, data type, default. Description
#' \itemize{
#'    \item thresh, numeric, 0.05. Threshold for StARS criterion.
#'    \item subsample.ratio, numeric, 0.8. Subsample size for StARS.
#'    \item rep.num, numeric, 20. Number of subsamples for StARS.
#'    \item seed, numeric, NULL. Set the random seed for subsample set.
#'    \item ncores, numeric, 1. Number of cores for parallel.
#' }
#' With \code{pulsar.select='batch'}, additional arguments:
#' \itemize{
#'    \item wkdir, dir path, current directory. Working directory for process running jobs.
#'    \item regdir, dir path, temp directory. Directory for storing the registry files.
#'    \item init, string, 'init'. String for differentiating the init registry for batch mode pulsar.
#'    \item conffile, string / file path, ''. Path to config file or string that identifies a default config file.
#'    \item job.res, list, empty list. Named list to specify job resources for an hpc.
#'    \item cleanup, boolean, FALSE. Remove registry files.
#'}
#' @seealso \code{\link[pulsar]{pulsar}} \code{\link[pulsar]{batch.pulsar}} \code{\link{spiec.easi}}
NULL

#' @noRd
.check_pulsar_params <- function(fun, args=list()) {
  if (!inherits(args, 'list') || (length(args) >0 && is.null(names(args))) || any('' %in% names(args))) {
    stop('pulsar.params must be a named list')
  }
  if (length(args)==0) return(TRUE)
  fun   <- match.fun(fun)
  forms <- formals(fun)

  ## disallowed arguments provided by spiec.easi
  nargs    <- c("data", "fun", "fargs", "criterion")
  extrargs <- intersect(names(args), nargs)
  if (length(extrargs)>0) {
    argstr <- paste0(paste0("'", extrargs, "'"), collapse=", ")
    sstr <- ifelse(length(extrargs)==1, '', 's')
    stop(sprintf("Disallowed argument%s to \'pulsar.params\': %s", sstr, argstr))
  }
  ## check provided args
  allforms <- setdiff(names(forms), nargs)
  extrargs <- setdiff(names(args), allforms)
  if (length(extrargs)>0) {
    argstr <- paste0(paste0("'", extrargs, "'"), collapse=", ")
    sstr <- ifelse(length(extrargs)==1, '', 's')
    stop(sprintf("Unrecognized argument%s to \'pulsar.params\': %s", sstr, argstr))
  }
  return(TRUE)
}


#' @param data For a matrix, non-normalized count OTU/data table with samples on rows and features/OTUs in columns. Can also by phyloseq or otu_table object.
#' @param method estimation method to use as a character string. Currently either 'glasso' or 'mb' (meinshausen-buhlmann's neighborhood selection)
#' @param sel.criterion character string specifying criterion/method for model selection. Accepts 'stars' [default], 'bstars' (Bounded StARS)
#' @param verbose flag to show progress messages
#' @param pulsar.select flag to perform model selection. Choices are TRUE/FALSE/'batch'
#' @param pulsar.params list of further arguments to \code{\link{pulsar}} or \code{\link{batch.pulsar}}. See the documentation for \code{\link{pulsar.params}}.
#' @param icov.select deprecated.
#' @param icov.select.params deprecated.
#' @param lambda.log should values of lambda be distributed logarithmically (\code{TRUE}) or linearly ()\code{FALSE}) between \code{lamba.min} and \code{lambda.max}?
#' @param ... further arguments to \code{\link{sparseiCov}} / \code{huge}
#' @method spiec.easi default
#' @rdname spiec.easi
#' @seealso multi.spiec.easi
#' @export
spiec.easi.default <- function(data, method='glasso', sel.criterion='stars',
                        verbose=TRUE, pulsar.select=TRUE, pulsar.params=list(),
                        icov.select=pulsar.select,
                        icov.select.params=pulsar.params,
                        lambda.log=TRUE, ...) {

  args <- list(...)
  if (verbose) msg <- .makeMessage("Applying data transformations...")
  else msg <- .makeMessage('')

  switch(method,
        glasso = {
                    message(msg, appendLF=verbose)
                    estFun <- "sparseiCov"
                    args$method <- method
                    X <- .spiec.easi.norm(data)
                    if (is.null(args[['lambda.max']]))
                      args$lambda.max <- getMaxCov(cor(X))
                 },

        mb     = {
                    message(msg, appendLF=verbose)
                    estFun <- "sparseiCov"
                    args$method <- method
                    X <- .spiec.easi.norm(data)
                    if (is.null(args[['lambda.max']]))
                      args$lambda.max <- getMaxCov(cor(X))
                  },

        slr    = {
                    # if (!require('irlba'))
                      # stop('irlba package required')
                    if (length(args$r) > 1) { #TODO: add beta vector option
                      tmp <- lapply(args$r, function(r) {
                        if (verbose)
                          message(sprintf("SPIEC-EASI SLR, with rank r=%s", r))
                        args$r <- r
                        args2 <- c(list(data=data, method='slr',
                            sel.criterion=sel.criterion, verbose=verbose,
                            pulsar.params=pulsar.params,
                            pulsar.select=pulsar.select), args)
                        do.call(spiec.easi, args2)
                      })
                      names(tmp) <- paste0("rank", args$r)
                      return(tmp)
                    }
                    message(msg, appendLF=verbose)
                    estFun <- "sparseLowRankiCov"
                    X <- .spiec.easi.norm(data)
                    if (is.null(args[['lambda.max']]))
                      args$lambda.max <- getMaxCov(cov(X))
                  },

        coat   = {
                    message(msg, appendLF=verbose)
                    estFun <- "coat"
                    X <- .spiec.easi.norm(data)
                    if (is.null(args[['lambda.max']]))
                      args$lambda.max <- getMaxCov(X)
                  },

        ising  = {
                    if (inherits(data, 'list'))
                      stop('method "ising" does not support list data')

                    message(msg, appendLF=verbose)
                    estFun <- "neighborhood.net"
                    args$method <- method
                    X <- sign(data) ;
                    if (is.null(args[['lambda.max']]))
                      args$lambda.max <- max(abs(t(scale(X)) %*% X)) / nrow(X)
                  },

        poisson= {
                  if (inherits(data, 'list'))
                    stop('method "poisson" does not support list data')

                    message(msg, appendLF=verbose)
                    estFun <- "neighborhood.net"
                    args$method <- method
                    X <- data ;
                    if (is.null(args[['lambda.max']]))
                      args$lambda.max <- max(abs(t(scale(X)) %*% X)) / nrow(X)
                  }
    )

  if (is.null(args[[ "lambda" ]])) {
    if (is.null(args[[ "lambda.min.ratio" ]])) args$lambda.min.ratio <- 1e-3
    if (is.null(args[[ "nlambda" ]])) args$nlambda <- 20
    args$lambda <- getLamPath(args$lambda.max, args$lambda.max*args$lambda.min.ratio,
                              args$nlambda, log=lambda.log)
    args$lambda.min.ratio <- args$nlambda <- args$lambda.max <- NULL
  }

  ocall <- match.call(expand.dots=FALSE)
  ## if pulsar options are not specified, check for deprecated icov.select options are
  if (is.null(ocall[["pulsar.select"]]) && is.null(ocall[["pulsar.params"]])) {
    pulsar.select <- icov.select
    pulsar.params <- icov.select.params
  }

  if (!is.null(pulsar.params[[ "data" ]]))
    stop("supply data directly to spiec.easi, not pulsar.params")
  if (!is.null(pulsar.params[[ "criterion" ]]))
    stop("supply sel.criterion directly to spiec.easi, not pulsar.params")


  if (pulsar.select=="batch") {
    fun <- "batch.pulsar"
    call <- quote(batch.pulsar(data=X, fun=match.fun(estFun), fargs=args))
    pulsar.select <- TRUE
  } else {
    fun <- "pulsar"
    call <- quote(pulsar(data=X, fun=match.fun(estFun), fargs=args))
  }

  if (pulsar.select) {
    ## process pulsar.params defaults
    flag <- .check_pulsar_params(fun, pulsar.params)

    pulsar.params$criterion <-
      switch(sel.criterion,
             stars = "stars",
            bstars = "stars",
  #          gstars = c("stars", "gcd"), #TODO: process gstars option
            stop("Unknown selection criterion"))

    if (sel.criterion %in% c("bstars", "gstars"))
      pulsar.params$lb.stars <- pulsar.params$ub.stars <- TRUE
    if (is.null(pulsar.params[[ "thresh" ]])) pulsar.params$thresh <- 0.05


    obj <- list(call=call)
    class(obj) <- 'pulsar'
    call <- do.call('update',
              c(pulsar.params, list(object=obj, evaluate=FALSE)))

    if (verbose)
      message(sprintf("Selecting model with %s using ", fun), sel.criterion, "...")
    est <- eval(call, environment())
    if (sel.criterion == "gstars")
      opt.index <- pulsar::opt.index(est) <- pulsar::get.opt.index(est, 'gcd')
    else
      opt.index <- pulsar::opt.index(est, 'stars')
  } else
    est <- structure(list(call=call, envir=environment()), class='pulsar')

  if (verbose) message("Fitting final estimate with ", method, "...")
  suppressWarnings(
  fit <- pulsar::refit(est)
  )
  if (pulsar.select) {
    fit$select <- est
  }
  fit$lambda <- args$lambda
  fit$fun    <- call(estFun)[[1]]
  if (verbose) message('done')

  return(fit)
}


#' multi domain SPIEC-EASI
#'
#' A SPIEC-EASI pipeline for inferring a sparse inverse covariance matrix within and between multiple compositional datasets, under joint sparsity penalty.
#'
#' Can also run \code{spiec.easi} on a list and S3 will dispatch the proper function.
#' @param datalist list of non-normalized count OTU/data tables (stored in a matrix, data.frame or phyloseq/otu_table) with samples on rows and features/OTUs in columns
#' @param method estimation method to use as a character string. Currently either 'glasso' or 'mb' (meinshausen-buhlmann's neighborhood selection)
#' @param sel.criterion character string specifying criterion/method for model selection. Accepts 'stars' and 'bstars' [default]
#' @param verbose flag to show progress messages
#' @param pulsar.select flag to perform model selection. Choices are TRUE/FALSE/'batch'
#' @param pulsar.params list of further arguments to \code{\link{pulsar}} or \code{\link{batch.pulsar}}. See the documentation for \code{\link{pulsar.params}}.
#' @param ... further arguments to \code{\link{sparseiCov}} / \code{huge}
#' @seealso spiec.easi
#' @export
multi.spiec.easi <- function(datalist, method='glasso', sel.criterion='stars',
                        verbose=TRUE, pulsar.select=TRUE, pulsar.params=list(),
                        ...) {
## functional wrapper for spiec.easi.list
  spiec.easi.list(datalist, method=method, sel.criterion=sel.criterion,
                  verbose=verbose, pulsar.select=pulsar.select,
                  pulsar.params=pulsar.params, ...)
}

#' @method spiec.easi list
#' @param data non-normalized count OTU/data table with samples on rows and features/OTUs in columns. Can also be list of phyloseq objects.
#' @rdname multi.spiec.easi
#' @export
spiec.easi.list <- function(data, ...) {
  classes <- sapply(data, class)
  if (length(unique(classes)) != 1)
    warning('input list contains data of mixed classes.')

  ## convert phyloseq objects to matrices
  if (any('phyloseq' %in% classes) || any('otu_table' %in% classes))
    data <- lapply(data, .phy2mat)

  ## Finally, check the number of rows (samples) are equal
  ## and sample names are identical (sample names can be NULL)
  ssizes <- lapply(data, nrow)
  snames <- lapply(data, row.names)
  list.equal <- function(li) sum(duplicated(li)) == length(li)-1

  if (!list.equal(snames) || !list.equal(ssizes))
    stop("Do not run multi.spiec.easi with unidentical sample scheme")
  spiec.easi.default(data, ...)
}
zdk123/SpiecEasi documentation built on Oct. 20, 2023, 6:49 a.m.