# R/csSAMfit.R In shenorrLab/csSAM: csSAM - cell-specific Significance Analysis of Microarrays

```# Interface to Fit csSAM Models
#
# Author: Renaud Gaujoux
###############################################################################

# TODO: move this function to pkgmaker or other generic package
.aggregate_list <- function(x){

.tips <- function(x){
unlist(sapply(names(x), function(n){
if( is.list(x[[n]]) ) file.path(n, .tips(x[[n]])) else n
}), use.names = FALSE)
}

.traverse <- function(x, name){
res <- x
lapply(name, function(n) res <<- res[[n]])
res
}

t <- .tips(x[[1L]])
tl <- strsplit(t, "/")
res_flat <- lapply(tl, function(n){
xl <- lapply(x, .traverse, n)
fun <- if( !is.null(dim(xl[[1]])) ){
if( nrow(xl[[1]]) == 1L ) rbind else cbind
}else `c`
do.call(fun, xl)
})
res_flat <- setNames(res_flat, t)
#      res_flat <- sapply(res_flat, function(x){
#                  if( is.null(dim(x)) && all(x == x[[1L]]) ) x[1L]
#                  else x
#              }, simplify = FALSE)

# reshape into a nested list
res <- list()
#    .nest <- function(n, np = NULL){
#        tip <- paste0(np, n, collapse = "/")
#        if( !length(n) ) return(res_flat[[tip]])
#        if( length(n) > 1 ){
#            res[[tip]] <<- .nest(n[-1], c(np, n[1]))
#        }else res[[n]] <<- res_flat[[tip]]
#    }

.nest <- function(flat, children){
if( !length(flat) ) return()
s <- split(seq_along(flat), sapply(children, head, 1L))
res <- sapply(s, function(i){
ch_i <- children[i]
l1 <- lengths(ch_i) == 1L
elmt <- flat[i[l1]]
elmt <- c(elmt, .nest(flat[i[!l1]], lapply(ch_i[!l1], tail, -1L, simplify = FALSE)))
if( length(elmt) == 1L ) elmt <- elmt[[1]]
elmt
#       if( length(j) == 1L ) res_flat[[j]]
#       else .nest(j, lapply(children[j], tail, -1L, simplify = FALSE))
}, simplify = FALSE)
res
}
.nest(res_flat, tl)[names(x[[1]])]
}

# internal definition of basis() (original definition is in NMF)
basis <- function(x) x\$basis
`basis<-` <- function(x, value){
x\$basis <- value
x
}
basisfit <- function(x) x\$basisfit

#' Fits a csSAM Model
#'
#'
#' @param x target matrix assayed on mixed samples.
#' @param ... arguments passed down to the relevant S3 method.
#'
#' @return a `csSAMfit` object, that is a list with the following elements:
#'   *
#'   *
#'   *
#' @export
csSAMfit <- function(x, ...){
UseMethod('csSAMfit')
}

#' @describeIn csSAMfit convenient formula interface that can take advantage of expression and
#' sample annotation data stored in [Biobase::ExpressionSet] objects.
#'
#' @export
csSAMfit.formula <- function(x, data = NULL, ...){
# parse formula
model <- csFormula(x, data = data)
# call workhorse function
csSAMfit.default(model\$target, cc = t(model\$x), y = model\$data, covariates = model\$covariates, ...)

}

#' @export
csSAMfit.Formula <- csSAMfit.formula

#' @describeIn csSAMfit workhorse method that takes model elements in separate
#' arguments.
#'
#' @inheritParams csSamWrapper
#' @param covariates matrix of covariates to include in the model, e.g., for correcting
#' for gender, age or batch effects.
#' @param fit fitting method, that dictates the type of model that is fitted:
#'   * auto: automatic selection based on input data and model specification (e.g., presence of covariates)
#'   * block: two linear models are fitted within each group of samples. Only works when the variable
#' of interest is a factor (categorical variable).
#'   * lm: full interaction linear model.
#'   * monovariate: model is fitted including one cell type at a time.
#' This could be a work around for when there are many more cell types than samples in a given group.
#'
#' @param keep.all logical that indicates if the statistics of all the permutations should be also
#' returned in slot `rhatperm`.
#' @param verbose single logical or integer that controls verbosity levels.
#' @param .rng specification of the initial RNG settings, that are set using [rngtools::setRNG].
#'
#' @import pkgmaker
#' @importFrom rngtools setRNG
#' @export
csSAMfit.default <- function(x, cc, y = NULL, covariates = NULL
, nperms = 200, alternative = c('all', 'two.sided', 'greater', 'less')
, fit = c('auto', 'block', 'lm', 'monovariate')
, standardize=TRUE, medianCenter=TRUE
, logRm =FALSE, logBase = 2, nonNeg=NULL
, keep.all = FALSE, verbose = FALSE
, .rng = NULL, ...){

# check for extra arguments
if( length(extra <- list(...)) )
stop("Unused arguments: ", str_out(names(extra), Inf))

# seed computation if requested
if( !is.null(.rng) ){
orng <- setRNG(.rng)
on.exit( setRNG(orng) )
}

# set local verbose level
if( !missing(verbose) || verbose ){
ol <- lverbose(verbose)
}

## map arguments to those of csSamWrapper
if( is.null(cc) ) stop("Cannot fit csSAM model: missing proportion data.")

Y <- x; x <- cc

if( isExpressionSet(Y) ){
if( !requireNamespace('Biobase') )
stop("Missing dependency: package 'Biobase' is required to fit csSAM model on ExpressionSet objects.")
Y <- Biobase::exprs(Y)
}
if( isExpressionMix(x) ) x <- coef(x)
# data contains the definition of the effect of interest
if( is.null(y) ) y <- factor(rep(1, ncol(Y)))
if( !is.numeric(y) ){
if( !is.factor(y) ) y <- factor(y, levels=unique(y))
# remove absent levels
y <- droplevels(y)

} else stop("Invalid effect variable y: model does not handle variables of class [", class(y), ']')

# transpose/copy for csSAM
G <- t(Y); cc <- t(x)
##

# show details of effect variable
if( is.factor(y) ){
# group composition
n <- summary(addNA(y), maxsum=Inf)
vmessage("Groups: ", str_out(n, Inf, use.names=TRUE, sep=' | '))

} else{
# range
n <- summary(y)
vmessage("Variable: ", str_out(n, Inf, use.names=TRUE, sep=' | '))

}
# show cell types
vmessage("Cell type(s): ", if( is.null(colnames(cc)) ) ncol(cc) else str_out(colnames(cc), total = TRUE))

# remove NA-labeled samples
if( length(i_rm <- which(is.na(y))) ){
wnote('Dropping samples with NA group/value: ', str_out(rownames(G)[i_rm]), ' [', length(i_rm), ']')
y <- y[-i_rm]
cc <- cc[-i_rm, , drop=FALSE]
G <- G[-i_rm, , drop=FALSE]

}

res_object <- structure(list(basis = numeric(), coefficients = t(cc)), class = 'csSAMfit')

# perform monovariate fit if requested
fit <- match.arg(fit)
vmessage("Fitting mode: ", fit)
if( fit == 'monovariate' && ncol(cc) > 1L ){

res <- matrix(NA, nrow(Y), ncol(cc), dimnames = list(rownames(Y), colnames(cc)))
ca <- match.call()
ca[[1L]] <- as.name('csSAMfit')
ca[['x']] <- t(G)
ca[['y']] <- y
ca[['verbose']] <- max(verbose-1, 0)
progress <- iterCount(n = ncol(cc), title='  Fitting monovariate models', extra = colnames(cc), verbose = verbose)
bfit <- sapply(colnames(cc), function(ct){
progress()
ca[['cc']] <- t(cc[, ct, drop = FALSE])
mono_fit <- eval(ca)
res[, ct] <<- basis(mono_fit)
mono_fit

}, simplify = FALSE)
progress(ncol(cc))

#    d <- dimnames(x)
res_object\$basisfit <- .aggregate_list(sapply(bfit, basisfit, simplify = FALSE))
basis(res_object) <- res
#    dimnames(x) <- d

return(res_object)
}

# remove samples that have missing data in any one of the cell types
if( length(i_rm <- which(apply(cc, 1L, anyNA))) ){
wnote('Dropping samples with missing proportions: ', str_out(rownames(G)[i_rm], total = TRUE))
y <- y[-i_rm]
cc <- cc[-i_rm, , drop=FALSE]
G <- G[-i_rm, , drop=FALSE]
}

# show details after filtering
if( is.factor(y) ){
# group composition
if( !identical(n, n2 <- summary(addNA(y), maxsum=Inf)) )
vmessage("Groups (filtered): ", str_out(n2, Inf, use.names=TRUE, sep=' | '))

}else{
# range
if( !identical(n, n2 <- summary(y)) )
vmessage("Variable (filtered): ", str_out(n2, Inf, use.names=TRUE, sep=' | '))

}
n <- n2

vmessage("Data (filtered): ", sprintf("%s features x %s samples", ncol(G), nrow(G)))

# choose version based on inputs
if( fit == 'auto' ){
# covariates require version 2 (interaction model fit)
fit <- 'block'
if( !is.null(covariates) ){
vmessage("Model has extra covariates: fitting lm interaction model")
fit <- 'lm'

} else if( is.numeric(y) ){
vmessage("Model has numeric effect: fitting lm interaction model")
fit <- 'lm'

} else if( is.factor(y) && nlevels(y) > 2L ){
vmessage("Model has factor effect with more than 2 levels: fitting lm interaction model")
fit <- 'lm'

}

}
version <- c(block = 1, lm = 2, monovariate = 1)[fit]

# use factor encoding
#  if( is.factor(y) && version == 1 ) y <- as.integer(y)

# detect negative values
if( version == 1 && is.null(nonNeg) ){
if( nonNeg <- (min(G, na.rm = TRUE) * min(cc, na.rm = TRUE) >= 0) ) vmessage("Input values: positives")
else vmessage("Input values: mixed signs")
}
vmessage("Fitting model with ", if( isFALSE(nonNeg) ) "mixed sign" else "nonnegative", " effects")

#	numset = length(unique(y))
numset <- nlevels(y)
# check parameters
if( numset > 2L ){
if( version != 2 ){
stop("csSAM - Cannot handle more than 2 groups of samples [", numset, "]")
}else{
vmessage("Model with more than 2 groups: switching to version 2")
version <- 2
}

}
if( nrow(G) != nrow(cc) ){
stop("csSAM - Incompatible dimensions: number of samples in the"
, " proportion matrix [", nrow(cc), "]"
, " should match the number of samples in the target matrix [", nrow(G),"]")
}
#

# fit csSAM model
fitObject <- .csSAMfit(G, cc, y, covariates = covariates, version = version, stat.only = FALSE
, standardize = standardize, medianCenter = medianCenter, nonNeg = nonNeg, logRm = logRm, logBase = logBase)

# compute FDR
if( nperms && is.null(fitObject\$stat) ){
warning("No cell type-specific differences were computed: FDR computation was skipped")
nperms <- 0L
}
if( nperms ){
alternative <- match.arg(alternative, several.ok = TRUE)
vmessage("Computing FDR using ", nperms, " permutations ... ", appendLF=FALSE)
elapsed <- system.time({
fdr.csSAM <- fdrCsSAM(G,cc,y, fitObject\$stat
, nperms = nperms, alternative = alternative
, standardize = standardize, medianCenter = medianCenter
, logRm = logRm, logBase = logBase, nonNeg = nonNeg
, verbose = max(verbose - 1L, 0), version = version, covariates = covariates
#              , block = block, rblock = rblock
)
#          fdr.csSAM\$alternative <- alternative
})
vmessage("OK")
lmessage(2L, "Timing:")
if( verbose >= 2L ) show(elapsed)

# remove permuation results if not requested (to avoid memory issue)
if( !keep.all ) fdr.csSAM\$rhatperm <- NULL
}

# set result basis slot
b <- t(fitObject\$coefficients)
rownames(b) <- rownames(Y)
if( ncol(b) == ncol(cc) ){ # single interaction
colnames(b) <- colnames(cc)
basis(res_object) <- b

}else{ # multi-interaction -> return an array of differences
lev <- gsub(".*:([^:]+)\$", "\\1", colnames(b))
resA <- array(NA_real_, dim = c(nrow(b), ncol(cc), length(lev)), dimnames = c(rownames(b), colnames(cc), list(lev)))
for(i in seq(lev)){
resA[,,i] <- b[, grep(sprintf(":%s\$", lev[i]), colnames(b))]
}
basis(res_object) <- resA
}

# attach fit result list
res_object\$basisfit <- c(fitObject, list(version = version, method = fit), if( nperms ) fdr.csSAM)
res_object\$call <- match.call()
res_object\$nperms <- nperms
res_object\$y <- y
res_object\$n <- ncol(G)

# return
res_object
}

# Internal workhorse function: assumes all parameters are in the correct format
# It is used when computing FDRs.
.csSAMfit <- function(G, cc, y, covariates = NULL, version = 1, standardize = TRUE, medianCenter = TRUE, nonNeg = TRUE, logRm = FALSE, logBase = 2
, stat.only = TRUE, verbose = FALSE){

# set local verbose level
if( !missing(verbose) || verbose ){
ol <- lverbose(verbose)
}

if( version == 2 ){
vmessage("Fitting linear interaction model ... ", appendLF=FALSE)
csSAMfit <- csSAM_single(cc, G, y, covariates = covariates, standardize = standardize, medianCenter = medianCenter, stat.only = stat.only)
vmessage('OK')

}else{
vmessage("Fitting linear model within each group ... ", appendLF=FALSE)
#    sets <- split(1:nrow(G), y)
n <- summary(y)
deconv <- sapply(levels(y), function(l){
i_set <- y %in% l
cc_set <- cc[i_set, , drop=FALSE]
G_set <- G[i_set, , drop=FALSE]
if( nrow(G_set) <= ncol(cc_set) ){
stop("csSAM - Insufficient sample size [", nrow(G_set), "]"
, " in group '", l, "':"
, " must be >= ", ncol(cc) + 1, " (i.e. number of cell types + 1).")
}

csfit(cc_set, G_set, logRm = logRm, logBase = logBase)
}, simplify = FALSE)
vmessage('OK')

# early exit if only one group
if( length(deconv) == 1L ){
res_object <- list(stats = NULL)
res_object\$coefficients <- coef(deconv[[1L]])
res_object\$residuals <- residuals(deconv[[1L]])
res_object\$se <- deconv[[1L]]\$se
return(res_object)
}

#rhat <- array(dim = c(numcell,numgene))
csSAMfit <- sapply(2:length(deconv), function(i){
vmessage(sprintf("Computing csSAM model statistics [%s vs %s] ... ", names(n)[i], names(n)[1L]), appendLF=FALSE)
csSAMfit <- csSAM(coef(deconv[[1L]]), deconv[[1L]]\$se, n[1L]
, coef(deconv[[i]]), deconv[[i]]\$se, n[i]
, standardize = standardize, medianCenter = medianCenter, nonNeg = nonNeg, stat.only = stat.only)
#          colnames(csSAMfit\$stat) <- colnames(G)
vmessage("OK")
csSAMfit
}, simplify = FALSE)
if( nlevels(y) == 2L ) csSAMfit <- csSAMfit[[1L]]
if( !stat.only ) csSAMfit\$csfit <- deconv
}

# return result
csSAMfit
}
```
shenorrLab/csSAM documentation built on May 29, 2019, 9:23 p.m.