scone: Single Cell Overview of Normalized Expression data

Documented in lm_adjust make_design .parse_row

#' @rdname get_params
#'   
#' @param x an object of class \code{\link{SconeExperiment}}.
#'   
#' @return A data.frame containing workflow parameters for each scone workflow.
#'   
#' @export
#' 
setMethod(
  f = "get_params",
  signature = signature(x = "SconeExperiment"),
  definition = function(x) {
    return(x@scone_params)
  })

#' @rdname get_scores
#'   
#' @param x an object of class \code{\link{SconeExperiment}}.
#'   
#' @return \code{get_scores} returns a matrix with all (non-missing) scone 
#'   scores, ordered by average score rank.
#'   
#' @export
#' 
setMethod(
  f = "get_scores",
  signature = signature(x = "SconeExperiment"),
  definition = function(x) {
      scores <- t(na.omit(t(x@scone_scores[,-NCOL(x@scone_scores)])))
      return(scores)
})

#' @rdname get_scores
#'   
#' @return \code{get_score_ranks} returns a vector of average score ranks.
#'   
#' @export
#' 
setMethod(
  f = "get_score_ranks",
  signature = signature(x = "SconeExperiment"),
  definition = function(x) {
    return(x@scone_scores[,NCOL(x@scone_scores)])
  })


#' @rdname get_negconruv
#'   
#' @param x an object of class \code{\link{SconeExperiment}}.
#'   
#' @return NULL or a logical vector.
#'   
#' @return For \code{get_negconruv} the returned vector indicates which genes
#'   are negative controls to be used for RUV.
#'   
#' @export
#' 
setMethod(
  f = "get_negconruv",
  signature = signature(x = "SconeExperiment"),
  definition = function(x) {
    if(length(x@which_negconruv) == 0) {
      return(NULL)
    } else {
      return(rowData(x)[,x@which_negconruv])
    }
  }
)

#' @rdname get_negconruv
#'   
#' @return For \code{get_negconeval} the returned vector indicates which genes
#'   are negative controls to be used for evaluation.
#'
#' @export
#' 
setMethod(
  f = "get_negconeval",
  signature = signature(x = "SconeExperiment"),
  definition = function(x) {
    if(length(x@which_negconeval) == 0) {
      return(NULL)
    } else {
      return(rowData(x)[,x@which_negconeval])
    }
  }
)

#' @rdname get_negconruv
#'   
#' @return For \code{get_poscon} the returned vector indicates which genes are 
#'   positive controls to be used for evaluation.
#'
#' @export
#'
setMethod(
  f = "get_poscon",
  signature = signature(x = "SconeExperiment"),
  definition = function(x) {
    if(length(x@which_poscon) == 0) {
      return(NULL)
    } else {
      return(rowData(x)[,x@which_poscon])
    }
  }
)

#' @rdname get_qc
#'   
#' @param x an object of class \code{\link{SconeExperiment}}.
#'   
#' @return NULL or the quality control (QC) metric matrix.
#'
#' @export
#'
setMethod(
  f = "get_qc",
  signature = signature(x = "SconeExperiment"),
  definition = function(x) {
    if(length(x@which_qc) == 0) {
      return(NULL)
    } else {
      retval <- as.matrix(colData(x)[, x@which_qc, drop=FALSE])
      return(retval)
    }
  }
)

#' @rdname get_bio
#'   
#' @param x an object of class \code{\link{SconeExperiment}}.
#'   
#' @return NULL or a factor containing bio or batch covariate.
#'
#' @export
#'
setMethod(
  f = "get_bio",
  signature = signature(x = "SconeExperiment"),
  definition = function(x) {
    if(length(x@which_bio) == 0) {
      return(NULL)
    } else {
      return(colData(x)[,x@which_bio])
    }
  }
)

#' @rdname get_bio
#'
#' @export
#'
setMethod(
  f = "get_batch",
  signature = signature(x = "SconeExperiment"),
  definition = function(x) {
    if(length(x@which_batch) == 0) {
      return(NULL)
    } else {
      return(colData(x)[,x@which_batch])
    }
  }
)

#' Parse rows
#' 
#' This function is used internally in scone to parse the variables used to
#' generate the design matrices.
#' 
#' @param pars character. A vector of parameters corresponding to a row of
#'   workflow parameters.
#' @param bio factor. The biological covariate.
#' @param batch factor. The batch covariate.
#' @param ruv_factors list. A list containing the factors of unwanted variation
#'   (RUVg) for all upstream workflows.
#' @param qc matrix. The principal components of the QC metric matrix.
#'   
#' @return A list with the variables to be passed to make_design.
#'   
#' @keywords internal
#'   
.parse_row <- function(pars, bio, batch, ruv_factors, qc) {
  
  # Define upstream workflow: imputation x scaling
  sc_name <- paste(pars[1:2], collapse="_")

  W <- out_bio <- out_batch <- NULL
  
  if(pars[3]!="no_uv") {
    parsed <- strsplit(as.character(pars[3]), "=")[[1]]
    if(grepl("ruv", parsed[1])) {
      W <- ruv_factors[[sc_name]][,seq_len(as.numeric(parsed[2]))]
    } else {
      W <- qc[,seq_len(as.numeric(parsed[2]))]
    }
  }

  if(pars[4]=="bio") {
    out_bio <- bio
  }

  if(pars[5]=="batch") {
    out_batch <- batch
  }

  return(list(sc_name=sc_name, W=W, bio=out_bio, batch=out_batch))
}

#' Make a Design Matrix
#' 
#' This function builds a design matrix for the Adjustment Normalization Step,
#' in which covariates are two (possibly nested) categorical factors and one or
#' more continuous variables.
#' 
#' @details If nested=TRUE a nested design is used, i.e. the batch variable is
#'   assumed to be nested within the bio variable. Here, nested means that each
#'   batch is composed of samples from only *one* level of bio, while each
#'   level of bio may contain multiple batches.
#'   
#' @export
#' 
#' @param bio factor. The biological covariate.
#' @param batch factor. The batch covariate.
#' @param W numeric. Either a vector or matrix containing one or more
#'   continuous covariates (e.g. RUVg factors).
#' @param nested logical. Whether or not to consider a nested design
#'   (see details).
#'   
#' @return The design matrix.
#'   
#' @examples 
#' 
#' bio = as.factor(rep(c(1,2),each = 2))
#' batch = as.factor(rep(c(1,2),2))
#' design_mat = make_design(bio,batch, W = NULL)
#' 
make_design <- function(bio, batch, W, nested=FALSE) {
  
  if(nested & (is.null(bio) | is.null(batch))) {
    stop("Nested design can be used only if both batch and bio are specified.")
  }
  
  if(!is.null(bio)) {
    if(class(bio)!="factor") {
      stop("bio must be a factor.")
    }
  }
  
  if(!is.null(batch)){
    if(class(batch)!="factor") {
      stop("batch must be a factor.")
    }
  }

  f <- "~ 1"
  if(!is.null(bio)) {
    f <- paste(f, "bio", sep="+")
  }
  if(!is.null(batch)) {
    f <- paste(f, "batch", sep="+")
  }
  if(!is.null(W)) {
    f <- paste(f, "W", sep="+")
  }

  if(is.null(bio) & is.null(batch) & is.null(W)) {
    return(NULL)
  } else if (!is.null(bio) & !is.null(batch) & nested) {
    
    n_vec <- tapply(batch, bio, function(x) nlevels(droplevels(x)))

    mat = matrix(0,nrow = sum(n_vec),ncol = sum(n_vec - 1))
    xi = 1
    yi = 1
    for(i in 1:length(n_vec)){
      if(n_vec[i] > 1){
        cs = contr.sum(n_vec[i])
        dd = dim(cs)
        mat[xi:(xi + dd[1] - 1),yi:(yi + dd[2] - 1)] = cs
        xi = xi + dd[1]
        yi = yi + dd[2]
      }else{
        xi = xi + 1
      }
    }

    return(model.matrix(as.formula(f), 
                        contrasts=list(bio=contr.sum, batch=mat)))
  } else {
    return(model.matrix(as.formula(f)))
  }
}

#' Linear Adjustment Normalization
#' 
#' Given a matrix with log expression values and a design matrix, this function
#' fits a linear model and removes the effects of the batch factor
#' as well as of the linear variables encoded in W.
#' 
#' @details The function assumes that the columns of the design matrix
#'   corresponding to the variable for which expression needs to be adjusted,
#'   start with either the word "batch" or the letter "W" (case sensitive). Any
#'   other covariate (including the intercept) is kept.
#'   
#' @importFrom limma lmFit
#' @export
#' 
#' @param log_expr matrix. The log gene expression (genes in row, samples in
#'   columns).
#' @param design_mat matrix. The design matrix (usually the result of
#'   make_design).
#' @param batch factor. A factor with the batch information, identifying batch
#'   effect to be removed.
#' @param weights matrix. A matrix of weights.
#' @return The corrected log gene expression.
#'   
#' @examples
#' 
#' set.seed(141)
#' bio = as.factor(rep(c(1,2),each = 2))
#' batch = as.factor(rep(c(1,2),2))
#' design_mat = make_design(bio,batch, W = NULL)
#' 
#' log_expr = matrix(rnorm(20),ncol = 4)
#' adjusted_log_expr = lm_adjust(log_expr = log_expr,
#'   design_mat = design_mat,
#'   batch = batch)
#' 
lm_adjust <- function(log_expr, design_mat, batch=NULL, weights=NULL) {
  lm_object <- lmFit(log_expr, design = design_mat, weights = weights)

  uvind <- grep("^W", colnames(design_mat))
  bind <- grep("^batch", colnames(design_mat))

  if(length(uvind)) {
    uv_term <- t(design_mat[,uvind] %*% t(lm_object$coefficients[,uvind]))
  } else {
    uv_term <- 0
  }

  if(length(bind)) {
    if(is.character(attr(design_mat,"contrasts")$batch)) {
      contr <- get(attr(design_mat,"contrasts")$batch)(nlevels(batch))
    } else {
      contr <- attr(design_mat,"contrasts")$batch
    }
    batch_term <- t(contr %*% t(lm_object$coefficients[,bind]))[,batch]
  } else {
    batch_term <- 0
  }

  return(log_expr - batch_term  - uv_term)
}