R/NormMixClus-functions.R
In coseq: Co-Expression Analysis of Sequencing Data

Documented in NormMixClus NormMixClusK NormMixParam

#' Normal mixture model estimation and selection for a series of cluster numbers
#'
#' Perform co-expression and co-abudance analysis of high-throughput
#' sequencing data, with or without data transformation, using a Normal
#' mixture models. The output of \code{NormMixClus} is an S4 object of
#' class \code{RangedSummarizedExperiment}.
#'
#' @param y_profiles (\emph{n} x \emph{q}) matrix of observed profiles for \emph{n}
#' observations and \emph{q} variables
#' @param K Number of clusters (a single value or a sequence of values).
#' @param subset Optional vector providing the indices of a subset of
#' genes that should be used for the co-expression analysis (i.e., row indices
#' of the data matrix \code{y}.
#' @param parallel If \code{FALSE}, no parallelization. If \code{TRUE}, parallel
#' execution using BiocParallel (see next argument \code{BPPARAM}). A note on running
#' in parallel using BiocParallel: it may be advantageous to remove large, unneeded objects
#' from the current R environment before calling the function, as it is possible that R's
#' internal garbage collection will copy these files while running on worker nodes.
#' @param BPPARAM Optional parameter object passed internally to \code{bplapply} when
#' \code{parallel=TRUE}. If not specified, the parameters last registered with \code{register}
#' will be used.
#' @param seed If desired, an integer defining the seed of the random number generator. If
#' \code{NULL}, a random seed is used.
#' @param ... Additional optional parameters to be passed to \code{\link{NormMixClusK}}.
#'
#' @return
#' An S4 object of class \code{coseqResults}, with conditional
#' probabilities of cluster membership for each gene in each model stored as a list of assay
#' data, and corresponding {log likelihood, ICL value, number of
#' clusters, and form of Gaussian model} for each model stored as metadata.
#'
#' @author Andrea Rau, Cathy Maugis-Rabusseau
#'
#' @example inst/examples/NormMixClus.R
#' @importFrom BiocParallel bplapply
#' @importFrom BiocParallel bpparam
#' @export

NormMixClus <- function(y_profiles, K, subset=NULL, parallel=TRUE, BPPARAM=bpparam(), 
                        seed = NULL, ...){

  subset.index <- subset

  ## Parse ellipsis function
  providedArgs <- list(...)
  arg.user <- list(alg.type="EM", init.runs=50, init.type="small-em", init.iter=20,
                       iter=1000, cutoff=0.001, GaussianModel="Gaussian_pk_Lk_Ck",
                       verbose=TRUE, digits=3)
  arg.user[names(providedArgs)] <- providedArgs

  y_profiles <- as.data.frame(y_profiles)
  ## In case only a subset of data are to be used for analysis
  if(!is.null(subset.index)) {
    y_profiles <- y_profiles[subset.index,]
  }

  all.results <- vector("list", length = length(K))
  names(all.results) <- paste0("K=", K)
  if(arg.user$verbose) {
    cat("Running K =", min(K), "...\n")
  }
  all.results[[1]] <- suppressWarnings(NormMixClusK(y_profiles, K=min(K), alg.type=arg.user$alg.type,
                                                     init.runs=arg.user$init.runs,
                                                     init.type=arg.user$init.type,
                                                     init.iter=arg.user$init.iter,
                                                     iter=arg.user$iter,
                                                     cutoff=arg.user$cutoff,
                                                     GaussianModel=arg.user$GaussianModel,
                                                     digits=arg.user$digits, seed=seed))
  index <- 2
  remainingK <- K[-which.min(K)]
  if (length(remainingK)) {
    ## In the case where parallelization is NOT used
    if(!parallel) {
      for (k in remainingK) {
        if(arg.user$verbose) {
          cat("Running K =", k, "...\n")
        }
        all.results[[index]] <- suppressWarnings(NormMixClusK(y_profiles=y_profiles, K=k,
                                                               alg.type=arg.user$alg.type,
                                                               init.runs=arg.user$init.runs,
                                                               init.type=arg.user$init.type,
                                                               init.iter=arg.user$init.iter,
                                                               iter=arg.user$iter,
                                                               cutoff=arg.user$cutoff,
                                                               GaussianModel=arg.user$GaussianModel,
                                                               digits=arg.user$digits,
                                                              seed=seed))
        index <- index + 1
      }
      ## In the case where parallelization IS used
    } else if(parallel) {
      tmp <- bplapply(remainingK, function(ii, N_y_profiles, N_alg.type, N_init.runs,
                                           N_init.type, N_init.iter, N_iter, N_cutoff,
                                           N_GaussianModel, N_digits, N_verbose, N_FUNC, N_seed) {
        if(N_verbose == TRUE) {
          cat("Running K =", ii, "...\n")
        }
        res <- suppressWarnings(N_FUNC(y_profiles=N_y_profiles, K=as.numeric(ii),
                                              alg.type=N_alg.type, init.runs=N_init.runs,
                                              init.type=N_init.type, init.iter=N_init.iter,
                                              iter=N_iter, cutoff=N_cutoff,
                                              GaussianModel=N_GaussianModel, digits=N_digits,
                                       seed=N_seed))

        return(res)},
        N_y_profiles=y_profiles, N_alg.type=arg.user$alg.type, N_init.runs = arg.user$init.runs,
        N_init.type=arg.user$init.type, N_init.iter=arg.user$init.iter, N_iter=arg.user$iter,
        N_cutoff=arg.user$cutoff, N_GaussianModel=arg.user$GaussianModel, N_digits=arg.user$digits,
        N_verbose=arg.user$verbose, N_FUNC=NormMixClusK, N_seed = seed,
        BPPARAM=BPPARAM)
      if(!sum(unlist(lapply(tmp, nrow)))) {
        stop(paste("All models of form", arg.user$GaussianModel, "resulted in estimation errors.
This is likely due to singular covariance matrices when this form of Gaussian mixture is used.
Try rerunning coseq with either a spherical (pk_Lk_Bk) or diagonal (pk_Lk_I)
covariance matrix form instead:

coseq(..., GaussianModel = \"Gaussian_pk_Lk_Bk\")
coseq(..., GaussianModel = \"Gaussian_pk_Lk_I\")"))
      }
      Kmods <- paste0("K=", unlist(lapply(tmp, function(x) nbCluster(x))))
      all.results[match(Kmods, names(all.results))] <- tmp[which(unlist(lapply(tmp, function(xx)
        length(nbCluster(xx)) != 0)) == TRUE)]
    }
  }

  if(!sum(unlist(lapply(all.results, nrow)))) {
    stop(paste("All models of form", arg.user$GaussianModel, "resulted in estimation errors.
This is likely due to singular covariance matrices when this form of Gaussian mixture is used.
Try rerunning coseq with either a spherical (pk_Lk_Bk) or diagonal (pk_Lk_I)
covariance matrix form instead:

coseq(..., GaussianModel = \"Gaussian_pk_Lk_Bk\")
coseq(..., GaussianModel = \"Gaussian_pk_Lk_I\")"))
  }

  nbClust.all <- unlist(lapply(all.results, function(x) nbCluster(x)))
  logLike.all <- unlist(lapply(all.results, function(x) likelihood(x)))
  ICL.all <- unlist(lapply(all.results, function(x) ICL(x)))
  ICL.choose <- names(ICL.all)[which.min(ICL.all)]
  select.results <- all.results[[ICL.choose]]

  pp <- lapply(all.results, function(x) {
    if(is.null(x)) NULL
    if(!is.null(x)) assay(x)
    })
  names(pp) <- names(all.results)

  select.results <- SummarizedExperiment(assay(select.results),
                                         metadata=list(nbCluster=nbClust.all,
                                                       logLike=logLike.all,
                                                       ICL=ICL.all,
                                                       nbClusterError=K[!K %in% nbClust.all],
                                                       GaussianModel=arg.user$GaussianModel))
  
  if(sum(is.nan(assay(select.results))) == prod(dim(assay(select.results)))) {
    stop(paste0("The selected model, which is of the form ", arg.user$GaussianModel, ", resulted in estimation errors.
This is likely due to singular covariance matrices when this form of Gaussian mixture is used.
Try rerunning coseq with either a spherical (pk_Lk_Bk) or diagonal (pk_Lk_I)
covariance matrix form instead:

coseq(..., GaussianModel = \"Gaussian_pk_Lk_Bk\")
coseq(..., GaussianModel = \"Gaussian_pk_Lk_I\")"))
  }
  
  RES <- coseqResults(as(select.results, "RangedSummarizedExperiment"), allResults=pp)
  return(RES)
}


#' Normal mixture model estimation
#'
#' Perform co-expression and co-abudance analysis of high-throughput
#' sequencing data, with or without data transformation, using a Normal
#' mixture models for single number of clusters \emph{K}.
#' The output of \code{NormMixClusK} is an S4 object of
#' class \code{RangedSummarizedExperiment}.
#'
#' @param y_profiles y (\emph{n} x \emph{q}) matrix of observed profiles for \emph{n}
#' observations and \emph{q} variables
#' @param K Number of clusters (a single value).
#' @param init.type Type of initialization strategy to be used:
#' \dQuote{\code{small-em}} for the Small-EM strategy, \dQuote{\code{random}}, \dQuote{\code{CEM}},
#' or \dQuote{\code{SEMMax}}
#' @param GaussianModel One of the 28 forms of Gaussian models defined in Rmixmod,
#' by default equal to the \code{"Gaussian_pk_Lk_Ck"} (i.e., a general family model with free
#' proportions, free volume, free shape, and free orientation)
#' @param init.runs Number of runs to be used for the Small-EM strategy, with a default value of 50
#' @param init.iter Number of iterations to be used within each run for the
#' Small-EM strategry, with a default value of 20
#' @param alg.type Algorithm to be used for parameter estimation:
#' \dQuote{\code{EM}}, \dQuote{\code{CEM}}, \dQuote{\code{SEM}}
#' @param cutoff Cutoff to declare algorithm convergence
#' @param iter Maximum number of iterations to be run for the chosen algorithm
#' @param verbose If \code{TRUE}, verbose output is created
#' @param digits Integer indicating the number of decimal places to be used for the
#' \code{probaPost} output
#' @param seed If desired, an integer defining the seed of the random number generator. If
#' \code{NULL}, a random seed is used.
#'
#' @return
#' An S4 object of class \code{RangedSummarizedExperiment}, with conditional
#' probabilities of cluster membership for each gene stored as assay data, and
#' {log likelihood, ICL value, number of
#' clusters, and form of Gaussian model} stored as metadata.
#'
#' @author Cathy Maugis-Rabusseau, Andrea Rau
#'
#' @importFrom Rmixmod mixmodGaussianModel
#' @importFrom Rmixmod mixmodStrategy
#' @importFrom Rmixmod mixmodCluster
#' @example inst/examples/NormMixClus.R
#'
#' @export

NormMixClusK <- function(y_profiles, K, alg.type="EM", init.runs=50,
                         init.type="small-em", GaussianModel="Gaussian_pk_Lk_Ck",
                         init.iter=20, iter=1000, cutoff=0.001, verbose=TRUE, digits=3,
                         seed = NULL) {

  if(!is.data.frame(y_profiles)) y_profiles <- as.data.frame(y_profiles)

  models <- mixmodGaussianModel(listModels=c(GaussianModel))
  # strategy
  if(init.type == "small-em") init.type <- "smallEM"
  strats<-mixmodStrategy(algo=alg.type, nbTry = 1,
                         initMethod = init.type, nbTryInInit = init.runs,
                         nbIterationInInit = init.iter, nbIterationInAlgo = iter,
                         epsilonInInit = cutoff, epsilonInAlgo = cutoff)
  if(is.null(seed)) {
    xem <- mixmodCluster(y_profiles, nbCluster=K, models=models, strategy=strats,
                         criterion="ICL")
  } else {
    xem <- mixmodCluster(y_profiles, nbCluster=K, models=models, strategy=strats,
                         criterion="ICL", seed = seed) 
  }
  pp <- round(proba(xem), digits=digits )
  if(ncol(pp)) {
    colnames(pp) <- paste0("Cluster_", seq_len(ncol(pp)))
    if(!is.null(rownames(y_profiles))) rownames(pp) <- rownames(y_profiles)
    if(is.null(rownames(y_profiles))) rownames(pp) <- seq_len(nrow(pp))
  }

  res <- SummarizedExperiment(assays=pp,
                              metadata=list(logLike=likelihood(xem),
                              ICL=ICL(xem),
                              nbCluster=nbCluster(xem),
                              GaussianModel=GaussianModel))
  return(as(res, "RangedSummarizedExperiment"))
}


#' Calculate the mean and covariance for a Normal mixture model
#'
#' Calculates the mean and covariance parameters for a normal mixture model
#' of the form pK_Lk_Ck
#'
#' @param coseqResults Object of class \code{coseqResults} or \code{RangedSummarizedExperiment}
#' (as output from the \code{NormMixClus} or \code{NormMixClusK} functions)
#' @param y_profiles y (\emph{n} x \emph{q}) matrix of observed profiles for \emph{n}
#' observations and \emph{q} variables, required for \code{x} of class \code{RangedSummarizedExperiment}
#' @param K The model used for parameter estimation for objects \code{x} of
#' class \code{coseq} or \code{NormMixClus}. When \code{NULL}, the model selected
#' by the ICL criterion is used; otherwise, \code{K} should designate the number
#' of clusters in the desired model
#' @param digits Integer indicating the number of decimal places to be used for output
#' @param plot If \code{true}, produce heatmaps to visualize the estimated per-cluster
#' correlation matrices
#' @param ... Additional optional parameters to pass to \code{corrplot}, if desired
#'
#' @return
#' \item{pi }{ Vector of dimension \emph{K} with the estimated cluster proportions from
#' the Gaussian mixture model, where \emph{K} is the number of clusters}
#' \item{mu }{ Matrix of dimension \emph{K} x \emph{d} containing the estimated mean
#' vector from the Gaussian mixture model, where \emph{d} is the
#' number of samples in the data \code{y_profiles} and \emph{K} is the number of clusters}
#' \item{Sigma }{ Array of dimension \emph{d} x \emph{d} x \emph{K} containing the
#' estimated covariance matrices from the Gaussian mixture model, where \emph{d} is the
#' number of samples in the data \code{y_profiles} and \emph{K} is the number of clusters}
#' \item{rho }{ Array of dimension \emph{d} x \emph{d} x \emph{K} containing the
#' estimated correlation matrices from the Gaussian mixture model, where \emph{d} is the
#' number of samples in the data \code{y_profiles} and \emph{K} is the number of clusters}
#'
#' @author Andrea Rau, Cathy Maugis-Rabusseau
#'
#' @example inst/examples/NormMixClus.R
#' @export
#' @importFrom stats cov2cor
#' @importFrom grDevices n2mfrow
#' @importFrom graphics mtext par

NormMixParam <- function(coseqResults, y_profiles=NULL, K=NULL, digits=3, plot=FALSE, ...) {
  x <- coseqResults
  ## TODO: object could be data.frame or matrix
  if (!is(x, "coseqResults") & !is(x, "RangedSummarizedExperiment")) {
    stop(paste0(sQuote("coseqResults")), " must be of class ",
         paste0(dQuote("coseqResults")), " or ", paste0(dQuote("RangedSummarizedExperiment")))
  }
  if(is(x, "coseqResults")) {
    if(nrow(tcounts(x))>0) y_profiles <- as.matrix(as.data.frame(tcounts(x)))
    if(!nrow(tcounts(x)) & is.null(y_profiles)) stop("y_profiles argument needed.")
    if(is.null(K)) {
      probaPost <- assay(x)
    } else {
      if(K == "ICL") probaPost <- assay(x)
      if(K != "ICL") probaPost <- coseqFullResults(x)[[paste0("K=", K)]]
    }
    GaussianModel <- metadata(x)$GaussianModel
  }
  if(is(x, "RangedSummarizedExperiment")) {
    if(is.null(y_profiles)) stop(paste0(dQuote("y_profiles")), " must
                                         be included for class ",
                                         paste0(dQuote("RangedSummarizedExperiment")))
    probaPost <- assay(x)
    GaussianModel <- metadata(x)$GaussianModel
  }

  if(GaussianModel != "Gaussian_pk_Lk_Ck")
    stop("Recalculation of Gaussian parameters currently only supported for",
         paste(dQuote("Gaussian_pk_Lk_Ck")))

  pi <- apply(probaPost,2,sum) / nrow(y_profiles)
  mu <- matrix(0, nrow=ncol(probaPost), ncol=ncol(y_profiles))
  Sigma <- array(0, dim=c(ncol(y_profiles), ncol(y_profiles), ncol(probaPost)))
  for (k in seq_len(ncol(probaPost))) {
    mu[k,]<-apply(probaPost[,k] * y_profiles,2,sum) / sum(probaPost[,k])
    Sigma[,,k] <- (t(y_profiles) - mu[k,]) %*% ( t(t(y_profiles) - mu[k,]) * probaPost[,k])
    Sigma[,,k] <- Sigma[,,k] / sum(probaPost[,k])
  }
  rho <- lapply(seq_len(ncol(probaPost)), function(xx) cov2cor(Sigma[,,xx]))
  rho <- array(unlist(rho), dim = c(dim(rho[[1]]), length(rho)))

  dimnames(rho) <- list(colnames(y_profiles), colnames(y_profiles),
                        paste("Cluster", seq_len(ncol(probaPost))))
  dimnames(Sigma) <- list(colnames(y_profiles), colnames(y_profiles),
                          paste("Cluster", seq_len(ncol(probaPost))))
  colnames(mu) <- colnames(y_profiles)
  rownames(mu) <- paste("Cluster", seq_len(ncol(probaPost)))

  if(plot) {
    par(mfrow=n2mfrow(ncol(probaPost)))
    for(kk in seq_len(ncol(probaPost))) {
      corrplot(rho[,,kk],  method="ellipse", type="upper",
               tl.pos="d", ...)
      mtext(paste("K =", kk), side=2, las=1, line=-1)
    }
  }

  param <- list(pi=round(pi, digits), mu=round(mu, digits), Sigma=round(Sigma, digits),
                rho=round(rho, digits))
  return(param)
}