R/MultiBaC.R

Defines functions genMissingOmics genModelList batchCorrection MultiBaC

Documented in batchCorrection genMissingOmics genModelList MultiBaC

#' @import ggplot2
#' @include auxfunctions.R
#' @include ASCA2f.R
NULL

#' MultiBaC
#'
#' MultiBaC performs a multi-omic, multi-batch correction
#'
#'
#' @docType package
#' @name MultiBaC
NULL
#> NULL


#' MultiBaC
#'
#' MultiBaC is a strategy to correct batch effects from multiomic datasets distributed across different labs or data acquisition events.
#' MultiBaC is the first Batch effect correction algorithm that dealing with batch effect correction in multiomics datasets.
#' MultiBaC is able to remove batch effects across different omics generated within separate batches provided that at least one common
#' omic data type is included in all the batches considered.
#'
#' @param mbac mbac object generated by createMbac.
#' @param test.comp Maximum number of components allowed for PLS models. If NULL (default), the minimal effective rank of the matrices is used as the maximum number of components.
#' @param scale Logical. Whether X and Y matrices must be scaled. By default, FALSE.
#' @param center Logical. Whether X and Y matrices must be centered. By default, TRUE.
#' @param crossval Integer: number of cross-validation segments. The number of samples (rows of 'x') must be at least >= crossvalI. If NULL (default), a leave-one-out crossvalidation is performed.
#' @param Interaction Logical. Whether to model the interaction between experimental factors and bacth factor in ARSyN models. By default, FALSE.
#' @param Variability From 0 to 1. Minimum percent of data variability that must be explained by each ARSyN model. By default, 0.90.
#' @param showplot Logical. If TRUE (default), the Q2 and the explained variance plots are shown.
#' @param showinfo Logical. If TRUE (default), the information about the function progress is shown.
#'
#' @return Custom mbac object. Elements in a mbac object:
#' \enumerate{
#'     \item ListOfBatches: A list of MultiAssayExperiment objects (one per batch).
#'     \item commonOmic: Name of the common omic between the batches.
#'     \item CorrectedData: Same structure than ListOfBatches but with the corrected data instead of the original.
#'     \item PLSmodels: PLS models created during MultiBaC method performance (one model per non-common omic data type).
#'     \item ARSyNmodels: ARSyN models created during MultiBaC performance (one per omic data type).
#'     \item InnerRelation: Table of class data.frame containing the inner correlation (i.e. correlation between the scores of X (t) and Y (u) matrices) for each PLS model across all components.
#' }
#'
#' @references
#' Ugidos, M., Tarazona, S., Prats-Montalbán, J. M., Ferrer, A., & Conesa, A. (2020). MultiBaC: A strategy to remove batch effects between different omic data types. Statistical Methods in Medical Research. https://doi.org/10.1177/0962280220907365
#'
#' @export
#'
#'
#' @examples
#' data('multiyeast')
#'
#' my_mbac <- createMbac (inputOmics = list(A.rna, A.gro, B.rna, B.ribo, C.rna, C.par),
#'                        batchFactor = c("A", "A", "B", "B", "C", "C"),
#'                        experimentalDesign = list("A" =  c("Glu+", "Glu+",
#'                        "Glu+", "Glu-", "Glu-", "Glu-"),
#'                        "B" = c("Glu+", "Glu+", "Glu-", "Glu-"),
#'                        "C" = c("Glu+", "Glu+", "Glu-", "Glu-")),
#'                        omicNames = c("RNA", "GRO", "RNA", "RIBO", "RNA", "PAR"),
#'                        commonOmic = "RNA")
#'
#' my_final_mbac <- MultiBaC (my_mbac,
#'                            test.comp = NULL, scale = FALSE,
#'                            center = TRUE, crossval = NULL,
#'                            Variability = 0.90,
#'                            Interaction = TRUE ,
#'                            showplot = FALSE,
#'                            showinfo = FALSE)
#'
MultiBaC <- function(mbac, test.comp = NULL,
                     scale = FALSE, center = TRUE,
                     showplot = TRUE, crossval = NULL,
                     Interaction = FALSE,
                     Variability = 0.90,
                     showinfo = TRUE) {

  # Input data structure ------------------------------------------------------
  if (is.null(names(mbac$ListOfBatches))) {
    stop ("Stop at input evaluation: Elements in ListOfBatches must be named")
  }

  # Create PLS models ---------------------------------------------------------
  if (showinfo) {
    message("Step 1: Create PLS models")
  }
  mbac <- genModelList(mbac, test.comp = test.comp,
                                 scale = scale, center = center,
                                 crossval = crossval,
                                 showinfo = showinfo)

  # Generate missing omics ----------------------------------------------------
  if (showinfo) {
    message("Step 2: Generating missing omics")
  }
  multiBatchDesign <- genMissingOmics(mbac)

  # Batch effect correction using ARSyNbac ---------------------------------------
  if (showinfo) {
    message("Step 3: Batch effect correction using ARSyNbac")
  }

  # Correction step
  mbac <- batchCorrection (mbac, multiBatchDesign, Interaction, Variability)

  # Inner relation of PLS model table  -----------------------------------------
  showM <- mbac$InnerRelation

  print("Table: Inner correlation between scores for each PLS model computed.")

  k1 <- knitr::kable(showM, format = "pandoc")
  if (showinfo) {
    print(k1)
  }

  # Q2 and explained batch-related variability plots ---------------------------
  if(showplot) {
    plot.mbac (mbac, plotType = "def")
  }

  # Update mbac object ---------------------------------------------------------
  return (mbac)
}


#' batchCorrection
#'
#' This function performs the ARSyNbac correction [1] for each omic contained in mulBatchDesign input object.
#'
#' @param mbac mbac object generated by createMbac. PLS models slot must be present.
#' @param multiBatchDesign A list containing the original and predicted omic for each batch. All omics must be present in every batch. Output object of genMissingOmics function
#' @param Interaction Logical. Whether to model the interaction between experimental factors and bacth factor in ARSyN models. By default, FALSE.
#' @param Variability From 0 to 1. Minimum percent of data variability that must be explained for each ARSyN model. By default, 0.90.
#'
#'
#' @return Custom mbac object. Elements in a mbac object:
#' \enumerate{
#'     \item ListOfBatches: A list of MultiAssayExperiment objects (one per batch).
#'     \item commonOmic: Name of the common omic between the batches.
#'     \item CorrectedData: Same structure than ListOfBatches but with the corrected data instead of the original.
#'     \item PLSmodels: PLS models created during MultiBaC method performance (one model per non-common omic data type).
#'     \item ARSyNmodels: ARSyN models created during MultiBaC performance (one per omic data type).
#'     \item InnerRelation: Table of class data.frame containing the inner correlation (i.e. correlation between the scores of X (t) and Y (u) matrices) for each PLS model across all components.
#' }
#'
#' @export
#' @references [1] Nueda MJ, Ferrer A, Conesa A. ARSyN: A method for the identification and removal of systematic noise in multifactorial time course microarray experiments. Biostatistics. 2012;13(3):553-566. doi:10.1093/biostatistics/kxr042
#' @examples
#' data('multiyeast')
#'
#' my_mbac <- createMbac (inputOmics = list(A.rna, A.gro, B.rna, B.ribo, C.rna, C.par),
#'                        batchFactor = c("A", "A", "B", "B", "C", "C"),
#'                        experimentalDesign = list("A" =  c("Glu+", "Glu+", "Glu+",
#'                        "Glu-", "Glu-", "Glu-"),
#'                        "B" = c("Glu+", "Glu+", "Glu-", "Glu-"),
#'                        "C" = c("Glu+", "Glu+", "Glu-", "Glu-")),
#'                        omicNames = c("RNA", "GRO", "RNA", "RIBO", "RNA", "PAR"),
#'                        commonOmic = "RNA")
#'
#' my_mbac_2 <- genModelList (my_mbac, test.comp = NULL,
#'                            scale = FALSE, center = TRUE,
#'                            crossval = NULL,
#'                            showinfo = TRUE)
#' multiBatchDesign <- genMissingOmics(my_mbac_2)
#' my_finalwise_mbac <- batchCorrection(my_mbac_2,
#'                                      multiBatchDesign = multiBatchDesign,
#'                                      Interaction = FALSE,
#'                                      Variability = 0.9)
#'

batchCorrection <- function(mbac, multiBatchDesign,
                            Interaction = FALSE,
                            Variability = 0.90) {

  # Bulit cond.factor ----------------------------------------------------------
  cond.factor <- lapply(multiBatchDesign, function(x) {
    x@colData[[1]]
  })

  inputList <- getData(multiBatchDesign)
  # Making model matrices ------------------------------------------------------
  cond_factor <- c()
  for ( i in seq_along(cond.factor) ) {
    cond_factor <- c(cond_factor, cond.factor[[i]])
  }
  cond_matrix <- stats::model.matrix(~ 0 + factor(cond_factor))

  batch_factor <- c()
  for ( i in seq_along(multiBatchDesign) ) {
    batch_factor <- c(batch_factor, rep(i, dim(inputList[[i]][[1]])[2]))
  }
  batch_matrix <- stats::model.matrix(~ 0 + factor(batch_factor))

  # Omic correction ------------------------------------------------------------
  correctedOmics <- inputList
  ascamodels <- list()
  for ( omic in names(inputList[[1]])) {
    Xunfold <- NULL
    set_lims <- 0
    for ( lab in names(inputList)) {
      Xunfold <- rbind(Xunfold, t(inputList[[lab]][[omic]]))
      set_lims <- c(set_lims, (set_lims[length(set_lims)])+dim(t(inputList[[lab]][[omic]]))[1])
    }

    # Defining Fac ---------------------------------------------------------------
    Fac0 <- c(1,2,2)
    if (Interaction) {
      names(Fac0) <- c("Model.a", "Model.bab", "Model.res")
    } else {
      names(Fac0) <- c("Model.a", "Model.b", "Model.res")
    }
    asca0 <- ASCA.2f(Xunfold, Designa = cond_matrix, Designb = batch_matrix,
                     Fac = Fac0, Interaction = Interaction,
                     Variability = Variability)
    Fac<- ARSyNcomponents(asca0,Variability=Variability, beta = 2)
    for (i in 1:length(Fac)){
      Fac0[names(Fac[i])]<-Fac[names(Fac[i])]
    }
    asca<- ASCA.2f(Xunfold, Designa = cond_matrix, Designb = batch_matrix,
                   Fac = Fac0, Interaction = Interaction,
                   Variability = Variability)
    if (Interaction) {
      Xunfold.r <- Xunfold - asca$Model.bab$TP #- asca$Model.res$TP
    } else  {
      Xunfold.r <- Xunfold - asca$Model.b$TP #- asca$Model.res$TP
    }
    ascamodels[[omic]] <- c(0,asca)

    for( i in seq_along(inputList)) {
      correctedOmics[[names(inputList)[i]]][[omic]] <- t(Xunfold.r[(set_lims[i]+1):(set_lims[i+1]),])
    }
  }

  output <- mbac$ListOfBatches
  for (lab in names(multiBatchDesign)) {
    for ( omic in names(mbac$ListOfBatches[[lab]]@ExperimentList)) {
      output[[lab]]@ExperimentList[[omic]] <- correctedOmics[[lab]][[omic]]
    }
  }

  retobj <- list("ListOfBatches" = mbac$ListOfBatches,
                 "CorrectedData" = c(lapply(output, function(x) x)),
                 "ARSyNmodels" = ascamodels,
                 "PLSmodels" = mbac$PLSmodels,
                 "InnerRelation" = mbac$InnerRelation,
                 "commonOmic" = mbac$commonOmic)

  return(mbacClass(retobj))
}

#' genModelList
#'
#' This function performs PLS models for every batch. A PLS model is generated for each non-common omic in each batch.
#'
#' @param mbac mbac object generated by *createMbac*.
#' @param test.comp Maximum number of components allowed for PLS models. If NULL (default), the minimal effective rank of the matrices is used as the maximum number of components.
#' @param scale Logical. Whether X and Y matrices must be scaled. By default, FALSE.
#' @param center Logical. Whether X and Y matrices must be centered. By default, TRUE.
#' @param crossval Integer: number of cross-validation segments. The number of samples (rows of 'x') must be at least >= crossvalI. If NULL (default) leave-one-out crossvalidation is performed.
#' @param showinfo Logical. Whether to show the information about the function progress. By default, TRUE.
#'
#' @return Custom mbac object. Elements in a mbac object:
#' \enumerate{
#'     \item ListOfBatches: A list of MultiAssayExperiment objects (one per batch).
#'     \item commonOmic: Name of the common omic between the batches.
#'     \item PLSmodels: PLS models created during MultiBaC method performance (one model per non-common omic data type).
#'     \item InnerRelation: Table of class data.frame containing the inner correlation (i.e. correlation between the scores of X (t) and Y (u) matrices) for each PLS model across all components.
#' }
#'
#' @export
#' @examples
#' data('multiyeast')
#'
#' my_mbac <- createMbac (inputOmics = list(A.rna, A.gro, B.rna, B.ribo, C.rna, C.par),
#'                        batchFactor = c("A", "A", "B", "B", "C", "C"),
#'                        experimentalDesign = list("A" =  c("Glu+", "Glu+",
#'                        "Glu+", "Glu-", "Glu-", "Glu-"),
#'                         "B" = c("Glu+", "Glu+", "Glu-", "Glu-"),
#'                          "C" = c("Glu+", "Glu+", "Glu-", "Glu-")),
#'                        omicNames = c("RNA", "GRO", "RNA", "RIBO", "RNA", "PAR"),
#'                        commonOmic = "RNA")
#'
#' my_mbac_2 <- genModelList (my_mbac, test.comp = NULL,
#'                            scale = FALSE, center = TRUE,
#'                            crossval = NULL,
#'                            showinfo = TRUE)
#'
genModelList <- function(mbac, test.comp = NULL, scale = FALSE, center = TRUE,
                         crossval = NULL, showinfo = TRUE) {

  # Get matrices ---------------------------------------------------------------
  inputList <- getData(mbac$ListOfBatches)
  commonOmic <- mbac$commonOmic

  # Create models --------------------------------------------------------------
  modelList <- list()
  q2ofmodels <- list()
  init.test.comp <- test.comp
  for ( i in names(inputList)) {
    if (showinfo) {
      message(paste0("\t - Model for batch ",i))
    }
    if ( is.null(test.comp)) {
      test.comp <- min(dim(inputList[[i]][[1]]))-1
    }
    aux <- createPLSmodel(inputList[[i]], test.comp = test.comp, messages = FALSE,
                                     scale = scale, center = center,
                                     crossval, commonOmic, showinfo = showinfo)
    modelList[[i]] <- aux[1]
    test.comp <- init.test.comp
  }

  # Inner correlation table
  corr_values <- lapply(modelList, function(l){
    lapply(l, function(x){
      aux <- c()
      for ( c in seq_len(dim(x@scoreMN)[2])) {
        aux <- c(aux, stats::cor(x@scoreMN[,c], x@uMN[,c]))
      }
      aux
    })
  })
  roff <- max(unlist(lapply(corr_values, function(x) lapply(x, length))))
  showM <- NULL
  cnames <- c()
  lab_head <- c(1,unlist(lapply(corr_values, length)))

  for ( i in seq_along(corr_values)) {
    for ( j in seq_along(corr_values[[i]])) {
      aux <- corr_values[[i]][[j]]
      if ( length(aux) < roff) aux <- c(aux, rep(NA, roff-length(aux)))
      showM <- cbind(showM, aux)
      cnames <- c(cnames, paste0(names(corr_values)[i], ": ", names(corr_values[[i]])[j]))
    }
  }
  colnames(showM) <- cnames
  rownames(showM) <- paste0("Component: ",1:dim(showM)[1])


  retobj <- list("ListOfBatches" = mbac$ListOfBatches,
                 "PLSmodels" = modelList,
                 "InnerRelation" = showM,
                 "commonOmic" = mbac$commonOmic)

  return(mbacClass(retobj))
}

#' genMissingOmics
#'
#' This function generates for all the batches the omic data they had not originally. This is the previous step to apply ARSyNbac [1] correction.
#'
#' @param mbac mbac object generated by *createMbac*. PLS models slot must be present.
#'
#' @return A list of *MultiAssayExperiment* structures. In this case, each batch contains all the omics introduced in MultiBaC. For instance, if two batches are being studying, "A" and "B", given that "A" contains "RNA-seq" and "GRO-seq" data and "B" contains "RNA-seq" and "Metabolomica" data, after applying *genMissingOmics* function batch "A" will contain "RNA-seq", "GRO-seq" and predicted "Metabolomics" data.
#'
#' @export
#' @examples
#' data('multiyeast')
#'
#' my_mbac <- createMbac (inputOmics = list(A.rna, A.gro, B.rna, B.ribo, C.rna, C.par),
#'                        batchFactor = c("A", "A", "B", "B", "C", "C"),
#'                        experimentalDesign = list("A" =  c("Glu+", "Glu+",
#'                        "Glu+", "Glu-", "Glu-", "Glu-"),
#'                        "B" = c("Glu+", "Glu+", "Glu-", "Glu-"),
#'                        "C" = c("Glu+", "Glu+", "Glu-", "Glu-")),
#'                        omicNames = c("RNA", "GRO", "RNA", "RIBO", "RNA", "PAR"),
#'                        commonOmic = "RNA")
#'
#' my_mbac_2 <- genModelList (my_mbac, test.comp = NULL,
#'                            scale = FALSE, center = TRUE,
#'                            crossval = NULL,
#'                            showinfo = TRUE)
#' multiBatchDesign <- genMissingOmics(my_mbac_2)
#'
genMissingOmics <- function(mbac) {

  # Get matrices ---------------------------------------------------------------
  inputList <- getData(mbac$ListOfBatches)
  commonOmic <- mbac$commonOmic

  # Generate missing omics -----------------------------------------------------
  missingOmics <- inputList
  for ( i in seq_along(inputList) ) {
    aux <- names(inputList)[-i]
    predictedOmics <- list()
    for ( n in aux ) {
      for ( j in seq_along(mbac$PLSmodels[[n]])) {
        Ohat <- ropls::predict(mbac$PLSmodels[[n]][[j]], newdata = t(inputList[[i]][[commonOmic]]))
        predictedOmics[[names(mbac$PLSmodels[[n]])[j]]] <- t(Ohat)
      }
    }
    missingOmics [[names(inputList)[i]]] <-  predictedOmics
  }

  # Creating omic blocks to correct --------------------------------------------
  multiBatchDesign <- lapply(names(mbac$ListOfBatches), function(l) {
    newmap <- lapply(missingOmics[[l]], function (m) {
      data.frame(primary = rownames(MultiAssayExperiment::colData(mbac$ListOfBatches[[l]])),
                 colname = colnames(m),
                 stringsAsFactors = FALSE)
    })
    newmaplist <- c(MultiAssayExperiment::mapToList(mbac$ListOfBatches[[l]]@sampleMap), newmap)
    newsamplemap <- MultiAssayExperiment::listToMap(newmaplist)
    MultiAssayExperiment::MultiAssayExperiment(experiments = c(MultiAssayExperiment::experiments(mbac$ListOfBatches[[l]]),
                                                               missingOmics[[l]]),
                                               colData = MultiAssayExperiment::colData(mbac$ListOfBatches[[l]]),
                                               sampleMap = newsamplemap)
  } )
  names(multiBatchDesign) <- names(mbac$ListOfBatches)
  return(multiBatchDesign)
}

Try the MultiBaC package in your browser

Any scripts or data that you put into this service are public.

MultiBaC documentation built on Nov. 8, 2020, 6:07 p.m.