R/withinVariation.R

Defines functions withinVariation

Documented in withinVariation

#############################################################################################################
# Authors:
#   Kim-Anh Le Cao, The University of Queensland, The University of Queensland Diamantina Institute, Translational Research Institute, Brisbane, QLD
#   Benoit Gautier, The University of Queensland, The University of Queensland Diamantina Institute, Translational Research Institute, Brisbane, QLD
#   Ignacio Gonzalez, Genopole Toulouse Midi-Pyrenees, France
#   Florian Rohart, The University of Queensland, The University of Queensland Diamantina Institute, Translational Research Institute, Brisbane, QLD
#   Benoit Liquet, Universite de Bordeaux, France.
#
# created: 2011
# last modified: 24-05-2016
#
# Copyright (C) 2011
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#############################################################################################################

# ---------------------------------------------
# withinVariation function
# ---------------------------------------------






#' Within matrix decomposition for repeated measurements (cross-over design)
#'
#' This function is internally called by \code{pca}, \code{pls}, \code{spls},
#' \code{plsda} and \code{splsda} functions for cross-over design data, but can
#' be called independently prior to any kind of multivariate analyses.
#'
#'
#' \code{withinVariation} function decomposes the Within variation in the
#' \eqn{X} data set. The resulting \eqn{Xw} matrix is then input in the
#' \code{multilevel} function.
#'
#' One or two-factor analyses are available.
#'
#' @param X numeric matrix of predictors. \code{NA}s are allowed.
#' @param design a numeric matrix or data frame. The first column indicates the
#' repeated measures on each individual, i.e. the individuals ID. The 2nd and
#' 3rd columns are to split the variation for a 2 level factor.
#' @return \code{withinVariation} simply returns the \eqn{Xw} within matrix,
#' which can be input in the other multivariate approaches already implemented
#' in mixOmics (i.e. spls or splsda, see \code{multilevel}, but also pca or
#' ipca).
#' @author Benoit Liquet, Kim-Anh Lê Cao, Benoit Gautier, Ignacio González.
#' @seealso \code{\link{spls}}, \code{\link{splsda}}, \code{\link{plotIndiv}},
#' \code{\link{plotVar}}, \code{\link{cim}}, \code{\link{network}}.
#' @references On multilevel analysis:
#'
#' Liquet, B., Lê Cao, K.-A., Hocini, H. and Thiebaut, R. (2012) A novel
#' approach for biomarker selection and the integration of repeated measures
#' experiments from two platforms. \emph{BMC Bioinformatics} \bold{13}:325.
#'
#' Westerhuis, J. A., van Velzen, E. J., Hoefsloot, H. C., and Smilde, A. K.
#' (2010). Multivariate paired data analysis: multilevel PLSDA versus OPLSDA.
#' \emph{Metabolomics}, \bold{6}(1), 119-128.
#' @keywords regression multivariate
#' @examples
#'
#' ## Example: one-factor analysis matrix decomposition
#' #--------------------------------------------------------------
#' X <- vac18$genes
#' # in design we only need to mention the repeated measurements to split the one level variation
#' design <- data.frame(sample = vac18$sample)
#'
#' Xw <- withinVariation(X = X, design = design)
#' # multilevel PCA
#' res.pca.1level <- pca(Xw, ncomp = 3)
#'
#' # compare a normal PCA with a multilevel PCA for repeated measurements.
#' # note: PCA makes the assumptions that all samples are independent,
#' # so this analysis is flawed and you should use a multilevel PCA instead
#' res.pca <- pca(X, ncomp = 3)
#'
#' # set up colors for plotIndiv
#' col.stim <- c("darkblue", "purple", "green4","red3")
#' col.stim <- col.stim[as.numeric(vac18$stimulation)]
#'
#' # plotIndiv comparing both PCA and PCA multilevel
#' plotIndiv(res.pca, ind.names = vac18$stimulation, group = col.stim)
#' title(main = 'PCA ')
#' plotIndiv(res.pca.1level, ind.names = vac18$stimulation, group = col.stim)
#' title(main = 'PCA multilevel')
#'
#'
#' @export withinVariation
withinVariation = function(X, design){

    # need a matrix for matrix calculations
    X = as.matrix(X)
    rep.measures = factor(design[, 1])
    factors = design[, -1, drop = FALSE]

    if(any(summary(as.factor(rep.measures)) == 1))
    stop("A multilevel analysis can not be performed when at least one some sample is not repeated.")

    # calculate the variation
    # ---------------------------
    # added condition for the spls case where the condition is not needed
    # all we need is the rep.measures
    if ((ncol(factors) == 0) | (ncol(factors) == 1))
    {
      message("Splitting the variation for 1 level factor.")

      # save sample names for the output
      indiv.names = rownames(X)
      rownames(X) = as.character(rep.measures)

      # compute the mean for each unique individual
      # dealing with specific case with only one subject (leave one out case during prediction)
      X.mean.indiv = matrix(apply(X, 2, tapply, rep.measures, mean, na.rm = TRUE), # to deal with only one subject
                           nrow = length(unique(rep.measures)), ncol = dim(X)[2], dimnames = list(levels(as.factor(rep.measures)), colnames(X)))

      # fill the between matrix with those means
      Xb = X.mean.indiv[as.character(rep.measures), ]

      # compute the within matrix as a difference between the original data
      # and the between matrix
      Xw = X - Xb

      # put dimnames
      dimnames(Xw) = list(indiv.names, colnames(X))
    } else {  # for 2 levels split
      message("Splitting the variation for 2 level factors.")

      ###### off set term
      Xm = colMeans(X)

      ###### compute the mean within each subject
      Xs = apply(X, 2, tapply, rep.measures, mean, na.rm = TRUE)
      Xs = Xs[rep.measures, ]

      # for the first factor
      xbfact1 = apply(X, 2, tapply, paste0(rep.measures, factors[, 1]), mean, na.rm = TRUE)
      xbfact1 = xbfact1[paste0(rep.measures, factors[, 1]), ]

      # for the second factor
      xbfact2 = apply(X, 2, tapply, paste0(rep.measures, factors[, 2]), mean, na.rm = TRUE)
      xbfact2 = xbfact2[paste0(rep.measures, factors[, 2]), ]

      #### fixed effect
      # for the first factor
      Xmfact1 = apply(X, 2, tapply, factors[, 1], mean, na.rm = TRUE)
      Xmfact1 = Xmfact1[factors[, 1], ]

      # for the second factor
      Xmfact2 = apply(X, 2, tapply, factors[, 2], mean, na.rm = TRUE)
      Xmfact2 = Xmfact2[factors[, 2], ]

      # compute the within matrix
      Xw = X + Xs - xbfact1 - xbfact2 + Xmfact1 + Xmfact2
      Xw = sweep(Xw, 2, 2*Xm)  # see formula in refernece Liquet et al.

      # put dimnames
      dimnames(Xw) = dimnames(X)
    }

    return(invisible(Xw))
  }
ajabadi/mixOmics2 documentation built on Aug. 9, 2019, 1:08 a.m.