mixOmics2: Omics Data Integration Project

Documented in rcc

#############################################################################################################
# Authors:
#   Sebastien Dejean, Institut de Mathematiques, Universite de Toulouse et CNRS (UMR 5219), France
#   Ignacio Gonzalez, Genopole Toulouse Midi-Pyrenees, France
#   Francois Bartolo, Institut National des Sciences Appliquees et Institut de Mathematiques, Universite de Toulouse et CNRS (UMR 5219), France
#   Kim-Anh Le Cao, The University of Queensland, The University of Queensland Diamantina Institute, Translational Research Institute, Brisbane, QLD
#   Florian Rohart, The University of Queensland, The University of Queensland Diamantina Institute, Translational Research Institute, Brisbane, QLD
#
# created: 2009
# last modified: 03-03-2016
#
# Copyright (C) 2009
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#############################################################################################################








#' Regularized Canonical Correlation Analysis
#'
#' The function performs the regularized extension of the Canonical Correlation
#' Analysis to seek correlations between two data matrices.
#'
#' The main purpose of Canonical Correlations Analysis (CCA) is the exploration
#' of sample correlations between two sets of variables \eqn{X} and \eqn{Y}
#' observed on the same individuals (experimental units) whose roles in the
#' analysis are strictly symmetric.
#'
#' The \code{cancor} function performs the core of computations but additional
#' tools are required to deal with data sets highly correlated (nearly
#' collinear), data sets with more variables than units by example.
#'
#' The \code{rcc} function, the regularized version of CCA, is one way to deal
#' with this problem by including a regularization step in the computations of
#' CCA. Such a regularization in this context was first proposed by Vinod
#' (1976), then developped by Leurgans \emph{et al.} (1993). It consists in the
#' regularization of the empirical covariances matrices of \eqn{X} and \eqn{Y}
#' by adding a multiple of the matrix identity, that is, Cov\eqn{(X)+ \lambda_1
#' I} and Cov\eqn{(Y)+ \lambda_2 I}.
#'
#' When \code{lambda1=0} and \code{lambda2=0}, \code{rcc} performs a classical
#' CCA, if possible (i.e. when \eqn{n > p+q}.
#'
#' The shrinkage estimates \code{method = "shrinkage"} can be used to bypass
#' \code{\link{tune.rcc}} to choose the shrinkage parameters - which can be
#' long and costly to compute with very large data sets. Note that both
#' functions \code{\link{tune.rcc}} (which uses cross-validation) and the
#' whrinkage parameters (which uses the formula from Schafer and Strimmer) may
#' output different results.
#'
#' Note: when \code{method = "shrinkage"} the input data are centered and
#' scaled for the estimation of the shrinkage parameters and the calculation of
#' the regularised variance-covariance matrices in \code{\link{rcc}}.
#'
#' The estimation of the missing values can be performed by the reconstitution
#' of the data matrix using the \code{nipals} function. Otherwise, missing
#' values are handled by casewise deletion in the \code{rcc} function.
#'
#' @aliases rcc rcc.default
#' @param X numeric matrix or data frame \eqn{(n \times p)}, the observations
#' on the \eqn{X} variables. \code{NA}s are allowed.
#' @param Y numeric matrix or data frame \eqn{(n \times q)}, the observations
#' on the \eqn{Y} variables. \code{NA}s are allowed.
#' @param method One of "ridge" or "shrinkage". If "ridge", \code{lambda1} and
#' \code{lambda2} need to be supplied (see also our function tune.rcc); if
#' "shrinkage", parameters are directly estimated with Strimmer's formula, see
#' below and reference.
#' @param ncomp the number of components to include in the model. Default to 2.
#' @param lambda1,lambda2 a non-negative real. The regularization parameter for
#' the \emph{X} and \emph{Y} data. Defaults to \code{lambda1=lambda2=0}. Only
#' used if \code{method="ridge"}
#' @return \code{rcc} returns a object of class \code{"rcc"}, a list that
#' contains the following components: \item{X}{the original \eqn{X} data.}
#' \item{Y}{the original \eqn{Y} data.} \item{cor}{a vector containing the
#' canonical correlations.} \item{lambda}{a vector containing the
#' regularization parameters whether those were input if ridge method or
#' directly estimated with the shrinkage method.} \item{loadings}{list
#' containing the estimated coefficients used to calculate the canonical
#' variates in \eqn{X} and \eqn{Y}.} \item{variates}{list containing the
#' canonical variates.} \item{names}{list containing the names to be used for
#' individuals and variables.}
#' @author Sébastien Déjean, Ignacio González, Francois Bartolo.
#' @seealso \code{\link{summary}}, \code{\link{tune.rcc}},
#' \code{\link{plot.rcc}}, \code{\link{plotIndiv}}, \code{\link{plotVar}},
#' \code{\link{cim}}, \code{\link{network}} and http://www.mixOmics.org for
#' more details.
#' @references González, I., Déjean, S., Martin, P. G., and Baccini, A. (2008).
#' CCA: An R package to extend canonical correlation analysis. Journal of
#' Statistical Software, 23(12), 1-14.
#'
#' González, I., Déjean, S., Martin, P., Goncalves, O., Besse, P., and Baccini,
#' A. (2009). Highlighting relationships between heterogeneous biological data
#' through graphical displays based on regularized canonical correlation
#' analysis. Journal of Biological Systems, 17(02), 173-199.
#'
#' Leurgans, S. E., Moyeed, R. A. and Silverman, B. W. (1993). Canonical
#' correlation analysis when the data are curves. \emph{Journal of the Royal
#' Statistical Society. Series B} \bold{55}, 725-740.
#'
#' Vinod, H. D. (1976). Canonical ridge and econometrics of joint production.
#' \emph{Journal of Econometrics} \bold{6}, 129-137.
#'
#' Opgen-Rhein, R., and K. Strimmer. 2007. Accurate ranking of differentially
#' expressed genes by a distribution-free shrinkage approach. Statist.
#' emphAppl. Genet. Mol. Biol. \bold{6}:9.
#' (http://www.bepress.com/sagmb/vol6/iss1/art9/)
#'
#' Sch"afer, J., and K. Strimmer. 2005. A shrinkage approach to large-scale
#' covariance estimation and implications for functional genomics. Statist.
#' emphAppl. Genet. Mol. Biol.  \bold{4}:32.
#' (http://www.bepress.com/sagmb/vol4/iss1/art32/)
#' @keywords multivariate
#' @examples
#'
#' ## Classic CCA
#' X <- linnerud$exercise
#' Y <- linnerud$physiological
#' linn.res <- rcc(X, Y)
#'
#' \dontrun{
#' ## Regularized CCA
#' X <- nutrimouse$lipid
#' Y <- nutrimouse$gene
#' nutri.res1 <- rcc(X, Y, ncomp = 3, lambda1 = 0.064, lambda2 = 0.008)
#'
#' ## using shrinkage parameters
#' nutri.res2 <- rcc(X, Y, ncomp = 3, method = 'shrinkage')
#' nutri.res2$lambda # the shrinkage parameters
#' }
#'
#' @export rcc
rcc =
function(   X,
            Y,
            ncomp = 2,
            method = "ridge", #c("ridge", "shrinkage")
            lambda1 = 0,
            lambda2 = 0)
{
    #-- checking general input parameters --------------------------------------#
    #---------------------------------------------------------------------------#

    #-- check that the user did not enter extra arguments
    arg.call = match.call()
    user.arg = names(arg.call)[-1]

    err = tryCatch(mget(names(formals()), sys.frame(sys.nframe())),
    error = function(e) e)

    if ("simpleError" %in% class(err))
    stop(err[[1]], ".", call. = FALSE)

    #-- data set names --#
    data.names = c(deparse(substitute(X)), deparse(substitute(Y)))

    #-- method
    choices = c("ridge", "shrinkage")
    method = choices[pmatch(method, choices)]
    if (is.na(method))
    stop("'method' should be one of 'ridge' or 'shrinkage'.",
    call. = FALSE)

    #-- X matrix
    if (is.data.frame(X)) X = as.matrix(X)

    if (!is.matrix(X) || is.character(X))
    stop("'X' must be a numeric matrix.", call. = FALSE)

    if (any(apply(X, 1, is.infinite)))
    stop("infinite values in 'X'.", call. = FALSE)

    if (method == "shrinkage"){
        if (any(is.na(X)))
        stop("missing values in 'X' matrix. NAs not are allowed if method = 'shrinkage'.", call. = FALSE)
    }
    #-- Y matrix
    if (is.data.frame(Y)) Y = as.matrix(Y)

    if (!is.matrix(Y) || is.character(Y))
    stop("'Y' must be a numeric matrix.", call. = FALSE)

    if (any(apply(Y, 1, is.infinite)))
    stop("infinite values in 'Y'.", call. = FALSE)

    if (method == "shrinkage")
    if (any(is.na(Y)))
    stop("missing values in 'Y' matrix. NAs not are allowed if method = 'shrinkage'.", call. = FALSE)

    #-- equal number of rows in X and Y
    if ((n = nrow(X)) != nrow(Y))
    stop("unequal number of rows in 'X' and 'Y'.", call. = FALSE)

    p = ncol(X)
    q = ncol(Y)

    #-- put a names on the columns of X and Y --#
    X.names = colnames(X)
    if (is.null(X.names)) X.names = paste("X", 1:ncol(X), sep = "")

    Y.names = colnames(Y)
    if (is.null(Y.names)) Y.names = paste("Y", 1:ncol(Y), sep = "")

    #-- put a names on the samples --#
    ind.names = dimnames(X)[[1]]
    if (is.null(ind.names)) ind.names = dimnames(Y)[[1]]
    if (is.null(ind.names)) ind.names = 1:n

    #-- ncomp
    if (is.null(ncomp) || ncomp < 1 || !is.finite(ncomp))
    stop("invalid value for 'ncomp'.", call. = FALSE)

    ncomp = round(ncomp)

    if (ncomp > min(p, q))
    stop("'comp' must be smaller or equal than ", min(p, q), ".",
    call. = FALSE)

    #-- lambda1
    if (!is.finite(lambda1) || is.null(lambda1))
    stop("invalid value for 'lambda1'.", call. = FALSE)

    if(lambda1 < 0)
    stop("'lambda1' must be a non-negative value.", call. = FALSE)

    #-- lambda2
    if (!is.finite(lambda2) || is.null(lambda2))
    stop("invalid value for 'lambda2'.", call. = FALSE)

    if(lambda2 < 0)
    stop("'lambda2' must be a non-negative value.", call. = FALSE)

    #-- end checking --#
    #------------------#


    #-- rcc approach -----------------------------------------------------------#
    #---------------------------------------------------------------------------#

    #-- covariance matrices regularization --#
    if (method == "ridge") {
        Cxx = var(X, na.rm = TRUE, use = "pairwise") + diag(lambda1, ncol(X))
        Cyy = var(Y, na.rm = TRUE, use = "pairwise") + diag(lambda2, ncol(Y))
        Cxy = cov(X, Y, use = "pairwise")
    }
    else { # if method == 'shrinkage'
        Cxx = cov.shrink(X, verbose = FALSE)
        Cyy = cov.shrink(Y, verbose = FALSE)

        lambda.x = attr(Cxx, "lambda")
        lambda.y = attr(Cyy, "lambda")

        sc.x = sqrt(var.shrink(X, verbose = FALSE))
        sc.y = sqrt(var.shrink(Y, verbose = FALSE))

        w = rep(1/n, n)
        xs = wt.scale(X, w, center = TRUE, scale = TRUE)
        ys = wt.scale(Y, w, center = TRUE, scale = TRUE)

        #-- bias correction factor
        h1 = n / (n - 1)

        #-- unbiased empirical estimator
        Cxy = h1 * crossprod(sweep(sweep(xs, 1, sqrt((1 - lambda.x) * w), "*"), 2, sc.x, "*"),
        sweep(sweep(ys, 1, sqrt((1 - lambda.y) * w), "*"), 2, sc.y, "*"))
    }

    #-- calculation of the canonical correlations and canonical variables --#
    Cxx.fac = chol(Cxx)
    Cyy.fac = chol(Cyy)
    Cxx.fac.inv = solve(Cxx.fac)
    Cyy.fac.inv = solve(Cyy.fac)
    mat = t(Cxx.fac.inv) %*% Cxy %*% Cyy.fac.inv

    if (p >= q) {
        result = svd(mat, nu = ncomp, nv = ncomp)
        cor = result$d
        xcoef = Cxx.fac.inv %*% result$u
        ycoef = Cyy.fac.inv %*% result$v
    }
    else {
        result = svd(t(mat), nu = ncomp, nv = ncomp)
        cor = result$d
        xcoef = Cxx.fac.inv %*% result$v
        ycoef = Cyy.fac.inv %*% result$u
    }

    #-- output -----------------------------------------------------------------#
    #---------------------------------------------------------------------------#
    names(cor) = 1:length(cor)
    X.aux = scale(X, center = TRUE, scale = FALSE)
    Y.aux = scale(Y, center = TRUE, scale = FALSE)
    X.aux[is.na(X.aux)] = 0
    Y.aux[is.na(Y.aux)] = 0

    U = X.aux %*% xcoef
    V = Y.aux %*% ycoef

    cl = match.call()
    cl[[1]] = as.name('rcc')

    if (method == "ridge"){
    lambda = c("lambda1" = lambda1, "lambda2" = lambda2)
    } else { # if method == 'shrinkage')
      lambda = c("lambda1" = lambda.x, "lambda2" = lambda.y)
    }

    result = list(call = cl,
    X = X,
    Y = Y,
    ncomp = ncomp,
    method = method,
    cor = cor,
    loadings = list(X = xcoef, Y = ycoef),
    variates = list(X = U, Y = V),
    names = list(sample = ind.names, colnames = list(X=colnames(X),Y=colnames(Y)), blocks = c("X","Y"),#list(X = X.names, Y = Y.names, indiv = ind.names,
    data = data.names),
    lambda = lambda)

    #calcul explained variance
    explX=explained_variance(result$X,result$variates$X,ncomp)
    explY=explained_variance(result$Y,result$variates$Y,ncomp)
    result$explained_variance=list(X=explX,Y=explY)


    class(result) = "rcc"
    return(invisible(result))
}