R/internal_mint.block_helpers.R

Defines functions .getWeights study_split soft_thresholding_L1 BinarySearch soft norm2 sparsity scale.function_old scale.function mean_centering_per_study l2.norm deflation defl.select initialisation_by_svd

Documented in study_split

################################################################################
# Author :
#   Florian Rohart,
#   Kim-Anh Le Cao,
#   Benoit Gautier,
#
# created: 22-04-2015
# last modified: 04-10-2017
#
# Copyright (C) 2015
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
################################################################################


# =============================================================================
# Internal helpers functions to run "mixOmics" and ".mintBlock"
# =============================================================================

# Some of these functions have been borrowed from the RGCCA package,
#   as indicated below

# --------------------------------------
# study_split: used in '.mintBlock.R' and 'predict.mint.block.pls.R'
# --------------------------------------
#' @importFrom rARPACK svds
.getWeights = function(variates, indY)
{
    ncomp = min(sapply(variates, ncol))
    x.xList <- list()
    compt = 1
    for(comp in seq_len(ncomp))
    {
        for(i in seq_len(length(variates))){
            corDat <- rep(0, length(variates))
            names(corDat) <- paste("cor", names(variates)[i], names(variates),
            sep = "_")
            for(j in seq_len(length(variates))){
                corDat[j] <- as.numeric(cor(variates[[i]][,comp],
                variates[[j]][,comp]))
            }
            x.xList[[compt]] <- corDat
            compt = compt +1
        }
    }
    corMat.diablo <- do.call(rbind, x.xList)
    rownames(corMat.diablo) <- paste(names(variates),".comp",
    rep(seq_len(ncomp),each=length(variates)),sep="")
    colnames(corMat.diablo) <- names(variates)

    temp = matrix(corMat.diablo[,indY],ncol=ncomp)
    correlation = apply(temp, 1, function(x){mean(abs(x))})[seq_len(length(variates))]
    names(correlation) = names(variates)

    correlation = correlation[-indY]
    return(correlation)

}


# --------------------------------------
# study_split: used in '.mintBlock.R' and 'predict.mint.block.pls.R'
# --------------------------------------






#' divides a data matrix in a list of matrices defined by a factor
#'
#' \code{study_split} divides a data matrix in a list of matrices defined by a
#' \code{study} input.
#'
#'
#' @param data numeric matrix of predictors
#' @param study grouping factor indicating which samples are from the same
#' study
#' @return \code{study_split} simply returns a list of the same length as the
#' number of levels of \code{study} tha contains submatrices of \code{data}.
#' @author Florian Rohart
#' @seealso \code{\link{mint.pls}}, \code{\link{mint.spls}},
#' \code{\link{mint.plsda}}, \code{\link{mint.splsda}}.
#' @keywords regression multivariate
#' @examples
#'
#' data = stemcells$gene
#' exp = stemcells$study
#'
#' data.list = study_split(data, exp)
#'
#' names(data.list)
#' lapply(data.list, dim)
#' table(exp)
#'
#' @export study_split
study_split = function(data, study)
{
    #data should be a matrix
    if(!is(data, "matrix"))
    data = as.matrix(data)

    M = length(levels(study))

    #---------------------- split data
    if(M>1)
    {
        data.list.study = split.data.frame(data,study)
    } else {
        data.list.study = list(data)
        names(data.list.study) = levels(study)
    }

    result = data.list.study
    return(invisible(result))
}


# --------------------------------------
# soft_thresholding: used in sparsity (below)
# --------------------------------------
# x: vector
# nx: number of entries to put to zero

soft_thresholding_L1 = function(x,nx)
{
    #selection on a (loadings.X). modified on 19/02/15 to make sure that a!=0
    if (nx!=0)
    {
        absa = abs(x)
        if (any(rank(absa, ties.method = "max") <= nx))
        {
            x = ifelse(rank(absa, ties.method = "max") <= nx, 0,
                        sign(x) * (absa - max(absa[rank(absa,
                        ties.method = "max") <= nx])))
        }
    }

    x
}

# -----------------------------------------------------------------------------
# soft.threshold() - soft-thresholds a vector such that
# the L1-norm constraint is satisfied.
# -----------------------------------------------------------------------------
soft.threshold = function (x, sumabs = 1)
return(soft(x, BinarySearch(x, sumabs)))

BinarySearch = function(argu,sumabs)
{
    if (norm2(argu)==0 || sum(abs(argu/norm2(argu)))<=sumabs)
    return(0)

    lam1 = 0
    lam2 = max(abs(argu))-1e-5
    iter = 1
    while (iter < 500)
    {
        su = soft(argu,(lam1+lam2)/2)
        if (sum(abs(su/norm2(su)))<sumabs)
        {
            lam2 = (lam1+lam2)/2
        } else {
            lam1 = (lam1+lam2)/2
        }
        if ((lam2-lam1)<1e-10)
        return((lam1+lam2)/2)

        iter = iter+1
    }
    warning("Didn't quite converge")
    return((lam1+lam2)/2)
}

soft = function(x,d) return(sign(x)*pmax(0, abs(x)-d))

norm2 = function(vec)
{
    a = sqrt(sum(vec^2))
    if (a == 0)
    a = .05

    return(a)
}


# --------------------------------------
# sparsity function: used in '.mintBlock.R'
# --------------------------------------
sparsity=function(loadings.A, keepA, penalty=NULL)
{

    if (!is.null(keepA)) {
        nx = length(loadings.A) - keepA
        loadings.A = soft_thresholding_L1(loadings.A, nx = nx)
    } else if (!is.null(penalty)) {
        loadings.A = soft.threshold(loadings.A, penalty)
    }

    return(loadings.A)
}



# --------------------------------------
# scaling with or without bias: used in mean_centering_per_study (below)
# --------------------------------------
scale.function_old=function(temp, scale = TRUE)
# problem: divide by n instead of n-#NA
{
    meanX = colMeans(temp, na.rm = TRUE)
    data.list.study.scale_i = t(t(temp) - meanX)
    if (scale)
    {
        sqrt.sdX = sqrt(colSums(data.list.study.scale_i^2, na.rm = TRUE) /
        (nrow(temp) - 1))
        data.list.study.scale_i = t(t(data.list.study.scale_i) / sqrt.sdX)
    } else {
        sqrt.sdX = NULL
    }

    #is.na.data = is.na(data.list.study.scale_i)
    #if (sum(is.na.data) > 0)
    #data.list.study.scale_i[is.na.data] = 0

    out = list(data_scale=data.list.study.scale_i, meanX=meanX,
    sqrt.sdX=sqrt.sdX)
    return(out)
}

# --------------------------------------
# scaling, using colSds from library(matrixStats),
# used in mean_centering_per_study (below)
# --------------------------------------
scale.function=function(temp, scale = TRUE)
{
    meanX = colMeans(temp, na.rm = TRUE)

    if (scale)
    {
        sqrt.sdX = colSds(temp,  na.rm=TRUE)
        # first possiblity: scale(), too long
        # second possibility: matrix approach: transpose is too long
        # data.list.study.scale_i = t( (t(temp)-meanX) / sqrt.sdX)

        # third possibility (ell.equal=>TRUE)
        data.list.study.scale_i = temp-rep(1, nrow(temp)) %*% t(meanX)
        data.list.study.scale_i = data.list.study.scale_i /
            rep(1, nrow(temp)) %*% t(sqrt.sdX)


        ind = which(sqrt.sdX == 0) # scaling can creates NA
        if(length(ind) >0)
        data.list.study.scale_i[,ind] = 0

    } else {
        sqrt.sdX = NULL
        # data.list.study.scale_i = t( (t(temp)-meanX)) # too long bc t()
        data.list.study.scale_i = temp-rep(1, nrow(temp)) %*% t(meanX)
    }

    #is.na.data = is.na(data.list.study.scale_i)
    #if (sum(is.na.data) > 0)
    #data.list.study.scale_i[is.na.data] = 0

    out = list(data_scale=data.list.study.scale_i, meanX=meanX,
    sqrt.sdX=sqrt.sdX)
    return(out)
}


# --------------------------------------
# Mean centering/scaling per study: used in '.mintBlock.R'
# --------------------------------------
mean_centering_per_study=function(data, study, scale)
{

    M = length(levels(study))   # number of groups
    # split the data
    data.list.study = study_split(data, study)

    # center and scale data per group, and concatene the data
    res = lapply(data.list.study, scale.function, scale = scale)

    meanX = lapply(res, function(x){x[[2]]})
    sqrt.sdX = lapply(res, function(x){x[[3]]})
    rownames.study = lapply(res, function(x){rownames(x[[1]])})

    #rename rows and cols of concatenated centered (and/or scaled) data
    #colnames(concat.data) = colnames(data) #already done

    #sort the samples as in the original X
    if(M>1) # otherwise already same order
    {
        concat.data = do.call("rbind", lapply(res,function(x){x[[1]]}))
        indice.match = match(rownames(data),rownames(concat.data))
        concat.data = concat.data[indice.match, ,drop=FALSE]
    } else{
        concat.data = res[[1]][[1]]
    }
    if (M > 1)
    {
        for (m in seq_len(M))
        {
            attr(concat.data,paste0("means:", levels(study)[m])) = meanX[[m]]
            if(scale)
            {
                attr(concat.data,paste0("sigma:", levels(study)[m])) =
                sqrt.sdX[[m]]
            } else {
                attr(concat.data,paste0("sigma:", levels(study)[m])) = NULL
            }
        }
    } else {
        attr(concat.data,"scaled:center") = meanX[[1]]
        if (scale)
        {
            attr(concat.data,"scaled:scale") = sqrt.sdX[[1]]
        } else {
            attr(concat.data,"scaled:scale") = NULL
        }
    }

    return(list(concat.data=concat.data, rownames.study=rownames.study))
}


# --------------------------------------
# l2.norm: used in '.mintBlock.R'
# --------------------------------------
l2.norm=function(x)
{
    if (!is.vector(x))
    stop("x has to be a vector")

    out = x / drop(sqrt(crossprod(x)))
}

# ---------------------------------------------------
# tau.estimate() - Estimation of tau accoring to Strimmer formula
# ---------------------------------------------------
#used in '.mintBlock.R'
tau.estimate = function (x)
{
    if (is.matrix(x) == TRUE && is.numeric(x) == FALSE)
    stop("The data matrix must be numeric!")

    p = NCOL(x)
    n = NROW(x)
    #covm = cov(x)
    corm = cor(x)
    xs = scale(x, center = TRUE, scale = TRUE)
    xs2 = xs^2
    v = (n/((n - 1)^3)) * (crossprod(xs2) - 1/n * (crossprod(xs))^2)
    diag(v) = 0
    m = matrix(rep(apply(xs2, 2, mean), p), p, p)
    I = diag(NCOL(x))
    d = (corm - I)^2
    tau = (sum(v))/sum(d)
    tau = max(min(tau, 1), 0)
    return(tau)
}


################################################################################
# Functions acquired from RGCCA R-library
################################################################################
# ------------------------------------------------------------------------------
# cov2() - Compute biased and unbiased covariance and variance estimates
# ------------------------------------------------------------------------------
# used in '.mintBlock.R'
cov2 = function (x, y = NULL, bias = FALSE) {
    n = NROW(x)
    if (is.null(y)) {
        x = as.matrix(x)
        if (bias) {
            C = ((n - 1)/n) * cov(x, use = "pairwise.complete.obs")
        } else {
            C = cov(x, use = "pairwise.complete.obs")
        }
    } else {
        if (bias) {
            C = ((n - 1)/n) * cov(x, y, use = "pairwise.complete.obs")
        } else {
            C = cov(x, y, use = "pairwise.complete.obs")
        }
    }
    return(C)
}


# ------------------------------------------------------------------------------
# miscrossprod() - Compute cross-product between vectors x and y
# ------------------------------------------------------------------------------
# used in '.mintBlock.R'
miscrossprod = function (x, y) {
    d.p = sum(drop(x) * drop(y), na.rm = TRUE)
    #d.p = as.vector(d.p)/norm2(d.p)     ## change made
    return(d.p)
}


# ------------------------------------------------------------------------------
# deflation()
# ------------------------------------------------------------------------------
# used in defl.select (below)
deflation = function(X, y, misdata.q, is.na.A.q, ind.NA){
    # Computation of the residual matrix R
    # Computation of the vector p.

    #is.na.tX <- is.na(t(X))
    #save(list=ls(),file="temp3.Rdata")
    if (misdata.q)
    {
        #is.na.tX = t(is.na.A.q)
        #p = apply(t(X),1,miscrossprod,y)/as.vector(crossprod(y))

        #variates.A[, q] =  apply(A[[q]], 1, miscrossprod, loadings.A[[q]])
        #A.temp = replace(t(X), is.na.tX, 0) # replace NA in A[[q]] by 0
        loadings.A.temp = crossprod(X, y)
        #temp = drop(y) %o% rep(1, ncol(A.temp.q))
        #temp[is.na.A.q] = 0
        # we only want the diagonal, which is the norm of each column of temp
        #loadings.A.norm = crossprod(temp)
        #p = variates.A.temp / diag(loadings.A.norm)

        #d.loadings.A.norm = apply(temp,2, crossprod)
        #only calculating the ones where there's a NA
        d.loadings.A.norm = rep(crossprod(y), ncol(X))
        #ind.NA = which(apply(is.na.A.q, 2, sum) == 1)


        if(length(ind.NA)>0)
        {
            temp = drop(y) %o% rep(1, length(ind.NA))
            # should be n*p, but we limit it to n* where there's NA
            temp[is.na.A.q[,ind.NA,drop=FALSE]] = 0
            d.loadings.A.norm[ind.NA] = apply(temp,2, crossprod)
        }

        p = loadings.A.temp / d.loadings.A.norm
        # we can have 0/0, so we put 0
        a = is.na(p)
        if (any(a))
        p[a] = 0

    } else {
        p <- crossprod(X,y) / as.vector(crossprod(y))
    }

    R <- X - tcrossprod(y,p)
    return(list(p=p,R=R))
}


# ------------------------------------------------------------------------------
# defl.select() - computes residual matrices
# ------------------------------------------------------------------------------
# used in '.mintBlock.R'
defl.select = function(yy, rr, nncomp, nn, nbloc, indY = NULL,
mode = "canonical", aa = NULL, misdata, is.na.A, ind.NA) {
    ### Start: Add new parameter for estimation classic mode
    #save(list=ls(),file="temp2.Rdata")
    resdefl = pdefl = vector("list",length=nbloc)
    for (q in seq_len(nbloc)) {
        # for each block we create missing data parameters to be passed to
        #  deflation()
        if(misdata[q])
        {
            is.na.A.q = is.na.A[[q]]
        } else {
            is.na.A.q = NULL
        }
        ### Start: insertion of new deflations
        # (See La regression PLS Theorie et pratique p204 (Chap 11))
        if ( nn <= nncomp[q] ) {
            if ((mode == "canonical") || (q != indY)) {
                #deflation of each block independently from the others,
                # except indY
                defltmp = deflation(rr[[q]], yy[ , q], misdata[q],
                is.na.A.q, ind.NA[[q]])
                #save(list=ls(),file="temp.Rdata")
                resdefl[[q]] = defltmp$R
                pdefl[[q]]   = defltmp$p
            } else if (mode == "classic") {
                resdefl[[q]] = Reduce("+", lapply(c(seq_len(nbloc))[-q], function(x)
                {rr[[q]] - yy[ ,x]%*%t(aa[[q]])}))/(nbloc-1)
                pdefl[[q]]   =  rep(0,NCOL(rr[[q]]))
            } else if (mode == "invariant") { #no deflation
                resdefl[[q]] = rr[[q]]
                pdefl[[q]]   =  rep(0,NCOL(rr[[q]]))
            } else if (mode == "regression") {
                resdefl[[q]] = Reduce("+", lapply(c(seq_len(nbloc))[-q], function(x)
                {deflation(rr[[q]],yy[, x], misdata[q], is.na.A.q,
                    ind.NA[[q]])$R}))/(nbloc-1)
                pdefl[[q]]   =  rep(0,NCOL(rr[[q]]))
            }
            ### End: insertion of new deflations
        } else {
            resdefl[[q]] = rr[[q]]
            pdefl[[q]]   =  rep(0,NCOL(rr[[q]]))
        }
    }
    names(resdefl) = names(pdefl) = names(rr)

    return(list(resdefl=resdefl,pdefl=pdefl))
}

# ------------------------------------------------------------------------------
# initsvd() - performs SVD on matrix X
# ------------------------------------------------------------------------------
# used in '.mintBlock.R'
initsvd = function (X) {
    n = NROW(X)
    p = NCOL(X)

    if(p>3) #use svds
    {
        ifelse(n >= p, return(svds(X, k=1, nu = 1, nv = 1)$v),
        return(svds(X, k=1, nu = 1, nv = 1)$u))

    } else {
        ifelse(n >= p, return(svd(X, nu = 0, nv = 1)$v),
        return(svd(X, nu = 1, nv = 0)$u))

    }
}

# ------------------------------------------------------------------------------
# init svd
# ------------------------------------------------------------------------------
initialisation_by_svd = function(A, indY = NULL, misdata, is.na.A = NULL,
init = "svd")
{

    J = length(A)
    loadings.A = vector("list",length=J)

    if (init == "svd")
    {

        # same step with or without NA, as they are already replaced by 0
        M = lapply(c(seq_len(J))[-indY], function(x){crossprod(A[[x]], A[[indY]])})
        #ssvd faster with svds, only if more than 3 column, otherwise break down
        svd.M = lapply(M, function(x){if(ncol(x)>3)
            {svds(x, k=1, nu = 1, nv = 1)} else {svd(x, nu = 1, nv = 1)}})

        loadings.A[c(seq_len(J))[-indY]] = lapply(seq_len(length(M)), function(x)
        {svd.M[[x]]$u})
        loadings.A[[indY]] = svd.M[[1]]$v

    } else if (init=="svd.single") {

        alpha =  lapply(seq_len(J), function(y){initsvd(A[[y]])})

        for (j in seq_len(J))
        {
            if (nrow(A[[j]]) >= ncol(A[[j]]))
            {
                loadings.A[[j]] = alpha[[j]]
            } else {
                alpha[[j]] = drop(1/sqrt( t(alpha[[j]]) %*% A[[j]] %*%
                (t(A[[j]]) %*% alpha[[j]]))) * alpha[[j]]

                loadings.A[[j]] = crossprod(A[[j]],alpha[[j]])
            }
        }
    } else {
        stop("init should be either 'svd' or 'svd.single'.")
    }

    return(loadings.A)

}
ajabadi/mixOmics2 documentation built on Aug. 9, 2019, 1:08 a.m.