################################################################################
# Authors:
# Florian Rohart,
# Kim-Anh Le Cao,
# Francois Bartolo,
#
# created: 18-09-2017
# last modified: 05-10-2017
#
# Copyright (C) 2015
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
################################################################################
# ==============================================================================
# tune.block.splsda: choose the optimal number of parameters per component for
# a block.splsda model
# ==============================================================================
# X: a list of data sets (called 'blocks') matching on the same samples.
# Data in the list should be arranged in samples x variables, with
# samples order matching in all data sets. \code{NA}s are not allowed.
# Y: a factor or a class vector for the discrete outcome.
# validation: "Mfold" or "loo" cross-validation
# folds: if validation = "Mfold", how many folds?
# nrepeat: number of replications of the Mfold process
# ncomp: the number of components to include in the model. Default to 1.
# choice.keepX: a list, each choice.keepX[[i]] is a vector giving keepX on
# the components that were already tuned for block i
# test.keepX: a list of keepX values to test on the last component.
# names(test.keepX) must be provided (one entry per block)
# measure: one of c("overall","BER"). Accuracy measure used in the cross
# validation process
# weighted: optimise the weighted or unweighted prediction (majority vote)
# dist: distance to classify samples. see predict
# scheme: the input scheme, one of "horst", "factorial" or "centroid".
# Default to "centroid"
# design: the input design.
# init: initialisation of the algorithm, one of "svd" or "svd.single".
# Default to "svd"
# tol: Convergence stopping value.
# max.iter: integer, the maximum number of iterations.
# near.zero.var: boolean, see the internal \code{\link{nearZeroVar}} function
# (should be set to TRUE in particular for data with many zero values).
# progressBar: logical, whether to show a progress bar
# cl: if parallel, the cluster object (e.g. from parallel::makeCluster)
# scale: boolean. If scale = TRUE, each block is standardised to zero mean and
# unit variance (default: TRUE).
# misdata: optional. logical vector: misdata[[q]] indicates whether data set q
# contains missing values
# is.na.A: optional. list of logical matrices: is.na.A[[q]] locates the
# missing values in data set q (only if misdata[[q]] == TRUE)
# ind.NA: which rows have missing values; list, ind.NA[[q]] per data set
# (derived internally per fold from is.na.A)
# ind.NA.col: which columns have missing values; list, ind.NA.col[[q]] per
# data set (derived internally per fold from is.na.A)
# parallel: logical.
MCVfold.block.splsda = function(
X,
Y,
validation,
folds,
nrepeat = 1,
ncomp,
choice.keepX = NULL,
test.keepX,
measure = c("overall"),
weighted = TRUE,
dist = "max.dist",
scheme,
design,
init,
tol,
max.iter = 100,
near.zero.var = FALSE,
progressBar = TRUE,
cl,
scale,
misdata,
is.na.A,
parallel
)
{ #-- checking general input parameters ------------------------------------#
#--------------------------------------------------------------------------#
#-- set up a progress bar --#
if (progressBar == TRUE)
{
pb = txtProgressBar(style = 3)
nBar = 1
} else {
pb = FALSE
}
#design = matrix(c(0,1,1,0), ncol = 2, nrow = 2, byrow = TRUE)
if(ncomp>1)
{
keepY = rep(nlevels(Y), ncomp-1)
} else {keepY = NULL}
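# keepY: on the first ncomp-1 (already tuned) components, all nlevels(Y)
# columns of the dummy outcome matrix are kept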
M = length(folds)
prediction.comp = class.comp = list()
for(ijk in dist)
class.comp[[ijk]] = array(0, c(nrow(X[[1]]), nrepeat,
nrow(expand.grid(test.keepX))))
# prediction of all samples for each test.keepX and nrep at comp fixed
folds.input = folds
for(nrep in 1:nrepeat)
{
# we don't record all the predictions for all fold and all blocks,
# too much data
n = nrow(X[[1]])
repeated.measure = 1:n
#-- define the folds --#
if (validation == "Mfold")
{
if (nrep > 1) # reinitialise the folds
folds = folds.input
if (is.null(folds) || !is.numeric(folds) || folds < 2 || folds > n)
{
stop("Invalid number of folds.")
} else {
M = round(folds)
#if (is.null(multilevel))
#{
temp = stratified.subsampling(Y, folds = M)
folds = temp$SAMPLE
if(temp$stop > 0 & nrep == 1) # to show only once
warning("At least one class is not represented in one fold,
which may unbalance the error rate.\n Consider a number of
folds lower than the minimum in table(Y): ", min(table(Y)))
}
} else if (validation == "loo") {
folds = split(1:n, rep(1:n, length = n))
M = n
}
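# recompute M from the folds actually built above (Mfold or leave-one-out)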
M = length(folds)
error.sw = matrix(0,nrow = M, ncol = length(test.keepX))
rownames(error.sw) = paste0("fold",1:M)
colnames(error.sw) = names(test.keepX)
# for the last keepX (i) tested, prediction combined for all M folds so
# as to extract the error rate per class
# prediction.all = vector(length = nrow(X))
# in case the test set only includes one sample, it is better to advise
#the user to perform loocv
stop.user = FALSE
#result.all=list()
fonction.j.folds = function(j) # one iteration of: for(j in 1:M)
{
if (progressBar == TRUE)
setTxtProgressBar(pb, (M*(nrep-1)+j-1)/(M*nrepeat))
#print(j)
#set up leave out samples.
omit = which(repeated.measure %in% folds[[j]] == TRUE)
# get training and test set
X.train = lapply(X, function(x){x[-omit, ]})
Y.train = Y[-omit]
Y.train.mat = unmap(Y.train)
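# unmap() recodes the factor Y as a dummy indicator matrix, one column per class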
Q = ncol(Y.train.mat)
colnames(Y.train.mat) = levels(Y.train)
rownames(Y.train.mat) = rownames(X.train[[1]])
X.test = lapply(X, function(x){x[omit, , drop = FALSE]})
#matrix(X[omit, ], nrow = length(omit))
#removed to keep the colnames in X.test
Y.test = Y[omit]
#---------------------------------------#
#-- near.zero.var ----------------------#
remove = vector("list",length=length(X))
# first remove variables with no variance inside each X.train/X.test
#var.train = colVars(X.train, na.rm=TRUE)#apply(X.train, 2, var)
var.train = lapply(X.train, function(x){colVars(x, na.rm=TRUE)})
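# zero-variance columns would give NaN after scaling, so they are removed
# from both the training and the test blocks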
for(q in 1:length(X))
{
ind.var = which(var.train[[q]] == 0)
if (length(ind.var) > 0)
{
remove[[q]] =
c(remove[[q]], colnames(X.train[[q]])[ind.var])
X.train[[q]] = X.train[[q]][, -c(ind.var),drop = FALSE]
X.test[[q]] = X.test[[q]][, -c(ind.var),drop = FALSE]
# reduce choice.keepX and test.keepX if needed
if (any(choice.keepX[[q]] > ncol(X.train[[q]])))
choice.keepX[[q]][which(choice.keepX[[q]]>
ncol(X.train[[q]]))] = ncol(X.train[[q]])
# reduce test.keepX if needed
if (any(test.keepX[[q]] > ncol(X.train[[q]])))
test.keepX[[q]][which(test.keepX[[q]]>
ncol(X.train[[q]]))] = ncol(X.train[[q]])
}
}
# near zero var on X.train
if(near.zero.var == TRUE)
{
nzv.A = lapply(X.train, nearZeroVar)
for(q in 1:length(X.train))
{
if (length(nzv.A[[q]]$Position) > 0)
{
names.remove.X =
colnames(X.train[[q]])[nzv.A[[q]]$Position]
remove[[q]] = c(remove[[q]], names.remove.X)
X.train[[q]] =
X.train[[q]][, -nzv.A[[q]]$Position, drop=FALSE]
X.test[[q]] =
X.test[[q]][, -nzv.A[[q]]$Position,drop = FALSE]
if (ncol(X.train[[q]]) == 0)
stop(paste0("No more variables in",X.train[[q]]))
#need to check that the keepA[[q]] is now not higher
# than ncol(A[[q]])
if (any(test.keepX[[q]] > ncol(X.train[[q]])))
test.keepX[[q]][which(test.keepX[[q]]>
ncol(X.train[[q]]))] = ncol(X.train[[q]])
}
}
}
#-- near.zero.var ----------------------#
#---------------------------------------#
#------------------------------------------#
# split the NA in training and testing
if(any(misdata))
{
is.na.A.train = is.na.A.test = ind.NA.train = ind.NA.col.train =
vector("list", length = length(X))
for(q in 1:length(X))
{
if(misdata[q])
{
if(length(remove[[q]])>0){
ind.remove =
which(colnames(X[[q]]) %in% remove[[q]])
is.na.A.train[[q]] =
is.na.A[[q]][-omit, -ind.remove, drop=FALSE]
is.na.A.test[[q]] =
is.na.A[[q]][omit, -ind.remove, drop=FALSE]
} else {
is.na.A.train[[q]] =
is.na.A[[q]][-omit, , drop=FALSE]
is.na.A.test[[q]] = is.na.A[[q]][omit, , drop=FALSE]
}
temp = which(is.na.A.train[[q]], arr.ind=TRUE)
ind.NA.train[[q]] = unique(temp[,1])
ind.NA.col.train[[q]] = unique(temp[,2])
}
}
names(is.na.A.train) = names(is.na.A.test) =
names(ind.NA.train) = names(ind.NA.col.train) = names(is.na.A)
} else {
is.na.A.train = is.na.A.test = NULL
ind.NA.train = NULL
ind.NA.col.train = NULL
}
# split the NA in training and testing
#------------------------------------------#
is.na.A.temp = ind.NA.temp = ind.NA.col.temp =
vector("list", length = length(X)+1)
# inside wrapper.mint.block, X and Y are combined,
# so the ind.NA need length(X)+1
is.na.A.temp[1:length(X)] = is.na.A.train
ind.NA.temp[1:length(X)] = ind.NA.train
ind.NA.col.temp[1:length(X)] = ind.NA.col.train
# shape input for `.mintBlock' (keepA, test.keepA, etc)
result <- suppressMessages(
.mintWrapperBlock(
X=X.train, Y=Y.train.mat, study=factor(rep(1,length(Y.train))),
ncomp=ncomp, keepX=choice.keepX,
keepY=rep(ncol(Y.train.mat), ncomp-1),
test.keepX=test.keepX, test.keepY=ncol(Y.train.mat),
mode="regression", scale=scale, near.zero.var=near.zero.var,
design=design, max.iter=max.iter, scheme =scheme, init=init,
tol=tol,
misdata = misdata, is.na.A = is.na.A.temp, ind.NA = ind.NA.temp,
ind.NA.col = ind.NA.col.temp, all.outputs=FALSE))
# `result' returns loadings and variates for all test.keepX on
# the ncomp component
# need to find the best keepX/keepY among all the tested models
#save(list=ls(),file="temp.Rdata")
# prepare the test set for the subsequent prediction: centre/scale it with
# the training statistics, and pass along its missing-value pattern
# (is.na.newdata)
# scale X.test
ind.match = 1:length(X.train)# for missing blocks in predict.R
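# sweep each test block by the training centre (and scale, if requested)
# stored as attributes on the fitted blocks result$A, so that train and
# test data live in the same space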
if (!is.null(attr(result$A[[1]], "scaled:center")))
X.test[which(!is.na(ind.match))] = lapply(which(!is.na(ind.match)),
function(x){sweep(X.test[[x]], 2, STATS =
attr(result$A[[x]], "scaled:center"))})
if (scale)
X.test[which(!is.na(ind.match))] = lapply(which(!is.na(ind.match)),
function(x){sweep(X.test[[x]], 2, FUN = "/", STATS =
attr(result$A[[x]], "scaled:scale"))})
means.Y = matrix(attr(result$A[[result$indY]], "scaled:center"),
nrow = nrow(X.test[[1]]), ncol = Q, byrow = TRUE)
if (scale)
{
sigma.Y = matrix(attr(result$A[[result$indY]], "scaled:scale"),
nrow = nrow(X.test[[1]]), ncol = Q, byrow = TRUE)
} else {
sigma.Y = matrix(1, nrow = nrow(X.test[[1]]), ncol = Q)
}
names(X.test)=names(X.train)
# record prediction results for each test.keepX
keepA = result$keepA
test.keepA = keepA[[ncomp]]
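# each row of test.keepA is one tested combination of keepX values across
# blocks; below, column i of class.comp.j stores the predicted classes of
# the left-out samples for combination i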
class.comp.j = list()
for(ijk in dist)
class.comp.j[[ijk]] =
matrix(0, nrow = length(omit), ncol = nrow(test.keepA))
#prediction of all samples for each test.keepX and nrep
# at comp fixed
# creates temporary block.splsda object to use the predict function
class(result) =
c("block.splsda", "block.spls", "sgccda", "sgcca", "DA")
result$X = result$A[-result$indY]
result$ind.mat = result$A[result$indY][[1]]
result$Y = factor(Y.train)
#save variates and loadings for all test.keepA
result.temp =
list(variates = result$variates, loadings = result$loadings)
#time2 = proc.time()
for(i in 1:nrow(test.keepA))
{
#print(i)
#only pick the loadings and variates relevant to that test.keepX
names.to.pick = NULL
if(ncomp>1)
names.to.pick = unlist(lapply(1:(ncomp-1), function(x){
paste(paste0("comp",x),apply(keepA[[x]],1,function(x)
paste(x,collapse="_")), sep=":")
}))
names.to.pick.ncomp = paste(paste0("comp",ncomp),
paste(as.numeric(keepA[[ncomp]][i,]),collapse="_"), sep=":")
names.to.pick = c(names.to.pick, names.to.pick.ncomp)
result$variates = lapply(result.temp$variates, function(x)
{if(ncol(x)!=ncomp) {x[,colnames(x)%in%names.to.pick,
drop=FALSE]}else{x}})
result$loadings = lapply(result.temp$loadings, function(x)
{if(ncol(x)!=ncomp) {x[,colnames(x)%in%names.to.pick,
drop=FALSE]}else{x}})
result$weights = .getWeights(result$variates, indY=result$indY)
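# per-block weights for the weighted vote, recomputed from the reduced
# variates (see the internal .getWeights)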
# do the prediction, we are passing to the function some
# invisible parameters:the scaled newdata and the missing values
test.predict.sw <- predict.block.spls(result,
newdata.scale = X.test, dist = dist, misdata.all=misdata,
is.na.X = is.na.A.train, is.na.newdata = is.na.A.test,
noAveragePredict=FALSE)
if(weighted ==TRUE) #WeightedVote
{
for(ijk in dist)
class.comp.j[[ijk]][, i] =
test.predict.sw$WeightedVote[[ijk]][, ncomp]
} else {#MajorityVote
for(ijk in dist)
class.comp.j[[ijk]][, i] =
test.predict.sw$MajorityVote[[ijk]][, ncomp]
}
} # end i
return(list(
class.comp.j = class.comp.j, omit = omit, keepA = keepA))
} # end fonction.j.folds
if (parallel == TRUE)
{
clusterEvalQ(cl, library(mixOmics))
clusterExport(cl, ls(), envir=environment())
result.all = parLapply(cl, 1: M, fonction.j.folds)
} else {
result.all = lapply(1: M, fonction.j.folds)
}
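# result.all[[j]] holds, for fold j: the predicted classes per tested keepX
# (class.comp.j), the left-out sample indices (omit) and the keepA grid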
keepA = result.all[[1]]$keepA
test.keepA = keepA[[ncomp]]
# combine the results
for(j in 1:M)
{
omit = result.all[[j]]$omit
#prediction.comp.j = result[[j]]$prediction.comp.j
class.comp.j = result.all[[j]]$class.comp.j
#prediction.comp[[nrep]][omit, , ] = prediction.comp.j
for(ijk in dist)
class.comp[[ijk]][omit,nrep, ] = class.comp.j[[ijk]]
}
if (progressBar == TRUE)
setTxtProgressBar(pb, (M*nrep)/(M*nrepeat))
} #end nrep 1:nrepeat
#names(prediction.comp) =
# class.comp[[ijk]] is a matrix containing all prediction for test.keepX,
# all nrepeat and all distance, at comp fixed
keepA.names = apply(test.keepA[, 1:length(X)], 1, function(x)
paste(x, collapse = "_"))
result = list()
error.mean = error.sd = error.per.class.keepX.opt.comp = keepX.opt =
test.keepX.out = mat.error.final = choice.keepX.out = list()
#save(list=ls(), file="temp22.Rdata")
if (any(measure == "overall"))
{
for(ijk in dist)
{
rownames(class.comp[[ijk]]) = rownames(X[[1]])
colnames(class.comp[[ijk]]) = paste0("nrep.", 1:nrepeat)
dimnames(class.comp[[ijk]])[[3]] = keepA.names
#finding the best keepX depending on the error measure:
# overall or BER
# classification error for each nrep and each test.keepX:
# summing over all samples
error = apply(class.comp[[ijk]],c(3,2),function(x)
{
length(Y) - sum(as.character(Y) == x, na.rm=TRUE)
})
# we want to average the error per keepX over nrepeat and choose
# the minimum error
error.mean[[ijk]] = apply(error,1,mean)/length(Y)
if (!nrepeat == 1)
error.sd[[ijk]] = apply(error,1,sd)/length(Y)
mat.error.final[[ijk]] = error/length(Y)
# percentage of misclassification error for each test.keepX (rows)
# and each nrepeat (columns)
min.error = min(error.mean[[ijk]])
min.keepX = rownames(error)[which(error.mean[[ijk]] == min.error)]
# vector of all keepX combination that gives the minimum error
a = lapply(min.keepX, function(x)
{as.numeric(strsplit(x, "_")[[1]][1:length(X)])})
#transform keepX in percentage of variable per dataset, so we choose
# the minimal overall
p = sapply(X,ncol)
percent = sapply(a, function(x) sum(x/p))
ind.opt = which.min(percent) # we take only one
a = a[[ind.opt]]# vector of each optimal keepX for all block on
# component comp.real[comp]
# best keepX
opt.keepX.comp = as.list(a)
names(opt.keepX.comp) = names(X)
choice.keepX = lapply(1:length(X), function(x)
{c(choice.keepX[[x]],opt.keepX.comp[[x]])})
names(choice.keepX) = names(X)
keepX.opt[[ijk]] = which(error.mean[[ijk]] == min.error)[ind.opt]
# confusion matrix for keepX.opt
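# per-class error: off-diagonal row sums of the confusion matrix divided by
# the class sizes (summary(Y)), one column per nrepeat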
error.per.class.keepX.opt.comp[[ijk]] =apply(class.comp[[ijk]][, ,
keepX.opt[[ijk]], drop = FALSE], 2, function(x)
{
conf = get.confusion_matrix(truth = factor(Y), predicted = x)
out = (apply(conf, 1, sum) - diag(conf)) / summary(Y)
})
test.keepX.out[[ijk]] = keepA.names[keepX.opt[[ijk]]]
#strsplit(keepA.names[keepX.opt[[ijk]]],":")[[1]][2]
# single entry of the keepX for each block
result$"overall"$error.rate.mean = error.mean
if (!nrepeat == 1)
result$"overall"$error.rate.sd = error.sd
result$"overall"$confusion = error.per.class.keepX.opt.comp
result$"overall"$mat.error.rate = mat.error.final
result$"overall"$ind.keepX.opt = keepX.opt
result$"overall"$keepX.opt = test.keepX.out
result$"overall"$choice.keepX = choice.keepX
}
}
if (any(measure == "BER"))
{
for(ijk in dist)
{
rownames(class.comp[[ijk]]) = rownames(X[[1]])
colnames(class.comp[[ijk]]) = paste0("nrep.", 1:nrepeat)
dimnames(class.comp[[ijk]])[[3]] = keepA.names
error = apply(class.comp[[ijk]],c(3,2),function(x)
{
conf = get.confusion_matrix(truth = factor(Y),predicted = x)
get.BER(conf)
})
rownames(error) = keepA.names
colnames(error) = paste0("nrep.",1:nrepeat)
# average BER over the nrepeat
error.mean[[ijk]] = apply(error,1,mean)
if (!nrepeat == 1)
error.sd[[ijk]] = apply(error,1,sd)
mat.error.final[[ijk]] = error
# BER for each test.keepX (rows) and each nrepeat (columns)
min.error = min(error.mean[[ijk]])
min.keepX = rownames(error)[which(error.mean[[ijk]] == min.error)]
# vector of all keepX combination that gives the minimum error
a = lapply(min.keepX, function(x)
{as.numeric(strsplit(x, "_")[[1]][1:length(X)])})
#transform keepX in percentage of variable per dataset, so we
# choose the minimal overall
p = sapply(X,ncol)
percent = sapply(a, function(x) sum(x/p))
ind.opt = which.min(percent) # we take only one
a = a[[ind.opt]]# vector of each optimal keepX for all block on
# component comp.real[comp]
# best keepX
opt.keepX.comp = as.list(a)
names(opt.keepX.comp) = names(X)
choice.keepX = lapply(1:length(X), function(x)
{c(choice.keepX[[x]],opt.keepX.comp[[x]])})
names(choice.keepX) = names(X)
keepX.opt[[ijk]] = which(error.mean[[ijk]] == min.error)[ind.opt]
# confusion matrix for keepX.opt
error.per.class.keepX.opt.comp[[ijk]] = apply(class.comp[[ijk]][, ,
keepX.opt[[ijk]], drop = FALSE], 2, function(x)
{
conf = get.confusion_matrix(truth = factor(Y), predicted = x)
out = (apply(conf, 1, sum) - diag(conf)) / summary(Y)
})
rownames(error.per.class.keepX.opt.comp[[ijk]]) = levels(Y)
colnames(error.per.class.keepX.opt.comp[[ijk]]) =
paste0("nrep.", 1:nrepeat)
test.keepX.out[[ijk]] = keepA.names[keepX.opt[[ijk]]]
result$"BER"$error.rate.mean = error.mean
if (!nrepeat == 1)
result$"BER"$error.rate.sd = error.sd
result$"BER"$confusion = error.per.class.keepX.opt.comp
result$"BER"$mat.error.rate = mat.error.final
result$"BER"$ind.keepX.opt = keepX.opt
result$"BER"$keepX.opt = test.keepX.out
result$"BER"$choice.keepX = choice.keepX
}
}
#result$prediction.comp = prediction.comp
result$class.comp = class.comp
return(result)
}
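# ------------------------------------------------------------------------------
# Usage sketch. MCVfold.block.splsda is an internal helper; in practice it is
# reached through the exported tune.block.splsda wrapper, as in the commented
# example below. The toy blocks, keepX grid and design are illustrative
# assumptions, not values taken from this file.
#
# library(mixOmics)
# set.seed(42)
# n = 40
# X = list(mrna = matrix(rnorm(n * 50), n, 50),
#          prot = matrix(rnorm(n * 30), n, 30))
# Y = factor(rep(c("A", "B"), each = n / 2))
# design = matrix(0.1, nrow = 2, ncol = 2)  # weak full design between blocks
# diag(design) = 0
# tuned = tune.block.splsda(X, Y, ncomp = 2,
#     test.keepX = list(mrna = c(5, 10, 15), prot = c(5, 10)),
#     design = design, validation = "Mfold", folds = 5, nrepeat = 2,
#     measure = "BER", dist = "max.dist")
# tuned$choice.keepX  # optimal keepX per block, one value per component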