R/check_entry.R

Defines functions get.keepA Check.entry.single Check.entry.pls Check.entry.wrapper.mint.block Check.entry.sgcca Check.entry.rgcca

#############################################################################################################
# Authors:
#   Florian Rohart, The University of Queensland, The University of Queensland Diamantina Institute, Translational Research Institute, Brisbane, QLD
#
# created: 22-04-2015
# last modified: 05-10-2017
#
# Copyright (C) 2015
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#############################################################################################################

# --------------------------------------
# check and construct keepA
# --------------------------------------
get.keepA=function(X,keepX,ncomp)
{
    # X:data
    # keepA
    
    keepA = list()
    if (missing(keepX) || length(keepX)==0)
    {
        #if keepX is missing, pls-like: keepX=ncol(X)
        for (q in 1:length(X))
        keepA[[q]]=rep(ncol(X[[q]]),max(ncomp)) #keepX
        
        names(keepA)=names(X)
    } else {
        
        if (!is.list(keepX))
        stop("'keepX' must be a list")
        
        if (length(keepX) > length(X))
        stop(paste0("length(keepX) is higher than the number of blocks in X, which is ", length(X), "."))
        
        # error if no names on keepX or not matching the names in X
        if(length(unique(names(keepX)))!=length(keepX) | sum(is.na(match(names(keepX),names(X)))) > 0)
        stop("Each entry of 'keepX' must have a unique name corresponding to a block of 'X'")
        
        # I want to match keepX to X by names
        ind.match = match(names(X), names(keepX))
        
        for (q in 1:length(X))
        {
            
            if (!is.na(ind.match[q])) # means there is a keepX with the same name as X[q] #(q <= length(keepX))
            {
                #checking entries of keepX
                if (is.list(keepX[[ind.match[q]]]))
                stop(paste0("keepX[[",ind.match[q],"]]' must be a vector"))
                
                if (any(keepX[[ind.match[q]]] > ncol(X[[q]])))
                stop(paste0("each component of 'keepX[[",ind.match[q],"]]' must be lower or equal to ncol(X[[",q,"]])=",ncol(X[[q]]),"."))
                
                if (any(keepX[[ind.match[q]]] < 0))
                stop(paste0("each component of 'keepX[[",ind.match[q],"]]' must be non negative."))
                
                if (length(keepX[[ind.match[q]]]) > ncomp[q])
                stop(paste0("length of 'keepX[[",ind.match[q],"]]' must be lower or equal to ncomp[",q,"]=",ncomp[q], "."))
                
                keepA[[q]] = keepX[[ind.match[q]]]
                if (length(keepA[[q]]) < max(ncomp))
                keepA[[q]] = c(keepA[[q]], rep(ncol(X[[q]]), max(ncomp) - length(keepA[[q]]))) #complete the keepX already provided
                
            }else{
                keepA[[q]]=rep(ncol(X[[q]]),max(ncomp))##
            }
        }
        
    }
    
    #print(keepA)
    
    names(keepA) = names(X)
    return(list(keepA=keepA))
}


# --------------------------------------
# Check.entry.single: check input parameter for a single matrix that comes from a list of matrix
# --------------------------------------
# X: a single numeric matrix
# ncomp: the number of components to include in the model
# q: position of X in a list of matrix, as used in horizontal analysis (e.g. block.pls)

Check.entry.single = function(X,  ncomp, q)
{

    #-- validation des arguments --#
    if (length(dim(X)) != 2)
    stop(paste0("'X[[", q, "]]' must be a numeric matrix."))
    
    if(! any(class(X) %in% "matrix"))
    X = as.matrix(X)
    
    if (!is.numeric(X))
    stop(paste0("'X[[", q, "]]'  must be a numeric matrix."))
    
    N = nrow(X)
    P = ncol(X)
    
    if (is.null(ncomp) || !is.numeric(ncomp) || ncomp <= 0)
    stop(paste0("invalid number of variates 'ncomp' for matrix 'X[[", q, "]]'."))
    
    ncomp = round(ncomp)
    
    # add colnames and rownames if missing
    X.names = dimnames(X)[[2]]
    if (is.null(X.names))
    {
        X.names = paste("X", 1:P, sep = "")
        dimnames(X)[[2]] = X.names
    }
    
    ind.names = dimnames(X)[[1]]
    if (is.null(ind.names))
    {
        ind.names = 1:N
        rownames(X)  = ind.names
    }
    
    if (length(unique(rownames(X))) != nrow(X))
        stop("samples should have a unique identifier/rowname")
    if (length(unique(X.names)) != P)
        stop("Unique indentifier is needed for the columns of X")
    
    return(list(X=X, ncomp=ncomp, X.names=X.names, ind.names=ind.names))
}



# --------------------------------------
# Check.entry.pls: check the entry associated with (s)PLS. Usde in 'internal_wrapper.mint.R'
# --------------------------------------
# X: numeric matrix of predictors
# Y: numeric vector or matrix of responses
# ncomp: the number of components to include in the model. Default to 2.
# keepX: number of \eqn{X} variables kept in the model on the last components.
# keepY: number of \eqn{Y} variables kept in the model on the last components.
# mode: deflation mode
# scale: boleean. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE).
# near.zero.var: boolean, see the internal \code{\link{nearZeroVar}} function (should be set to TRUE in particular for data with many zero values). Setting this argument to FALSE (when appropriate) will speed up the computations
# max.iter: integer, the maximum number of iterations.
# tol: Convergence stopping value.
# logratio: whether a log transformation will be performed, and which one,
# DA: whether the input is used for a Discrimant Analysis
# multilevel: whether a multilevel analysis will be performed (repeated measurements)


Check.entry.pls = function(X, Y, ncomp, keepX, keepY, test.keepX, test.keepY, mode, scale,
     near.zero.var, max.iter, tol, logratio, DA, multilevel)
{
    
    if (missing(mode))
    mode = "regression"
    
    if (length(mode)>1)
    mode = mode[1]
    
    if (!(mode %in% c("canonical", "invariant", "classic", "regression")))
    stop("Choose one of the four following modes: canonical, invariant, classic or regression")
    
    
    #-- validation des arguments --#
    if (length(dim(X)) != 2)
    stop("'X' must be a numeric matrix.")
    
    if(! any(class(X) %in% "matrix"))
    X = as.matrix(X)

    if (!(logratio %in% c("none", "CLR")))
    stop("Choose one of the two following logratio transformation: none or CLR")
    
    if(!is.null(multilevel))
    {
        #multilevel analysis: withinVariation and then pls-like
        # if it's DA analysis, Y and 'multilevel' are combined
        if(DA)
        {
            Y = multilevel
        }else{
            if ((nrow(X) != nrow(multilevel)))
            stop("unequal number of rows in 'X' and 'multilevel'.")
            
            Y = as.matrix(Y)
            if (!is.numeric(X) || !is.numeric(Y))
            stop("'X' and/or 'Y' must be a numeric matrix.")
        }
    }else{
        Y = as.matrix(Y)
        if (!is.numeric(X) || !is.numeric(Y))
        stop("'X' and/or 'Y' must be a numeric matrix.")
    }
    N = nrow(X)
    Q = ncol(Y)
    P= ncol(X)
    
    if ((N != nrow(Y)))
    stop("Unequal number of rows in 'X' and 'Y'.")
    
    if (is.null(ncomp) || !is.numeric(ncomp) || ncomp <= 0 || length(ncomp)>1)
    stop("invalid number of variates, 'ncomp'.")
    
    if(mode == "canonical" & ncomp>ncol(Y))
    stop("For `canonical mode', 'ncomp' needs to be lower than ncol(Y)= ",ncol(Y))


    ncomp = round(ncomp)
    if(ncomp > P)
    {
        warning("Reset maximum number of variates 'ncomp' to ncol(X) = ", P, ".")
        ncomp = P
    }
    
    if (!is.numeric(tol) | tol<=0)
    stop("tol must be non negative")
    
    if (!is.numeric(max.iter) | max.iter<=0)
    stop("max.iter must be non negative")
    
    
    # add colnames and rownames if missing
    X.names = dimnames(X)[[2]]
    if (is.null(X.names))
    {
        X.names = paste("X", 1:P, sep = "")
        dimnames(X)[[2]] = X.names
    }
    

    
    ind.names = dimnames(X)[[1]]
    if (is.null(ind.names))
    {
        ind.names = dimnames(Y)[[1]]
        if (is.null(ind.names))
        {
            ind.names = 1:N
            rownames(X) = rownames(Y) = ind.names

        } else {
            rownames(X) = ind.names
        }
        
        #rownames(X) = ind.names
    } else {
        rownames(Y) = ind.names
    }
    
    #rownames(X) = rownames(Y) = ind.names
    
    
    #if (dim(Y)[2] == 1) Y.names = "Y"
    Y.names = dimnames(Y)[[2]]
    if (is.null(Y.names))
    {
        if (dim(Y)[2] == 1)
        {
            Y.names = "Y"
        } else {
            Y.names = paste("Y", 1:Q, sep = "")
        }
        
        dimnames(Y)[[2]]=Y.names
    }
    
    if (length(unique(X.names)) != P)
    stop("Unique indentifier is needed for the columns of X")
    
    if (length(unique(Y.names)) != Q)
    stop("Unique indentifier is needed for the columns of Y")
    
    
    # check keepX
    if (missing(keepX))
    {
        keepX = rep(P, ncomp)
    } else {
        if (length(keepX)<ncomp)
        keepX = c(keepX, rep(P, ncomp - length(keepX))) #complete (with ncomp) the keepX already provided
    }
    
    # check keepY
    if (missing(keepY))
    {
        keepY = rep(Q, ncomp)
    } else {
        if (length(keepY) < ncomp)
        keepY = c(keepY, rep(Q, ncomp - length(keepY))) #complete the keepY already provided
    }


    
    
    if (any(keepX<0))
    stop("each component of 'keepX' must be non negative ")
    if (any(keepY<0))
    stop("each component of 'keepY' must be non negative ")
    
    if (any(keepX > ncol(X)))
    stop("each component of 'keepX' must be lower or equal than ", P, ".")
    if (any(keepY > ncol(Y)))
    stop("each component of 'keepY' must be lower or equal than ", Q, ".")
    

    if (!is.logical(scale))
    stop("'scale' must be either TRUE or FALSE")
    
    if (!is.logical(near.zero.var))
    stop("'near.zero.var' must be either TRUE or FALSE")
    

    ### near.zero.var, remove the variables with very small variances
    if (near.zero.var == TRUE)
    {
        nzv.A = nearZeroVar(X)
        
        if (length(nzv.A$Position) > 0)
        {
            names.remove.X = colnames(X)[nzv.A$Position]
            X = X[, -nzv.A$Position, drop=FALSE]
            warning("Zero- or near-zero variance predictors.\n Reset predictors matrix to not near-zero variance predictors.\n See $nzv for problematic predictors.")
            if (ncol(X) == 0)
            stop("No more variables in X")
            
            #need to check that the keepA[[q]] is now not higher than ncol(A[[q]])
            if (any(keepX > ncol(X)))
            {
                ind = which(keepX > ncol(X))
                keepX[ind] = ncol(X)
            }
        }
        
    }else{nzv.A=NULL}
    

    return(list(X=X, Y=Y, ncomp=ncomp, X.names=X.names, Y.names=Y.names, ind.names=ind.names, mode=mode,
        keepX=keepX, keepY=keepY, nzv.A=nzv.A))
}


# --------------------------------------
# Check.entry.wrapper.mint.block: used in 'internal_wrapper.mint.block.R'
# --------------------------------------
# X: a list of data sets (called 'blocks') matching on the same samples. Data in the list should be arranged in samples x variables, with samples order matching in all data sets. \code{NA}s are not allowed.
# Y: a factor or a class vector for the discrete outcome.
# indY: to supply if Y is missing, indicate the position of the outcome in the list X.
# ncomp: numeric vector of length the number of blocks in \code{X}. The number of components to include in the model for each block (does not necessarily need to take the same value for each block). By default set to 2 per block.
# keepX: A vector of same length as X.  Each entry keepX[i] is the number of X[[i]]-variables kept in the model on the last components.
# keepY: Only used if Y is provided. Each entry keepY[i] is the number of Y-variables kept in the model on the last components.
# study: grouping factor indicating which samples are from the same study
# design: the input design.
# init: intialisation of the algorithm, one of "svd" or "svd.single". Default to "svd"
# scheme: the input scheme, one of "horst", "factorial" or ""centroid". Default to "centroid"
# scale: boleean. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE).
# near.zero.var: boolean, see the internal \code{\link{nearZeroVar}} function (should be set to TRUE in particular for data with many zero values). Setting this argument to FALSE (when appropriate) will speed up the computations
# mode: input mode, one of "canonical", "classic", "invariant" or "regression". Default to "regression"
# tol: Convergence stopping value.
# max.iter: integer, the maximum number of iterations.

Check.entry.wrapper.mint.block = function(X,
Y,
indY, #only use if Y not provided
ncomp,
keepX,
keepY,
study, #mint
design, #block
init,
scheme, #block
scale,
near.zero.var,
mode,
tol,
max.iter)
{
    #need to give the default values of mint.block.spls to mixOmics
    
    if (!is.list(X))
    stop("X must be a list")
    
    # check names on X are unique
    if (length(unique(names(X))) != length(X))
    stop("Each block of 'X' must have a unique name.")
    #names(X)=paste0("block", 1:length(X)) #add names to the blocks if no names or not unique name for each block

    if (length(unique(unlist(lapply(X, nrow)))) != 1)
    stop("Unequal number of rows among the blocks of 'X'")
    
    #check for numeric vector instead of matrix
    if (length(unlist(lapply(X, nrow))) != length(X)) #means there is at least one NULL
    {
        nrow.all = unlist(lapply(X, nrow)) # could be NULL
        ind.null = which(is.na(match(names(X), names(nrow.all)))) #gives the vectors
        
        # we need matrices with rownames and colnames
        stop("At least one block of 'X' is not a matrix")
    }
    
    if ((missing(indY) & missing(Y)))
    stop("Either 'Y' or 'indY' is needed")
    
    if (missing(ncomp))
    ncomp = 1
    
    #check length(ncomp)=1
    if (length(ncomp) != 1)
    stop("'ncomp' must be a single value")
    
    # transform ncomp to length(X)
    ncomp = rep(ncomp, length(X))


    #check dimnames and ncomp per block of A
    for (q in 1:length(X))
    {
        check = Check.entry.single(X[[q]], ncomp[q], q=q)
        X[[q]] = check$X
        ncomp[q] = check$ncomp
    }
    
    #check ncomp[q] < ncol(X[[q]])
    for (q in 1:length(X))
    {
        ncomp[q] = round(ncomp[q])
        if(ncomp[q] > ncol(X[[q]]))
        {
            warning(paste0("Reset maximum number of variates 'ncomp[", q, "]' to ncol(X[[", q, "]])= ", ncol(X[[q]]), "."))
            ncomp[q] = ncol(X[[q]])
        }
    }
    
    # construction of keepA for X, we include Y later on if needed (i.e. if Y is provided, otherwise Y is in X[[indY]])
    check = get.keepA(X=X, keepX=keepX, ncomp=ncomp)
    keepA = check$keepA
    
    
    if (!missing(Y))# Y is not missing, so we don't care about indY
    {        
        if (!missing(indY))
        warning("'Y' and 'indY' are provided, 'Y' is used.")
        
        if (is.list(Y) | length(dim(Y)) != 2 | !is.numeric(Y))
        stop("'Y' must be a numeric matrix.")

        #check dimnames and ncomp per block of A
        check = Check.entry.single(Y, max(ncomp), q=1)
        Y = check$X
        
        if (nrow(Y)!=nrow(X[[1]]))
        stop("Different number of samples between the blocks and Y")
        
        # if not missing, we transform keepY in list to input them in get.keepA
        if (!missing(keepY))
        keepY = list(Y = keepY)
            
        check.temp.Y = get.keepA(X = list(Y = Y), keepX = keepY, ncomp = ncomp)
        keepY.temp = check.temp.Y$keepA
            
        keepA[[length(X)+1]] = keepY.temp[[1]] #add keepY in keepA

        # check design matrix before adding Y in
        A = X
        ### Start check design matrix
        if (missing(design))
        {
            design = 1 - diag(length(A)+1)
            rownames(design) = colnames(design) = c(names(A), "Y")
        } else if (ncol(design) != nrow(design) || ncol(design) < length(X) || ncol(design) > (length(X) + 1) ){#|| any(!design %in% c(0,1))) {
            stop(paste0("'design' must be a square matrix with ", length(X), "columns."))
        } else if (ncol(design) == length(X)) {
            message("Design matrix has changed to include Y; each block will be linked to Y.")
            design = rbind(cbind(design, 1), 1)
            diag(design) = 0
            rownames(design) = colnames(design) = c(names(A), "Y")
        }
        rownames(design) = colnames(design) = c(names(A), "Y")
        ### End check design matrix

        # build the list A by adding Y, and creating indY
        A[[length(A)+1]] = Y
        names(A)[length(A)] = names(keepA)[length(A)] = "Y"
        indY = length(A)
        
        if (mode == "canonical")
        ncomp = c(ncomp, min(ncomp, ncol(Y) - 1)) #adjust ncomp for Y
        
        if (mode == "regression")
        ncomp = c(ncomp, max(ncomp)) #adjust ncomp for Y
        
    } else {        #missing(Y) but indY not missing
        
        if (!missing(keepY))
        warning("indY is provided: 'keepY' is ignored and only 'keepX' is used.")

        A = X #list provided as input
        
        ### Start check design matrix
        if (missing(design))
        {
            design = 1 - diag(length(A))
        } else if (ncol(design) != nrow(design) || ncol(design) < length(A) || ncol(design) > (length(A) + 1) )#|| any( !design %in% c(0,1)))
        {
            stop(paste0("'design' must be a square matrix with ", length(A), "columns."))
        }
        rownames(design) = colnames(design) = names(A)
        ### End check design matrix
        
        # check indY
        if (!is.numeric(indY) | indY > length(A) | length(indY) > 1)
        stop(paste0("'indY' must be a numeric value lower or equal to ", length(A), ", the number of blocks in A."))

    }
    
    # --------------------------------------------------------------------------------
    # at this stage, we have A, indY, keepA, ncomp verified
    # --------------------------------------------------------------------------------
    
    # force all rownames to be the same
    ind.names=lapply(A,rownames)
    check = sapply(1:length(A),function(j){identical(ind.names[[1]],ind.names[[j]])})
    
    if(sum(check) != length(check))
    stop("Please check the rownames of the data, there seems to be some discrepancies")
    
    if (!(mode %in% c("canonical", "invariant", "classic", "regression")))
    stop("Choose one of the four following modes: canonical, invariant, classic or regression")
    
    #set the default study factor
    if (missing(study))
    {
        study = factor(rep(1, nrow(A[[1]])))
    } else {
        study = as.factor(study)
    }
    if (length(study) != nrow(A[[1]]))
    stop(paste0("'study' must be a factor of length ", nrow(A[[1]]), "."))
    
    if (any(table(study) <= 1))
    stop("At least one study has only one sample, please consider removing before calling the function again")
    if (any(table(study) < 5))
    warning("At least one study has less than 5 samples, mean centering might not do as expected")
    
    if (missing(init))
    init = "svd.single"
    
    if (!init%in%c("svd","svd.single"))
    stop("init should be one of 'svd' or 'svd.single'")
    
    if (!abs(indY - round(indY) < 1e-25))
    stop ("indY must be an integer")
    if (indY > length(A))
    stop ("indY must point to a block of A")
    
    # =====================================================
    # with or without tau (RGGCA or mint.block.spls algo)
    # =====================================================
    x = unlist(lapply(A,nrow))
    if (!isTRUE(all.equal(max(x), min(x))))
    stop("The samplesize must be the same for all blocks")
    
    #check scheme
    if (!(scheme %in% c("horst", "factorial", "centroid")))
    {
        stop("Choose one of the three following schemes: horst, centroid or factorial")
    }
    
    if (!is.numeric(tol) | tol <= 0)
    stop("tol must be non negative")
    if (!is.numeric(max.iter) | max.iter <= 0)
    stop("max.iter must be non negative")
    
    if (!is.logical(scale))
    stop("scale must be either TRUE or FALSE")
    if (!is.logical(near.zero.var))
    stop("near.zero.var must be either TRUE or FALSE")
    
    ### near.zero.var, remove the variables with very small variances
    if(near.zero.var == TRUE)
    {
        nzv.A = lapply(A, nearZeroVar)
        for(q in 1:length(A))
        {
            if (length(nzv.A[[q]]$Position) > 0)
            {
                names.remove.X = colnames(A[[q]])[nzv.A[[q]]$Position]
                A[[q]] = A[[q]][, -nzv.A[[q]]$Position, drop=FALSE]
                #if (verbose)
                #warning("Zero- or near-zero variance predictors.\n Reset predictors matrix to not near-zero variance predictors.\n See $nzv for problematic predictors.")
                if (ncol(A[[q]]) == 0)
                stop(paste0("No more variables in",A[[q]]))
                
                #need to check that the keepA[[q]] is now not higher than ncol(A[[q]])
                if (any(keepA[[q]] > ncol(A[[q]])))
                {
                    ind = which(keepA[[q]] > ncol(A[[q]]))
                    keepA[[q]][ind] = ncol(A[[q]])
                }
            }
            
        }
    } else {
        nzv.A=NULL
    }
    
    return(list(A=A, ncomp=ncomp, study=study, keepA=keepA,
    indY=indY, design=design, init=init, nzv.A=nzv.A))
}


# --------------------------------------
# Check.entry.sgcca: used in 'wrapper.sgcca.R'
# --------------------------------------
# X: a list of data sets (called 'blocks') matching on the same samples. Data in the list should be arranged in samples x variables, with samples order matching in all data sets. \code{NA}s are not allowed.
# design: the input design.
# ncomp: numeric vector of length the number of blocks in \code{X}. The number of components to include in the model for each block (does not necessarily need to take the same value for each block). By default set to 2 per block.
# scheme: the input scheme, one of "horst", "factorial" or ""centroid". Default to "centroid"
# mode: input mode, one of "canonical", "classic", "invariant" or "regression". Default to "regression"
# scale: boleean. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE).
# init: intialisation of the algorithm, one of "svd" or "svd.single". Default to "svd"
# tol: Convergence stopping value.
# max.iter: integer, the maximum number of iterations.
# near.zero.var: boolean, see the internal \code{\link{nearZeroVar}} function (should be set to TRUE in particular for data with many zero values). Setting this argument to FALSE (when appropriate) will speed up the computations
# keepX: A vector of same length as X.  Each entry keepX[i] is the number of X[[i]]-variables kept in the model on the last components.

Check.entry.sgcca = function(X,
design,
ncomp,
scheme,
mode,
scale,
init,
tol,
max.iter,
near.zero.var,
keepX)
{
    #need to give the default values of mint.block.spls to mixOmics
    
    if (!is.list(X))
    stop("X must be a list of at least two matrices")
    
    if (length(X)<2)
    stop("X must be a list of at least two matrices")
    
    if (length(unique(unlist(lapply(X, nrow)))) != 1)
    stop("Unequal number of rows among the blocks of 'X'")
    
    # check names on X are unique
    if (length(unique(names(X))) != length(X))
    stop("Each block of 'X' must have a unique name.")
    #names(X)=paste0("block", 1:length(X)) #add names to the blocks if no names or not unique name for each block

    if (missing(ncomp))
    ncomp = 1

    #check length(ncomp)=1
    if (length(ncomp) != 1)
    stop("'ncomp' must be a single value")

    # transform ncomp to length(X)
    ncomp = rep(ncomp, length(X))

    #check dimnames and ncomp per block of A
    for (q in 1:length(X))
    {
        check = Check.entry.single(X[[q]], ncomp[q], q=q)
        X[[q]] = check$X
        ncomp[q] = check$ncomp
    }
    
    #check ncomp[q]<ncol(X[[q]])
    for (q in 1:length(X))
    {
        ncomp[q] = round(ncomp[q])
        if(ncomp[q] > ncol(X[[q]]))
        {
            warning(paste0("Reset maximum number of variates 'ncomp[", q, "]' to ncol(X[[", q, "]])= ", ncol(X[[q]]), "."))
            ncomp[q] = ncol(X[[q]])
        }
    }
    
    A = X#input
    
    if (missing(init))
    init="svd"
    
    if (!init%in%c("svd", "svd.single"))
    stop("init should be one of 'svd' or 'svd.single'")
    
    if (!(mode %in% c("canonical", "invariant", "classic", "regression")))
    stop("Choose one of the four following modes: canonical, invariant, classic or regression")
    
    
    #check scheme
    if (missing(scheme))
    scheme= "horst"
    
    if (!(scheme %in% c("horst", "factorial","centroid")))
    {
        stop("Choose one of the three following schemes: horst, centroid or factorial")
    }
    
    if(missing(design))
    design = 1 - diag(length(A))
    
    # check design matrix
    if (nrow(design) != ncol(design))
    stop(paste0("'design' must be a square matrix."))
    
    if (nrow(design) != length(A))
    stop(paste0("'design' must be a square matrix with", length(A), "columns."))
    
    if (tol <= 0)
    stop("tol must be non negative")
    
    if (max.iter <= 0)
    stop("max.iter must be non negative")
    
    if (!is.logical(scale))
    stop("scale must be either TRUE or FALSE")
    if (!is.logical(near.zero.var))
    stop("near.zero.var must be either TRUE or FALSE")
    
    # construction of keepA
    check = get.keepA(X=A, keepX=keepX, ncomp=ncomp)
    keepA = check$keepA
    
    ### near.zero.var, remove the variables with very small variances
    if(near.zero.var == TRUE)
    {
        nzv.A = lapply(A,nearZeroVar)
        for (q in 1:length(A))
        {
            if (length(nzv.A[[q]]$Position) > 0)
            {
                names.remove.X = colnames(A[[q]])[nzv.A[[q]]$Position]
                A[[q]] = A[[q]][, -nzv.A[[q]]$Position, drop=FALSE]
                #if (verbose)
                #warning("Zero- or near-zero variance predictors.\n Reset predictors matrix to not near-zero variance predictors.\n See $nzv for problematic predictors.")
                if (ncol(A[[q]]) == 0)
                stop(paste0("No more variables in", A[[q]]))
                
                  #need to check that the keepA[[q]] is now not higher than ncol(A[[q]])
                if (any(keepA[[q]]>ncol(A[[q]])))
                {
                    ind = which(keepA[[q]] > ncol(A[[q]]))
                    keepA[[q]][ind] = ncol(A[[q]])
                }
            }
            
        }
    } else {
        nzv.A=NULL
    }
    
    
    return(list(A=A, ncomp=ncomp, design=design, init=init, scheme=scheme, nzv.A=nzv.A,
    keepA=keepA))
    
}



# --------------------------------------
# Check.entry.rgcca: used in 'wrapper.rgcca.R'
# --------------------------------------
# X: a list of data sets (called 'blocks') matching on the same samples. Data in the list should be arranged in samples x variables, with samples order matching in all data sets. \code{NA}s are not allowed.
# design: the input design.
# tau:
# ncomp: numeric vector of length the number of blocks in \code{X}. The number of components to include in the model for each block (does not necessarily need to take the same value for each block). By default set to 2 per block.
# scheme: the input scheme, one of "horst", "factorial" or ""centroid". Default to "centroid"
# scale: boleean. If scale = TRUE, each block is standardized to zero means and unit variances (default: TRUE).
# init: intialisation of the algorithm, one of "svd" or "svd.single". Default to "svd"
# tol: Convergence stopping value.
# max.iter: integer, the maximum number of iterations.
# near.zero.var: boolean, see the internal \code{\link{nearZeroVar}} function (should be set to TRUE in particular for data with many zero values). Setting this argument to FALSE (when appropriate) will speed up the computations
# keepX: A vector of same length as X.  Each entry keepX[i] is the number of X[[i]]-variables kept in the model on the last components.
Check.entry.rgcca = function(X,
design,
tau,
ncomp,
scheme,
scale,
init,
tol,
max.iter,
near.zero.var,
keepX)
{
    #need to give the default values of mint.block.spls to mixOmics
    
    if (!is.list(X))
    stop("X must be a list of at list two matrices")
    
    if (length(X)<2)
    stop("X must be a list of at list two matrices")
    
    if (length(unique(unlist(lapply(X, nrow)))) != 1)
    stop("Unequal number of rows among the blocks of 'X'")

    # check names on X are unique
    if (length(unique(names(X))) != length(X))
    stop("Each block of 'X' must have a unique name.")
    #names(X)=paste0("block", 1:length(X)) #add names to the blocks if no names or not unique name for each block

    if (is.null(tau))
    stop("'tau' is needed")
    
    if (missing(ncomp))
    ncomp = 1
    
    #check length(ncomp)=1
    if (length(ncomp) != 1)
    stop("'ncomp' must be a single value")
    
    # transform ncomp to length(X)
    ncomp = rep(ncomp, length(X))
    
    #check dimnames and ncomp per block of A
    for (q in 1:length(X))
    {
        check = Check.entry.single(X[[q]], ncomp[q], q=q)
        X[[q]] = check$X
        ncomp[q] = check$ncomp
    }
    
    
    #check ncomp[q]<ncol(X[[q]])
    for(q in 1:length(X))
    {
        ncomp[q] = round(ncomp[q])
        if(ncomp[q] > ncol(X[[q]]))
        {
            warning(paste0("Reset maximum number of variates 'ncomp[", q, "]' to ncol(X[[", q, "]])= ", ncol(X[[q]]), "."))
            ncomp[q] = ncol(X[[q]])
        }
    }
    
    A = X#input
    
    if (is.numeric(tau))
    {
        if (any(tau < 0) | any(tau > 1))
        stop("'tau' contains either values between 0 and 1, or 'optimal'.")
        
        if (is.vector(tau))
        {
            if (length(tau) != length(A))
            stop(paste0("'tau' must be of length ", length(A), "."))
            
            tau = matrix(rep(tau, max(ncomp)), nrow = max(ncomp), ncol = length(tau), byrow = TRUE)
        }
    } else {
        if (tau != "optimal")
        stop("'tau' contains either values between 0 and 1, or 'optimal'.")
    }
    
    if (missing(init))
    init = "svd.single"
    
    if (init != "svd.single")
    stop("init should be 'svd.single'.")
    
    # check scheme
    if(missing(scheme)) scheme = "horst"
    if (!(scheme %in% c("horst", "factorial", "centroid")))
    {
        stop("Choose one of the three following schemes: horst, centroid or factorial")
    }
    
    
    if (missing(design))
    design = 1 - diag(length(A))
    
    #check design matrix
    if (nrow(design) != ncol(design))
    stop(paste0("'design' must be a square matrix."))
    if (nrow(design) != length(A))
    stop(paste0("'design' must be a square matrix with", length(A), "columns."))
    
    
    if (missing(near.zero.var))
    near.zero.var = FALSE
    
    if (tol <= 0)
    stop("tol must be non negative")
    
    if (max.iter <= 0)
    stop("max.iter must be non negative")
    
    if (!is.logical(scale))
    stop("scale must be either TRUE or FALSE")
    if (!is.logical(near.zero.var))
    stop("near.zero.var must be either TRUE or FALSE")
    
    # construction of keepA
    check = get.keepA(X=A, keepX=keepX, ncomp=ncomp)
    keepA = check$keepA
    
    ### near.zero.var, remove the variables with very small variances
    if(near.zero.var == TRUE)
    {
        nzv.A = lapply(A,nearZeroVar)
        for (q in 1:length(A))
        {
            if (length(nzv.A[[q]]$Position) > 0)
            {
                names.remove.X = colnames(A[[q]])[nzv.A[[q]]$Position]
                A[[q]] = A[[q]][, -nzv.A[[q]]$Position, drop=FALSE]
                #if (verbose)
                #warning("Zero- or near-zero variance predictors.\n Reset predictors matrix to not near-zero variance predictors.\n See $nzv for problematic predictors.")
                if (ncol(A[[q]]) == 0)
                stop(paste0("No more variables in", A[[q]]))
                
                #need to check that the keepA[[q]] is now not higher than ncol(A[[q]])
                if (any(keepA[[q]] > ncol(A[[q]])))
                {
                    ind = which(keepA[[q]] > ncol(A[[q]]))
                    keepA[[q]][ind] = ncol(A[[q]])
                }
            }
            
        }
    } else {
        nzv.A = NULL
    }
    
    
    return(list(A=A, ncomp=ncomp, design=design, init=init, scheme=scheme, nzv.A=nzv.A,
    keepA=keepA))
}

Try the mixOmics package in your browser

Any scripts or data that you put into this service are public.

mixOmics documentation built on June 1, 2018, 5:06 p.m.