#' @title Orthogonal projections using principal component analysis and partial
#' least squares
#' @aliases ortho_projection
#' @aliases pls_projection
#' @aliases pc_projection
#' @aliases predict.ortho_projection
#' @description
#' \loadmathjax
#' Functions to perform orthogonal projections of high dimensional data matrices
#' using principal component analysis (pca) and partial least squares (pls).
#' @usage
#' ortho_projection(Xr, Xu = NULL,
#'                  Yr = NULL,
#'                  method = "pca",
#'                  pc_selection = list(method = "var", value = 0.01),
#'                  center = TRUE, scale = FALSE, ...)
#'
#' pc_projection(Xr, Xu = NULL, Yr = NULL,
#'               pc_selection = list(method = "var", value = 0.01),
#'               center = TRUE, scale = FALSE,
#'               method = "pca",
#'               tol = 1e-6, max_iter = 1000, ...)
#'
#' pls_projection(Xr, Xu = NULL, Yr,
#'                pc_selection = list(method = "opc", value = min(dim(Xr), 40)),
#'                scale = FALSE, method = "pls",
#'                tol = 1e-6, max_iter = 1000, ...)
#'
#' \method{predict}{ortho_projection}(object, newdata, ...)
#'
#' @param Xr a matrix of observations.
#' @param Xu an optional matrix containing data of a second set of observations.
#' @param Yr if the method used in the \code{pc_selection} argument is \code{"opc"}
#' or if \code{method = "pls"}, then it must be a matrix
#' containing the side information corresponding to the spectra in \code{Xr}.
#' It is equivalent to the \code{side_info} parameter of the \code{\link{sim_eval}}
#' function. In case \code{method = "pca"}, a matrix (with one or more
#' continuous variables) can also be used as input. The root mean square of
#' differences (rmsd) is used for assessing the similarity between the observations
#' and their corresponding most similar observations in terms of the side information
#' provided. A single discrete variable of class factor can also be passed. In
#' that case, the kappa index is used. See \code{\link{sim_eval}} function for more details.
#' @param method the method for projecting the data. Options are:
#' \itemize{
#' \item{\code{"pca"}:}{ principal component analysis using the singular value
#' decomposition algorithm.}
#' \item{\code{"pca.nipals"}:}{ principal component analysis using the
#' non-linear iterative partial least squares algorithm.}
#' \item{\code{"pls"}:}{ partial least squares.}
#' \item{\code{"mpls"}:}{ modified partial least squares. See details.}
#' }
#' @param pc_selection a list of length 2 which specifies the method to be used
#' for optimizing the number of components (principal components or pls factors)
#' to be retained. This list must contain two elements (in the following order):
#' \code{method} (a character indicating the method for selecting the number of
#' components) and \code{value} (a numerical value that complements the selected
#' method). The methods available are:
#' \itemize{
#'        \item{\code{"opc"}:} { optimized principal component selection based on
#'        Ramirez-Lopez et al. (2013a, 2013b). The optimal number of components
#'        of a given set of observations is the one for which its distance matrix
#'        minimizes the differences between the \code{Yr} value of each
#'        observation and the \code{Yr} value of its closest observation. In this
#'        case \code{value} must be a value (larger than 0 and
#'        below \code{min(nrow(Xr)} \code{+ nrow(Xu),} \code{ncol(Xr))})
#'        indicating the maximum number of principal components to be tested.
#'        See details.}
#'
#'        \item{\code{"cumvar"}:}{ selection of the principal components based
#'        on a given cumulative amount of explained variance. In this case,
#'        \code{value} must be a value (larger than 0 and below or equal to 1)
#'        indicating the minimum amount of cumulative variance that the
#'        combination of retained components should explain.}
#'
#'        \item{\code{"var"}:}{ selection of the principal components based
#'        on a given amount of explained variance. In this case,
#'        \code{value} must be a value (larger than 0 and below or equal to 1)
#'        indicating the minimum amount of variance that a single component should
#'        explain in order to be retained.}
#'
#'        \item{\code{"manual"}:}{ for manually specifying a fix number of
#'        principal components. In this case, \code{value} must be a value
#'        (larger than 0 and
#'        below the minimum dimension of \code{Xr} or \code{Xr} and \code{Xu}
#'        combined).
#'        indicating the minimum amount of variance that a component should
#'        explain in order to be retained.}
#'        }
#' The list \code{list(method = "var", value = 0.01)} is the default.
#' Optionally, the \code{pc_selection} argument admits \code{"opc"} or
#' \code{"cumvar"} or \code{"var"} or \code{"manual"} as a single character
#' string. In such a case the default \code{"value"} when either \code{"opc"} or
#' \code{"manual"} are used is 40. When \code{"cumvar"} is used the default
#' \code{"value"} is set to 0.99 and when \code{"var"} is used, the default
#' \code{"value"} is set to 0.01.
#' @param center a logical indicating if the data \code{Xr} (and \code{Xu} if
#' specified) must be centered. If \code{Xu} is specified the data is centered
#' on the basis of \mjeqn{Xr \cup Xu}{Xr U Xu}. NOTE: This argument only applies to the
#' principal components projection. For pls projections the data is always
#' centered.
#' @param scale a logical indicating if \code{Xr} (and \code{Xu} if specified)
#' must be scaled. If \code{Xu} is specified the data is scaled on the basis of
#' \mjeqn{Xr \cup Xu}{Xr U Xu}.
#' @param tol tolerance limit for convergence in the nipals
#' algorithm (default is 1e-06). In the case of PLS this applies only to
#' \code{Yr} with more than one variable.
#' @param max_iter maximum number of iterations (default is 1000). In the case of
#' \code{method = "pls"} this applies only to \code{Yr} matrices with more than
#' one variable.
#' @param ... additional arguments to be passed
#' to \code{pc_projection} or \code{pls_projection}.
#' @param object object of class \code{"ortho_projection"}.
#' @param newdata an optional data frame or matrix in which to look for variables
#' with which to predict. If omitted, the scores of the projection are returned.
#' If supplied, it must contain the same variables (columns) used to build the
#' projection, in the same order.
#' @details
#' In the case of \code{method = "pca"}, the algorithm used is the singular value
#' decomposition in which a given data matrix (\mjeqn{X}{X}) is factorized as follows:
#'
#'  \mjdeqn{X = UDV^{T}}{X = UDV^{\mathrm{T}}}
#'
#' where \mjeqn{U}{U} and \mjeqn{V}{V} are orthogonal matrices containing the left
#' and right singular vectors of \mjeqn{X}{X}, respectively, and \mjeqn{D}{D} is a
#' diagonal matrix containing the singular values of \mjeqn{X}{X}.
#' The matrix of principal component scores is obtained by a matrix
#' multiplication of \mjeqn{U}{U} and \mjeqn{D}{D}, and the matrix of principal component
#' loadings is equivalent to the matrix \mjeqn{V}{V}.
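#'
#' As a minimal illustration (not the exact internal code), the relation between
#' the output of base R's \code{svd} and the returned scores and loadings for a
#' centered matrix \code{X0} can be sketched as:
#' \preformatted{
#' dec <- svd(X0)
#' pc_scores   <- sweep(dec$u, 2, dec$d, "*")  # scores: U multiplied by D
#' pc_loadings <- t(dec$v)                     # loadings: transpose of V
#' }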
#'
#' When \code{method = "pca.nipals"}, the algorithm used for principal component
#' analysis is the non-linear iterative partial least squares (nipals).
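#'
#' A single nipals component can be sketched in plain R as follows (assuming a
#' centered matrix \code{X0}; the package relies on a compiled implementation of
#' this algorithm):
#' \preformatted{
#' t_scores <- X0[, 1]                # initialize the score vector
#' for (i in 1:max_iter) {
#'   p <- crossprod(X0, t_scores)     # loading direction: X0' t
#'   p <- p / sqrt(sum(p^2))          # normalize the loading
#'   t_new <- crossprod(t(X0), p)     # update the scores: X0 p
#'   if (sum((t_new - t_scores)^2) < tol) break
#'   t_scores <- t_new
#' }
#' # deflate X0 by subtracting t p' and repeat for the next component
#' }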
#'
#'
#' In the case of the partial least squares projection (a.k.a. projection
#' to latent structures), the nipals regression algorithm is used by default.
#' Details on the nipals algorithm are presented in Martens (1991). Another
#' method called modified pls (\code{'mpls'}) can also be used. The modified
#' pls was proposed by Shenk and Westerhaus (1991, see also Westerhaus, 2014) and
#' differs from the standard pls method in the way the weights of \code{Xr}
#' (used to compute the matrix of scores) are obtained. While pls uses the
#' covariance between \code{Yr} and \code{Xr} (and later their deflated versions
#' corresponding to each pls component iteration) to obtain these weights, the
#' modified pls uses the correlation as weights. The authors indicate that by
#' using correlation, a larger portion of the response variable(s) can be explained.
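#'
#' The difference between the two weighting schemes can be sketched for the
#' first component as follows (assuming a centered matrix \code{X0} and a
#' centered, single-column \code{Y0}; this is only illustrative and not the
#' internal implementation):
#' \preformatted{
#' w_pls  <- crossprod(X0, Y0)   # covariance-based weights ("pls")
#' w_mpls <- cor(X0, Y0)         # correlation-based weights ("mpls")
#' # in both cases the weights are normalized to unit length before
#' # the scores are computed
#' w_pls  <- w_pls / sqrt(sum(w_pls^2))
#' w_mpls <- w_mpls / sqrt(sum(w_mpls^2))
#' }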
#'
#'
#' When \code{method = "opc"}, the selection of the components is carried out by
#' using an iterative method based on the side information concept
#' (Ramirez-Lopez et al. 2013a, 2013b). First let be \mjeqn{P}{P} a sequence of
#' retained components (so that \mjeqn{P = 1, 2, ...,k }{P = 1, 2, ...,k }).
#' At each iteration, the function computes a dissimilarity matrix retaining
#' \mjeqn{p_i}{p_i} components. The values in this side information variable are
#' compared against the side information values of their most spectrally similar
#' observations (closest \code{Xr} observation).
#' The optimal number of components retrieved by the function is the one that
#' minimizes the root mean squared differences (RMSD) in the case of continuous
#' variables, or maximizes the kappa index in the case of categorical variables.
#' In this process, the \code{\link{sim_eval}} function is used.
#' Note that for the \code{"opc"} method \code{Yr} is required (i.e. the
#' side information of the observations).
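#'
#' The core of this procedure can be sketched for a continuous \code{Yr} as
#' follows (an illustrative approximation, where \code{scores} stands for a
#' matrix of candidate scores and \code{max_comp} for the maximum number of
#' components to be tested; internally the evaluation is delegated to
#' \code{\link{sim_eval}}):
#' \preformatted{
#' rmsd_per_ncomp <- sapply(1:max_comp, function(j) {
#'   d <- as.matrix(dist(scores[, 1:j, drop = FALSE])) # dissimilarities, j components
#'   diag(d) <- Inf                                     # ignore self-matches
#'   nn <- apply(d, 1, which.min)                       # most similar observation
#'   sqrt(mean((Yr - Yr[nn])^2))                        # rmsd of the side information
#' })
#' best_ncomp <- which.min(rmsd_per_ncomp)
#' }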
#'
#' @return
#' a \code{list} of class \code{ortho_projection} with the following
#' components:
#' \itemize{
#'  \item{\code{scores}}{ a matrix of scores corresponding to the observations in
#'  \code{Xr} (and \code{Xu} if it was provided). The components retrieved
#'  correspond to the ones optimized or specified.}
#'  \item{\code{X_loadings}}{ a matrix of loadings corresponding to the
#'  explanatory variables. The components retrieved correspond to the ones
#'  optimized or specified.}
#'  \item{\code{Y_loadings}}{ a matrix of partial least squares loadings
#'  corresponding to \code{Yr}. The components retrieved  correspond to the
#'  ones optimized or specified.
#'  This object is only returned if the partial least squares algorithm was used.}
#'  \item{\code{weights}}{ a matrix of partial least squares ("pls") weights.
#'  This object is only returned if the "pls" algorithm was used.}
#'  \item{\code{projection_mat}}{ a matrix that can be used to project new data
#'  onto a "pls" space. This object is only returned if the "pls" algorithm was
#'  used.}
#'  \item{\code{variance}}{ a list with information on the original variance and
#'  the explained variances. This list contains a matrix indicating the amount of
#'  variance explained by each component (var), the ratio between explained
#'  variance by each single component and the original variance (explained_var) and
#'  the cumulative ratio of explained variance (cumulative_explained_var).
#'  The amount of variance explained by each component is computed by multiplying
#'  its score vector by its corresponding loading vector and calculating the
#'  variance of the result. These values are computed based on the observations
#'  used to create the projection matrices. For example, if the "pls" method was
#'  used, then these values are computed based only on the data that contain
#'  information on \code{Yr} (i.e. the \code{Xr} data). If the principal
#'  component method is used, then they are computed on the basis of
#'  \code{Xr} and \code{Xu} (if it applies), since both matrices are employed
#'  in the computation of the projection matrix (the loadings in this case).}
#'  \item{\code{scores_sd}}{ the standard deviation of the retrieved scores. This
#'  vector can be different from the "sd" in \code{variance}.}
#'  \item{\code{n_components}}{ the number of components (either principal
#'  components or partial least squares components) used for computing the
#'  global dissimilarity scores.}
#'  \item{\code{opc_evaluation}}{ a matrix containing the statistics computed
#'  for optimizing the number of principal components based on the variable(s)
#'  specified in the \code{Yr} argument. If \code{Yr} was a continuous vector or
#'  matrix, then this object indicates the root mean square of differences (rmsd)
#'  for each number of components. If \code{Yr} was a
#'  categorical variable this object indicates the kappa values for each number
#'  of components. This object is returned only if \code{"opc"} was used within
#'  the \code{pc_selection} argument. See the \code{\link{sim_eval}} function for
#'  more details.}
#'  \item{\code{method}}{ the \code{ortho_projection} method used.}
#'  }
#'  \code{predict.ortho_projection} returns a matrix of projected scores for
#'  \code{newdata}.
#' @author \href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez}
#' @references
#' Martens, H. (1991). Multivariate calibration. John Wiley & Sons.
#'
#' Ramirez-Lopez, L., Behrens, T., Schmidt, K., Stevens, A., Dematte, J.A.M.,
#' Scholten, T. 2013a. The spectrum-based learner: A new local approach for
#' modeling soil vis-NIR spectra of complex data sets. Geoderma 195-196, 268-279.
#'
#' Ramirez-Lopez, L., Behrens, T., Schmidt, K., Viscarra Rossel, R., Dematte,
#' J. A. M.,  Scholten, T. 2013b. Distance and similarity-search metrics for use
#' with soil vis-NIR spectra. Geoderma 199, 43-53.
#'
#' Shenk, J. S., & Westerhaus, M. O. 1991. Populations structuring of
#' near infrared spectra and modified partial least squares regression.
#' Crop Science, 31(6), 1548-1555.
#'
#' Shenk, J., Westerhaus, M., and Berzaghi, P. 1997. Investigation of a LOCAL
#' calibration procedure for near infrared instruments. Journal of Near Infrared
#' Spectroscopy, 5, 223-232.
#'
#' Westerhaus, M. 2014. Eastern Analytical Symposium Award for outstanding
#' achievements in near infrared spectroscopy: my contributions to
#' near infrared spectroscopy. NIR news, 25(8), 16-20.
#'
#' @seealso \code{\link{ortho_diss}}, \code{\link{sim_eval}}, \code{\link{mbl}}
#' @examples
#' \donttest{
#' library(prospectr)
#' data(NIRsoil)
#'
#' # Preprocess the data using detrend plus first derivative with Savitzky and
#' # Golay smoothing filter
#' sg_det <- savitzkyGolay(
#'   detrend(NIRsoil$spc,
#'     wav = as.numeric(colnames(NIRsoil$spc))
#'   ),
#'   m = 1,
#'   p = 1,
#'   w = 7
#' )
#' NIRsoil$spc_pr <- sg_det
#'
#' # split into training and testing sets
#' test_x <- NIRsoil$spc_pr[NIRsoil$train == 0 & !is.na(NIRsoil$CEC), ]
#' test_y <- NIRsoil$CEC[NIRsoil$train == 0 & !is.na(NIRsoil$CEC)]
#'
#' train_y <- NIRsoil$CEC[NIRsoil$train == 1 & !is.na(NIRsoil$CEC)]
#' train_x <- NIRsoil$spc_pr[NIRsoil$train == 1 & !is.na(NIRsoil$CEC), ]
#'
#' # A principal component analysis using 5 components
#' pca_projected <- ortho_projection(train_x, pc_selection = list("manual", 5))
#' pca_projected
#'
#' # A principal components projection using the "opc" method
#' # for the selection of the optimal number of components
#' pca_projected_2 <- ortho_projection(
#'   Xr = train_x, Xu = test_x, Yr = train_y,
#'   method = "pca",
#'   pc_selection = list("opc", 40)
#' )
#' pca_projected_2
#' plot(pca_projected_2)
#'
#' # A partial least squares projection using the "opc" method
#' # for the selection of the optimal number of components
#' pls_projected <- ortho_projection(
#'   Xr = train_x, Xu = test_x, Yr = train_y,
#'   method = "pls",
#'   pc_selection = list("opc", 40)
#' )
#' pls_projected
#' plot(pls_projected)
#'
#' # A partial least squares projection using the "cumvar" method
#' # for the selection of the optimal number of components
#' pls_projected_2 <- ortho_projection(
#'   Xr = train_x, Xu = test_x, Yr = train_y,
#'   method = "pls",
#'   pc_selection = list("cumvar", 0.99)
#' )
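#'
#' # Illustrative use of the predict method: project (new) observations onto
#' # an existing projection (here the pls space of pls_projected_2); the
#' # variable names in the new data must match those used to build the
#' # projection
#' projected_test <- predict(pls_projected_2, newdata = test_x)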
#' }
#' @rdname ortho_projection
#' @export

######################################################################
# resemble
# Copyright (C) 2014 Leonardo Ramirez-Lopez and Antoine Stevens
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
######################################################################

## History:
## 09.03.2014 Leo     It was specified in the doc that multi-threading is
##                    not working for mac
## 10.03.2014 Leo     Sanity check for the number of cases in Yr and Xr
## 13.03.2014 Antoine The explanation of the cores argument was modified
## 03.12.2015 Leo     The pls projection function is now based on Rcpp
## 04.12.2015 Leo     The center argument was removed from the pls projection
##                    function. Matrices are now always centered.
## 04.12.2015 Leo     the dist function needs to be replaced by the fDiss.
##                    dist() retrieves squared distances!
## 07.12.2015 Leo     The dist function was removed from these functions (it was
##                    implemented for testing for a while but was never finally
##                    released).
## 07.12.2015 Leo     The pc_projection, pls_projection and the predict.orthoProjection
##                    are now visible.
## 08.09.2016 Leo     A bug in the predict.ortho_projection function was fixed.
##                    An error was thrown when PCA predictions were requested.
##                    Furthermore, the PLS score predictions were handled wrongly
##                    as PCA predictions.
## 13.09.2016 Leo     Documentation generated by roxygen returns duplicated
##                    pc_projection
##                    pls_projection docs. For the moment the workaround is to
##                    fix them in the Rd file of ortho_projection
## 24.07.2017 Leo     error in the predict function of ortho_projection the code
##                    was written as:
##                    if(length(grep("pca", object) == 0)) do PC projection...
##                    i.e. if the method is "pca" do not execute the PC projection.
##                    This was fixed by if(length(grep("pca", object) != 0))
##                    do PC projection
## 23.09.2018 Leo     add a newdata names check in predict.ortho_projection;
##                    they need to correspond to variable names in ortho_projection
##                    object
## 03.07.2019 Leo     there was a problem retrieving the optimal number of pcs in
##                    pc_projection when the 'opc' method was used in combination
##                    with missing data in Yr. The neighbors retrieved were not
##                    excluding missing values, generating a miscalculation of the
##                    rmsd. This was solved by removing missing values before
##                    the calculations with the fast_diss_vector.
## 22.04.2020 Leo     Argument scaled renamed to scale
##                    Argument max_compter renamed to max_iter
##                    the call is now always part of the error message (this was
##                    not the case when the function was being called from inside
##                    other functions)
##                    Helpers were added (see ortho_helpers.R)
## 30.04.2020 Leo     pca.nipals implemented in c++
##                    cores is deprecated
ortho_projection <- function(Xr, Xu = NULL,
                             Yr = NULL,
                             method = "pca",
                             pc_selection = list(method = "var", value = 0.01),
                             center = TRUE, scale = FALSE, ...) {
  method <- match.arg(method, c("pca", "pca.nipals", "pls", "mpls"))

  if (method %in% c("pls", "mpls")) {
    if (missing(Yr)) {
      stop("'Yr' is missing for pls method")
    }
    if (!is.numeric(as.matrix(Yr))) {
      stop("When pls projection is used, 'Yr' must be numeric")
    }
    proj <- pls_projection(
      Xr = Xr, Yr = Yr, Xu = Xu, method = method, pc_selection = pc_selection,
      scale = scale, ...
    )
    mthd <- method
  } else {
    mthd <- ifelse(method == "pca", "pca (svd)", "pca (nipals)")
    proj <- pc_projection(
      Xr = Xr, Yr = Yr, Xu = Xu, pc_selection = pc_selection,
      center = center, scale = scale, method = method, ...
    )
  }

  proj$method <- mthd
  class(proj) <- c("ortho_projection", "list")

  proj
}


#' @aliases ortho_projection
#' @aliases pls_projection
#' @aliases pc_projection
#' @aliases predict.ortho_projection
#' @importFrom stats quantile complete.cases diffinv
#' @export pc_projection
pc_projection <- function(Xr, Xu = NULL, Yr = NULL,
                          pc_selection = list(method = "var", value = 0.01),
                          center = TRUE, scale = FALSE,
                          method = "pca",
                          tol = 1e-6, max_iter = 1000, ...) {
  pc_selection_method <- pc_selection[[1]]
  match.arg(pc_selection_method, c("opc", "var", "cumvar", "manual"))

  match.arg(method, c("pca", "pca.nipals"))

  if (!is.logical(center)) {
    stop("'center' must be logical")
  }

  if (!is.logical(scale)) {
    stop("'scale' must be logical")
  }

  if (!is.null(Yr)) {
    Yr <- as.matrix(Yr)
    # if (!is.matrix(Yr)) {
    #   stop("Yr must be a matrix")
    # }
    if (nrow(Yr) != nrow(Xr)) {
      stop(paste0(
        "The number of observations in 'Yr' does not match with the ",
        "number of observations in 'Xr'"
      ))
    }
  }

  ny <- ncol(Yr)

  if (!is.null(Xu)) {
    if (ncol(Xr) != ncol(Xu)) {
      stop("Number of columns in 'Xr' and 'Xu' do not match")
    }
  }

  effective_rows_xr <- nrow(Xr)
  n_cols_xr <- ncol(Xr)
  # here ifelse is preferred over if_else as the latter returns an error
  # because it evaluates nrow(Xu) when Xu == NULL
  effective_rows_xu <- ifelse(is.null(Xu), 0, nrow(Xu))
  Xr <- rbind(Xr, Xu)
  dparam <- check_pc_arguments(
    n_rows_x = effective_rows_xr + effective_rows_xu,
    n_cols_x = n_cols_xr,
    pc_selection = pc_selection,
    default_max_comp = 40,
    default_max_cumvar = 0.99,
    default_max_var = 0.01
  )
  pc_selection <- dparam$pc_selection_checked
  max_comp <- dparam$max_comp

  pc_selection_copy <- pc_selection

  # center
  if (center) {
    mean_vector <- colMeans(Xr)
    X0 <- sweep(x = Xr, MARGIN = 2, FUN = "-", STATS = mean_vector)
  } else {
    mean_vector <- rep(0, ncol(Xr))
    X0 <- Xr
  }

  if (scale) {
    sd_vector <- get_column_sds(X0)
    X0 <- sweep(x = X0, MARGIN = 2, FUN = "/", STATS = sd_vector)
  } else {
    sd_vector <- rep(1, ncol(X0))
  }

  if (method == "pca") {
    sv_decomposition <- svd(x = X0, nu = max_comp, nv = max_comp)
    sv_decomposition$d <- sv_decomposition$d[1:max_comp]
    if (length(sv_decomposition$d) == 1) {
      # scores
      pc_scores <- sv_decomposition$u %*% as.matrix(sv_decomposition$d)
    } else {
      # scores
      pc_scores <- sv_decomposition$u %*% diag(sv_decomposition$d)
    }

    # Loadings
    pc_loadings <- t(sv_decomposition$v)

    # Variance of each PC variable
    sdPC <- get_column_sds(pc_scores)
    # Compute the percentage of explained variance for all the PCs
    ons <- (sv_decomposition$d)^2
    xvariance <- overall_var(X0)
    explained_v <- ons / xvariance
    cumulative_v <- cumsum(explained_v)
    variance <- rbind(
      var = ons,
      explained_var = explained_v,
      cumulative_explained_var = cumulative_v
    )
  }

  if (method == "pca.nipals") {
    nipals_pca <- pca_nipals(
      X = X0,
      ncomp = max_comp,
      center = FALSE,
      scale = FALSE,
      maxiter = max_iter,
      tol = tol,
      pcSelmethod = pc_selection_method,
      pcSelvalue = pc_selection_copy$value
    )

    pc_scores <- nipals_pca$pc_scores
    pc_loadings <- nipals_pca$pc_loadings
    explained_v <- nipals_pca$pc_variance
    xvariance <- nipals_pca$original_x_variance
    variance <- rbind(
      var = explained_v[1, ],
      explained_var = explained_v[2, ],
      cumulative_explained_var = explained_v[3, ]
    )
    colnames(variance) <- paste0("pc_", 1:ncol(variance))
  }

  # assign names
  colnames(pc_scores) <- paste0("pc_", 1:ncol(pc_scores))
  rownames(pc_scores) <- c(
    paste0("Xr_", 1:effective_rows_xr),
    if (!is.null(Xu)) {
      paste0("Xu_", 1:effective_rows_xu)
    }
  )
  colnames(pc_loadings) <- colnames(X0)
  rownames(pc_loadings) <- paste0("pc_", 1:nrow(pc_loadings))

  if (pc_selection_method == "opc") {
    if (is.null(Yr) | !is.matrix(Yr)) {
      stop(paste0(
        "'Yr' msut be a matrix when the 'opc' method is used for ",
        "selecting the optimal number of principal components"
      ))
    }
    if (nrow(Yr) != effective_rows_xr) {
      stop("The number of rows in Xr does not match the number of cases in Yr")
    }
    if (!is.null(colnames(Yr))) {
      if (sum(duplicated(colnames(Yr))) > 0) {
        stop("column names in Yr must be different")
      }
    } else {
      colnames(Yr) <- paste0("Yr_", 1:ny)
    }
    results <- eval_multi_pc_diss(pc_scores[, 1:max_comp],
      side_info = Yr,
      method = "pc",
      check_dims = FALSE
    )
    selected_pcs <- results$best_pc
    results <- results$result
  }

  if (pc_selection_method == "cumvar") {
    selected_pcs <- variance[3, ] < pc_selection_copy$value
    selected_pcs <- sum(selected_pcs) + 1
    if (selected_pcs > ncol(Xr)) {
      selected_pcs <- ncol(Xr)
    }
  }

  if (pc_selection_method == "var") {
    selected_pcs <- variance[2, ] >= pc_selection_copy$value
    selected_pcs <- sum(selected_pcs)
  }

  if (pc_selection_method == "manual") {
    selected_pcs <- (1:ncol(pc_scores)) <= pc_selection_copy$value
    selected_pcs <- sum(selected_pcs)
  }

  if (pc_selection_method == "opc") {
    selected_pcs <- sum(selected_pcs)
  }

  scores_sd <- get_column_sds(pc_scores[, 1:selected_pcs, drop = FALSE])
  colnames(scores_sd) <- colnames(pc_scores[, 1:selected_pcs, drop = FALSE])
  rownames(scores_sd) <- "sd"

  fresults <- list(
    scores = pc_scores[, 1:selected_pcs, drop = FALSE],
    X_loadings = pc_loadings[1:selected_pcs, , drop = FALSE],
    variance = list(
      original_x_var = xvariance,
      x_var = variance[, 1:selected_pcs, drop = FALSE]
    ),
    scores_sd = scores_sd,
    n_components = selected_pcs, pc_selection = pc_selection,
    center = mean_vector,
    scale = sd_vector
  )
  colnames(fresults$variance$x_var) <- rownames(fresults$X_loadings)
  fresults$method <- ifelse(method == "pca", "pca (svd)", "pca (nipals)")
  if (pc_selection_method == "opc") {
    fresults$opc_evaluation <- results
  }
  class(fresults) <- c("ortho_projection", "list")

  fresults
}


#' @aliases ortho_projection
#' @aliases pls_projection
#' @aliases pc_projection
#' @aliases predict.ortho_projection
#' @export pls_projection
pls_projection <- function(Xr, Xu = NULL, Yr,
                           pc_selection = list(method = "opc", value = min(dim(Xr), 40)),
                           scale = FALSE,
                           method = "pls",
                           tol = 1e-6, max_iter = 1000, ...) {
  pc_selection_method <- pc_selection[[1]]
  match.arg(pc_selection_method, c("opc", "var", "cumvar", "manual"))

  if (!is.logical(scale)) {
    stop("'scale' argument must be logical")
  }


  if (missing(Yr)) {
    stop("'Yr' must be provided")
  } else {
    Yr <- as.matrix(Yr)
    if (!is.numeric(Yr)) {
      stop("'Yr' must be a numeric matrix")
    }
  }

  if (!is.null(Xu)) {
    if (ncol(Xr) != ncol(Xu)) {
      stop("Number of columns in 'Xr' and 'Xu' do not match")
    }
  }


  match.arg(method, c("pls", "mpls"))


  if (is.null(pc_selection$value)) {
    pc_selection_value <- pc_selection[[2]]
  } else {
    pc_selection_value <- pc_selection$value
  }

  nas <- rowSums(is.na(Yr)) > 0

  ny <- ncol(Yr)

  X0 <- Xr
  Y0 <- Yr
  non_nas_yr <- 1:nrow(Xr)
  if (sum(nas) > 0) {
    nas_yr <- (1:nrow(Xr))[nas]
    non_nas_yr <- (1:nrow(Xr))[!nas]
    Xout <- Xr[nas_yr, , drop = FALSE]
    X0 <- Xr[non_nas_yr, ]
    Y0 <- Yr[non_nas_yr, , drop = FALSE]
    if (pc_selection_method %in% c("opc", "manual")) {
      if (min(dim(X0)) < pc_selection_value) {
        stop(paste0(
          "Missing values in Yr. The number of components specified ",
          "in 'pc_selection' exceeds the number of observations with ",
          "non-missing Yr values. Try another number of components."
        ))
      }
    }
  }


  effective_rows_xr <- nrow(X0)
  n_cols_xr <- ncol(X0)

  dparam <- check_pc_arguments(
    n_rows_x = effective_rows_xr,
    n_cols_x = n_cols_xr,
    pc_selection = pc_selection,
    default_max_comp = 40,
    default_max_cumvar = 0.99,
    default_max_var = 0.01
  )
  pc_selection <- dparam$pc_selection_checked
  max_comp <- dparam$max_comp

  # weights <- matrix(NA, max_comp, ncol(X0))
  # scores <- matrix(NA, nrow(X0), max_comp)
  # X_loadings <- matrix(NA, max_comp, ncol(X0))
  # Y_loadings <- matrix(NA, max_comp, ny)
  # pls_variance <- matrix(NA, 3, max_comp)

  if (pc_selection_method %in% c("opc", "manual")) {
    pc_selection$value <- pc_selection_value - 1
    cpp_method <- "manual"
  } else {
    cpp_method <- pc_selection_method
  }


  plsp <- opls_for_projection(
    X = X0,
    Y = as.matrix(Y0),
    ncomp = max_comp,
    scale = scale,
    maxiter = max_iter,
    tol = tol,
    pcSelmethod = cpp_method,
    pcSelvalue = pc_selection$value,
    algorithm = method
  )
  max_comp <- plsp$ncomp

  if (pc_selection_method == "opc") {
    if (is.null(Yr) | !is.matrix(Yr)) {
      stop(paste0(
        "'Yr' msut be a matrix when the 'opc' method is used for ",
        "selecting the optimal number of principal components"
      ))
    }
    if (nrow(Yr) != nrow(Xr)) {
      stop("The number of rows in Xr does not match the number of cases in Yr")
    }
    if (!is.null(colnames(Yr))) {
      if (sum(duplicated(colnames(Yr))) > 0) {
        stop("column names in Yr must be different")
      }
    } else {
      colnames(Y0) <- colnames(Yr) <- paste0("Yr_", 1:ny)
    }
    results <- eval_multi_pc_diss(plsp$scores[, 1:max_comp],
      side_info = as.matrix(Y0),
      method = "pls",
      check_dims = FALSE
    )
    max_comp <- results$best_pc
    results <- results$result
  }


  # Select the necessary components
  pls_variance <- plsp$variance$x_var[, 1:max_comp, drop = FALSE]
  weights <- plsp$weights[1:max_comp, , drop = FALSE]
  scores <- plsp$scores[, 1:max_comp, drop = FALSE]
  X_loadings <- plsp$X_loadings[1:max_comp, , drop = FALSE]
  Y_loadings <- plsp$Y_loadings[1:max_comp, , drop = FALSE]
  plsp$projection_mat <- plsp$projection_mat[, 1:max_comp, drop = FALSE]

  # Give some names...
  rownames(plsp$projection_mat) <- colnames(X_loadings) <- colnames(X0)
  rownames(X_loadings) <- colnames(plsp$projection_mat) <- paste0("pls_", 1:max_comp)
  rownames(Y_loadings) <- rownames(X_loadings)
  colnames(Y_loadings) <- paste0("Y_pls", 1:ny)
  rownames(weights) <- rownames(X_loadings)
  colnames(weights) <- colnames(X_loadings)
  colnames(pls_variance) <- rownames(X_loadings)
  rownames(pls_variance) <- c("var", "explained_var_X", "cumulative_explained_var_X")

  yex <- plsp$variance$y_var[, 1:max_comp, drop = FALSE]

  colnames(yex) <- rownames(Y_loadings)
  if (ny > 1) {
    rownames(yex) <- paste0("cumulative_explained_var_", colnames(Yr))
  } else {
    rownames(yex) <- "cumulative_explained_var_Yr"
  }

  if (sum(nas) > 0) {
    scores.a <- matrix(NA, length(c(non_nas_yr, nas_yr)), ncol(scores))
    scores.a[nas_yr, ] <- project_opls(
      projection_mat = plsp$projection_mat,
      ncomp = max_comp,
      newdata = Xout,
      scale = scale,
      Xcenter = plsp$transf$Xcenter,
      Xscale = plsp$transf$Xscale
    )
    scores.a[non_nas_yr, ] <- scores
    scores <- scores.a
  }

  rownames(scores) <- paste0("Xr_", 1:nrow(scores))
  colnames(scores) <- rownames(Y_loadings)


  if (!is.null(Xu)) {
    if (is.vector(Xu)) {
      Xu <- t(Xu)
    }
    scores_Xu <- project_opls(
      projection_mat = plsp$projection_mat,
      ncomp = max_comp,
      newdata = Xu,
      scale = scale,
      Xcenter = plsp$transf$Xcenter,
      Xscale = plsp$transf$Xscale
    )

    colnames(scores_Xu) <- rownames(Y_loadings)
    rownames(scores_Xu) <- paste0("Xu_", 1:nrow(Xu))
    scores <- rbind(scores, scores_Xu)
  }

  if (!nrow(plsp$transf$Xscale)) {
    plsp$transf$Xscale <- matrix(1, 1, length(plsp$transf$Xcenter))
  }

  scores_sd <- get_column_sds(scores)
  colnames(scores_sd) <- colnames(scores)
  rownames(scores_sd) <- "sd"
  fresults <- list(
    scores = scores,
    X_loadings = X_loadings,
    Y_loadings = Y_loadings,
    weights = weights,
    projection_mat = plsp$projection_mat[, 1:ncol(scores), drop = FALSE],
    variance = list(
      original_x_var = plsp$variance$original_x_var,
      x_var = pls_variance, y_var = yex
    ),
    scores_sd = scores_sd,
    n_components = max_comp,
    pc_selection = pc_selection,
    center = plsp$transf$Xcenter,
    scale = plsp$transf$Xscale
  )
  fresults$method <- "pls"
  if (pc_selection_method == "opc") {
    fresults$opc_evaluation <- results
  }
  class(fresults) <- c("ortho_projection", "list")

  fresults
}

#' @aliases ortho_projection
#' @aliases pls_projection
#' @aliases pc_projection
#' @aliases predict.ortho_projection
#' @export
predict.ortho_projection <- function(object, newdata, ...) {
  if (missing(newdata)) {
    return(object$scores)
  }

  nms.org <- colnames(object$X_loadings)
  nms.nd <- colnames(newdata)

  if (sum(!nms.org %in% nms.nd) != 0) {
    stop("There are missing variables in new data that are required for the projection")
  } else {
    if (length(grep("pca", object$method)) != 0) {
      newdata <- sweep(newdata, MARGIN = 2, FUN = "-", STATS = object$center)
      newdata <- sweep(newdata, MARGIN = 2, FUN = "/", STATS = object$scale)
      return(newdata %*% t(object$X_loadings))
    } else {
      predpoj <- project_opls(
        projection_mat = object$projection_mat,
        ncomp = ncol(object$projection_mat),
        newdata = newdata,
        scale = TRUE,
        Xcenter = object$center,
        Xscale = object$scale
      )

      colnames(predpoj) <- paste0("pls", 1:ncol(predpoj))
      rownames(predpoj) <- rownames(newdata)

      return(predpoj)
    }
  }
}
