R/PCA.R

Defines functions get_IN_score get_CE_score pred_missing_curve predict.funPCA funPCA

Documented in funPCA predict.funPCA

#' Functional Principal Component Analysis (FPCA) via Conditional Expectation
#'
#' FPCA is performed via PACE (Principal Analysis via Conditional Expectation) proposed by Yao et al. (2005).
#'
#' @param Lt  a list of vectors or a vector containing time points for all curves
#' @param Ly  a list of vectors or a vector containing observations for all curves
#' @param mu a mean function estimated at work.grid
#' @param cov a covariance function estimated at work.grid
#' @param sig2 a noise variance
#' @param work.grid a work grid
#' @param K a number of FPCs. If K is NULL, K is selected by PVE.
#' @param PVE a proportion of variance explained
#'
#' @return a list contatining as follows:
#' \item{data}{a list containing Lt and Ly}
#' \item{lambda}{the first K eigenvalues}
#' \item{eig.fun}{the first K eigenvectors}
#' \item{pc.score}{the first K FPC scores}
#' \item{K}{a number of FPCs}
#' \item{PVE}{a proportion of variance explained}
#' \item{work.grid}{a work grid}
#' \item{eig.obj}{an object of the eigenanalysis}
#' \item{mu}{a mean function}
#' \item{cov}{a covariance function}
#' \item{sig2}{a noise variance}
#'
#' @examples
#' ### Generate example data
#' set.seed(100)
#' x.list <- sim_delaigle(n = 100,
#'                        type = "partial",
#'                        out.prop = 0.2,
#'                        dist = "normal")
#' x <- list2matrix(x.list)
#'
#' ### Estimate robust covariance function
#' work.grid <- seq(0, 1, length.out = 51)
#' cov.obj <- cov_ogk(x,
#'                    type = "huber",
#'                    bw = 0.1)
#'
#' ### Functional principal component analysis
#' fpca.obj <- funPCA(Lt = x.list$Lt,
#'                    Ly = x.list$Ly,
#'                    mu = cov.obj$mean,
#'                    cov = cov.obj$cov,
#'                    work.grid = work.grid,
#'                    PVE = 0.95)
#' fpc.score <- fpca.obj$pc.score
#'
#' # ### Give same result in the above
#' # fpca.obj <- robfpca.partial(x,
#' #                             type = "huber",
#' #                             PVE = 0.95,
#' #                             bw = 0.1)
#' # fpc.score <- fpca.obj$pc.score
#'
#' @export
funPCA <- function(Lt,
                   Ly,
                   mu,
                   cov,
                   sig2 = 0,
                   work.grid = NULL,
                   K = NULL,
                   PVE = 0.99) {
    n <- length(Lt)   # number of observations

    # grid for numerical integration
    if (is.null(work.grid)) {
        time.range <- range(unlist(Lt))
        work.grid <- seq(time.range[1], time.range[2],
                         length.out = nrow(cov))
    }

    # eigen analysis
    eig.obj <- get_eigen(cov, work.grid)

    # fitted covariance which is transformed to positive semi-definite
    cov <- eig.obj$cov_psd

    # # noise variance
    # if (is.null(sig2)) {
    #     sig2 <- 0
    # }

    # number of PCs
    if (is.null(K)){
        K <- which(eig.obj$PVE >= PVE)[1]
        PVE <- eig.obj$PVE[K]   # PVE
    }

    ## estimate PC scores via conditional expectation
    # - for loop is as fast as sapply!
    pc_score <- matrix(NA, n, K)

    ## If there exist complete curves, compute by matrix mutliplication.
    # complete curves - calculate matrix multiplication
    ind_complete <- sapply(Lt, function(t) { identical(work.grid, t) })
    if (sum(ind_complete) > 0) {
        complete_curves <- list2rbind(Ly[ind_complete])   # combine complete curves
        pc_score[ind_complete, ] <- get_CE_score(work.grid,
                                                 complete_curves,
                                                 mu,
                                                 cov,
                                                 sig2,
                                                 eig.obj,
                                                 K,
                                                 work.grid)
        # index of incomplete curves
        ind_snippet <- (1:n)[!ind_complete]
    } else {
        # does not exist complete curves
        ind_snippet <- 1:n
    }

    # snippets or partially observed curves - calculate individually
    for (i in ind_snippet) {
        pc_score[i, ] <- get_CE_score(Lt[[i]],
                                      Ly[[i]],
                                      mu,
                                      cov,
                                      sig2,
                                      eig.obj,
                                      K,
                                      work.grid)
    }


    res <- list(
        data = list(Lt = Lt,
                    Ly = Ly),
        lambda = eig.obj$lambda[1:K],
        eig.fun = matrix(eig.obj$phi[, 1:K],
                         ncol = K),
        pc.score = pc_score,
        K = K,
        PVE = eig.obj$PVE[K],
        work.grid = work.grid,
        eig.obj = eig.obj,
        mu = mu,
        cov = cov,
        sig2 = sig2
    )

    class(res) <- "funPCA"

    return(res)
}


#' Reconstruction via functional PCA
#' newdata should be a list containing Lt and Ly.
#'
#' @param object a \code{funPCA} object from \code{funPCA()}
#' @param newdata a list containing \code{Lt} and \code{Ly}
#' @param K a number of PCs for reconstruction
#' @param ... Not used
#'
#' @method predict funPCA
#'
#' @export
predict.funPCA <- function(object, newdata = NULL, K = NULL, ...) {
    if (class(object) != "funPCA") {
        stop("Check the class of input object! class name 'funPCA' is only supported!")
    }

    if (is.null(K)) {
        K <- object$K
    }
    if (K > object$K) {
        stop(paste0("Selected number of PCs from funPCA object is less than K."))
    }

    if (is.null(newdata)) {
        pc.score <- matrix(object$pc.score[, 1:K],
                           ncol = K)
        n <- nrow(pc.score)
    } else {
        Lt <- newdata$Lt
        Ly <- newdata$Ly
        n <- length(Lt)

        pc.score <- matrix(NA, n, K)
        for (i in 1:n) {
            pc.score[i, ] <- get_CE_score(Lt[[i]],
                                          Ly[[i]],
                                          object$mu,
                                          object$cov,
                                          object$sig2,
                                          object$eig.obj,
                                          K,
                                          object$work.grid)
        }
    }

    mu <- matrix(rep(object$mu, n),
                 nrow = n, byrow = TRUE)
    eig.fun <- matrix(object$eig.fun[, 1:K],
                      ncol = K)
    pred <- mu + pc.score %*% t(eig.fun)   # reconstructed curves

    return(pred)
}


### Obtain reconstructed curve for missing parts and simple curve registration
pred_missing_curve <- function(x, pred, grid = NULL, align = FALSE, conti = TRUE) {
    num_grid <- length(x)
    if (is.null(grid)) {
        grid <- seq(0, 1, length.out = num_grid)
    }

    pred_missing <- rep(NA, num_grid)   # leave only missing parts

    # Wheter missing is outer only or is in the middle
    # is_snippets == TRUE, missing is outer side.
    # is_snippets == FALSE, missing is in middle.
    is_snippets <- (max( diff( which(!is.na(x)) ) ) == 1)
    if (is_snippets) {
        obs_range <- range(which(!is.na(x)))   # index range of observed periods

        if ((obs_range[1] > 1) & (obs_range[2] < num_grid)) {
            # start and end
            A_i <- obs_range[1] - 1
            B_i <- obs_range[2] + 1

            if (align == TRUE) {
                pred_missing[1:A_i] <- pred[1:A_i] - pred[A_i] + x[A_i + 1]
                pred_missing[B_i:num_grid] <- pred[B_i:num_grid] - pred[B_i] + x[B_i - 1]
            } else {
                pred_missing[1:A_i] <- pred[1:A_i]
                pred_missing[B_i:num_grid] <- pred[B_i:num_grid]
            }

            # add true endpoints
            if (conti == TRUE) {
                pred_missing[c(A_i+1, B_i-1)] <- x[c(A_i+1, B_i-1)]
            }
        } else if ((obs_range[1] > 1) | (obs_range[2] < num_grid)) {
            if (obs_range[1] > 1) {
                # start periods
                A_i <- obs_range[1] - 1

                if (align == TRUE) {
                    pred_missing[1:A_i] <- pred[1:A_i] - pred[A_i] + x[A_i + 1]
                } else {
                    pred_missing[1:A_i] <- pred[1:A_i]
                }

                # add true endpoints
                if (conti == TRUE) {
                    pred_missing[A_i+1] <- x[A_i+1]
                }
            } else if (obs_range[2] < num_grid) {
                # end periods
                B_i <- obs_range[2] + 1

                if (align == TRUE) {
                    pred_missing[B_i:num_grid] <- pred[B_i:num_grid] - pred[B_i] + x[B_i - 1]
                } else {
                    pred_missing[B_i:num_grid] <- pred[B_i:num_grid]
                }

                # add true endpoints
                if (conti == TRUE) {
                    pred_missing[B_i-1] <- x[B_i-1]
                }
            }
        }
    } else {
        # missing is in the middle.
        na_range <- range(which(is.na(x)))   # index range of missing parts

        # last observed point (different with above case!!)
        A_i <- na_range[1] - 1
        B_i <- na_range[2] + 1

        if (align == TRUE) {
            # align gradient(slope) and shift to the true scale
            slope <- (x[B_i] - x[A_i]) / (pred[B_i] - pred[A_i])
            intercept <- x[A_i] - slope * pred[A_i]
            pred_missing[A_i:B_i] <- slope*pred[A_i:B_i] + intercept

            # small missing values => Not align
            if (diff(na_range) >= 3) {
                # check convexity and difference of slope
                # if (+ => -) or (- => +), other transform is performed.
                # at least 7 obs are required to compare 3rd and 5th
                if (A_i - 3 > 0) {
                    # missing start part
                    first_deriv <- get_deriv(c(x[(A_i-3):A_i],
                                               pred_missing[(A_i+1):(A_i+3)]),
                                             grid[(A_i-3):(A_i+3)])
                    is_convex <- is.convex(c(x[(A_i-3):A_i],
                                             pred_missing[(A_i+1):(A_i+3)]),
                                           grid[(A_i-3):(A_i+3)])
                } else if (B_i + 3 <= num_grid) {
                    # missing end part
                    first_deriv <- get_deriv(c(pred_missing[(B_i-3):(B_i-1)],
                                               x[B_i:(B_i+3)]),
                                             grid[(B_i-3):(B_i+3)])
                    is_convex <- is.convex(c(pred_missing[(B_i-3):(B_i-1)],
                                             x[B_i:(B_i+3)]),
                                           grid[(B_i-3):(B_i+3)])
                }

                # proportional transform having linear gradient of slope
                # second derivative is changed or differnece of first derivative > 10
                if ( xor(is_convex[3], is_convex[5]) || abs(first_deriv[3] - first_deriv[5]) > 10 ) {
                    g <- function(t, y) {
                        numerator <- x[B_i] / pred[B_i] - x[A_i] / pred[A_i]
                        gamma_prime <- numerator / (grid[B_i] - grid[A_i])
                        const <- x[A_i] / pred[A_i] - gamma_prime*grid[A_i]
                        gamma <- gamma_prime * t + const

                        return(gamma * y)
                    }
                    pred_missing[A_i:B_i] <- g(grid[(A_i):(B_i)],
                                               pred[(A_i):(B_i)])
                }
            }
        } else {
            pred_missing[A_i:B_i] <- pred[A_i:B_i]
        }

        # remove true endpoints
        if (conti == FALSE) {
            pred_missing[c(A_i, B_i)] <- NA
        }
    }

    return(pred_missing)
}


### Get PC scores via conditional expectation
# - t: observed time points
# - y: matrix => fully observed curves
# - y: vector => snippets or partially observed curve
get_CE_score <- function(t, y, mu, cov, sig2, eig.obj, K, work.grid) {
    phi <- matrix(eig.obj$phi[, 1:K],
                  ncol = K)
    lambda <- eig.obj$lambda[1:K]
    Sigma_y <- cov + diag(sig2, nrow = nrow(cov))
    # Sigma_Y <- fpca.yao$smoothedCov
    # Sigma_Y <- eig.yao$phi %*% diag(eig.yao$lambda) %*% t(eig.yao$phi)


    # # convert phi and fittedCov to obsGrid.
    # mu <- ConvertSupport(work.grid, obs.grid, mu = mu)
    # phi <- ConvertSupport(work.grid, obs.grid, phi = phi)
    # Sigma_Y <- ConvertSupport(work.grid, obs.grid,
    #                           Cov = Sigma_y)
    # # get subsets at each observed grid for conditional expectation
    # mu_y_i <- approx(obs.grid, mu, t)$y
    # phi_y_i <- apply(phi, 2, function(eigvec){
    #   return(approx(obs.grid, eigvec, t)$y)
    # })
    # Sigma_y_i <- matrix(pracma::interp2(obs.grid,
    #                                     obs.grid,
    #                                     Sigma_y,
    #                                     rep(t, each = length(t)),
    #                                     rep(t, length(t))),
    #                     length(t),
    #                     length(t))

    # get CE scores via matrix multiplication for fully observed curves
    if (is.matrix(y)) {
        n <- nrow(y)   # number of complete curves

        # obtain PC score via conditional expectation
        y_mu <- y - matrix(rep(mu, n),
                           nrow = n,
                           byrow = TRUE)
        lamda_phi <- diag(lambda, ncol = K) %*% t(phi)
        # Sigma_y_mu <- solve(Sigma_y, t(y_mu))
        Sigma_y_mu <- MASS::ginv(Sigma_y) %*% t(y_mu)
        xi <- lamda_phi %*% Sigma_y_mu

        return( t(xi) )
    }

    # get subsets at each observed grid for conditional expectation
    if (!identical(work.grid, t)) {
        mu_y_i <- stats::approx(work.grid, mu, t)$y
        phi_y_i <- apply(phi, 2, function(eigvec){
            return(stats::approx(work.grid, eigvec, t)$y)
        })
        Sigma_y_i <- matrix(pracma::interp2(work.grid,
                                            work.grid,
                                            Sigma_y,
                                            rep(t, each = length(t)),
                                            rep(t, length(t))),
                            length(t),
                            length(t))
    } else {
        mu_y_i <- mu
        phi_y_i <- phi
        Sigma_y_i <- Sigma_y
    }

    # obtain PC score via conditional expectation
    lamda_phi <- diag(lambda, ncol = K) %*% t(phi_y_i)
    # Sigma_y_i_mu <- solve(Sigma_y_i, y - mu_y_i)
    Sigma_y_i_mu <- MASS::ginv(Sigma_y_i) %*% matrix(y - mu_y_i, ncol = 1)
    xi <- lamda_phi %*% Sigma_y_i_mu

    return(as.numeric(xi))
}


### Get PC scores using numerical integration
get_IN_score <- function(t, y, mu, cov, sig2, eig.obj, K, work.grid) {
    phi <- eig.obj$phi[, 1:K]
    lambda <- eig.obj$lambda[1:K]
    Sigma_y <- cov + diag(sig2, nrow = nrow(cov))

    # get subsets at each observed grid for numerical integration
    mu_y_i <- stats::approx(work.grid, mu, t)$y
    phi_y_i <- apply(phi, 2, function(eigvec){
        return(stats::approx(work.grid, eigvec, t)$y)
    })

    # obtain PC score using numerical integration
    h <- (y - mu_y_i) %*% phi_y_i
    xi <- fdapace::trapzRcpp(t, h)

    return(as.numeric(xi))
}
statKim/robfpca documentation built on April 15, 2023, 10:12 p.m.