R/SCM.R

#' Symbolic Covariance Method
#'
#' \code{SCM()} fits a linear regression model to interval-valued data based on the symbolic covariance matrix (Xu, 2010).
#' @param formula an object of class \code{\link[stats]{formula}}, a symbolic description of the model to be fitted.
#' @param data a data frame containing the variables in the model.
#'
#' @details
#' The SCM proposed by Xu (2010) estimates the regression coefficients using a symbolic covariance matrix. SCM works with the centered linear regression model (a model with centered variables).
#' The regression coefficients are estimated by least squares, with the symbolic covariance matrix used in place of the classical one. Because the symbolic sample covariance is computed from the lower and upper bounds of each variable, SCM reflects the internal variability of the intervals.
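#'
#' As implemented here, the symbolic sample covariance between two interval variables \eqn{X_j} and \eqn{X_k}, observed as \eqn{[x_{ijL}, x_{ijU}]} for observations \eqn{i = 1, \dots, n}, is
#' \deqn{S_{jk} = \frac{1}{6n}\sum_{i=1}^{n}\left[2(x_{ijL}-\bar{x}_j)(x_{ikL}-\bar{x}_k)+(x_{ijL}-\bar{x}_j)(x_{ikU}-\bar{x}_k)+(x_{ijU}-\bar{x}_j)(x_{ikL}-\bar{x}_k)+2(x_{ijU}-\bar{x}_j)(x_{ikU}-\bar{x}_k)\right],}
#' where \eqn{\bar{x}_j = \sum_{i=1}^{n}(x_{ijL}+x_{ijU})/(2n)} is the symbolic sample mean. The slope vector of the centered model is then \eqn{\hat{\beta} = S_{xx}^{-1} S_{xy}} and the intercept is \eqn{\bar{y} - \bar{x}'\hat{\beta}}.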
#'
#' @note In the dataset, each interval variable must be supplied as a pair of columns ordered from lower to upper bound. To apply this function, the data should be arranged as follows:
#' \tabular{cccccc}{
#'   \eqn{y_L}     \tab \eqn{y_U}     \tab \eqn{x_{1L}}   \tab \eqn{x_{1U}}   \tab \eqn{x_{2L}}   \tab \eqn{x_{2U}}\cr
#'   \eqn{y_{1L}}  \tab \eqn{y_{1U}}  \tab \eqn{x_{11L}}  \tab \eqn{x_{11U}}  \tab \eqn{x_{12L}}  \tab \eqn{x_{12U}}\cr
#'   \eqn{y_{2L}}  \tab \eqn{y_{2U}}  \tab \eqn{x_{21L}}  \tab \eqn{x_{21U}}  \tab \eqn{x_{22L}}  \tab \eqn{x_{22U}}\cr
#'   \eqn{y_{3L}}  \tab \eqn{y_{3U}}  \tab \eqn{x_{31L}}  \tab \eqn{x_{31U}}  \tab \eqn{x_{32L}}  \tab \eqn{x_{32U}}\cr
#'   \eqn{y_{4L}}  \tab \eqn{y_{4U}}  \tab \eqn{x_{41L}}  \tab \eqn{x_{41U}}  \tab \eqn{x_{42L}}  \tab \eqn{x_{42U}}\cr
#'   \eqn{y_{5L}}  \tab \eqn{y_{5U}}  \tab \eqn{x_{51L}}  \tab \eqn{x_{51U}}  \tab \eqn{x_{52L}}  \tab \eqn{x_{52U}}\cr
#' }
#' @note The upper bound of each interval must always be greater than its lower bound. Otherwise, the output may contain \code{NA} or \code{NaN} and valid estimates cannot be produced.
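#'
#' For instance, with the bundled \code{example3} data the ordering of a pair of bound columns can be verified as follows (the same check applies to any interval variable):
#' \preformatted{
#' all(example3$Sepal.Length_U > example3$Sepal.Length_L)
#' }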
#'
#' @return \item{symbolic.covariance.Sxx}{Symbolic sample variance-covariance matrix of the predictor variables X.}
#' @return \item{symbolic.covariance.Sxy}{Symbolic sample covariance vector between the response variable Y and the predictor variables X.}
#' @return \item{coefficients}{Estimated regression coefficients (intercept and slopes).}
#' @return \item{fitted.values}{The fitted values for the lower and upper interval bounds.}
#' @return \item{residuals}{The residuals for the lower and upper interval bounds.}
#'
#' @references Xu, W. (2010). Symbolic Data Analysis: Interval-Valued Data Regression.
#'
#' @examples
#' data(example3)
#' m1 <- SCM(cbind(Sepal.Length_L, Sepal.Length_U) ~ Sepal.Width_L + Sepal.Width_U +
#'   Petal.Length_L + Petal.Length_U, data = example3)
#' m1
#' m1$coefficients
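#' # other components returned by SCM()
#' head(m1$fitted.values)
#' head(m1$residuals)
#' m1$symbolic.covariance.Sxx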
#' @seealso \code{\link{RMSE}} \code{\link{symbolic.r}}
#' @import stats
#' @export
SCM <- function(formula, data) {

  n = nrow(data)

  # input formula for regression
  formula1 = as.formula(formula)
  model_interval = model.frame(formula = formula1, data = data)

  # response variable interval value
  yinterval = model.response(model_interval)
  y = as.matrix(yinterval)

  # predictor variable interval matrix (each left: Lower, each right: Upper)
  xinterval = model.matrix(attr(model_interval, "terms"), data = data)
  x = as.matrix(xinterval)

  p = ncol(x) - 1 # number of interval-bound columns, excluding the intercept

  ### SCM STEP ###

  # symbolic sample mean
  mean_x <- numeric(p / 2)
  for (j in 1:(p / 2)) {
    mean_x[j] = sum(x[, 2 * j] + x[, 2 * j + 1]) / (2 * n)
  }

  mean_y = sum(y[, 1] + y[, 2]) / (2 * n)

  # set lower & upper bounds
  x_L <- x_U <- matrix(NA, nrow = n, ncol = p / 2)
  for (j in 1:(p / 2)) {
    x_L[, j] = x[, 2 * j]
    x_U[, j] = x[, 2 * j + 1]
  }
  y_L = y[, 1]
  y_U = y[, 2]

  # symbolic covariance
  symbolic.cov <- function(lower, upper) {

    n = nrow(lower)
    p = ncol(lower)

    x_mean = colSums(lower + upper) / (2*n)

    sym.c = matrix(NA, nrow = p, ncol = p)

    # weighted sum of cross products of lower/upper deviations (weights 2, 1, 1, 2 over 6n)
    for (j in 1:p) {
      for (k in 1:p) {
        sym.c[j, k] = sum(2 * (lower[, j] - x_mean[j]) * (lower[, k] - x_mean[k]) +
                              (lower[, j] - x_mean[j]) * (upper[, k] - x_mean[k]) +
                              (upper[, j] - x_mean[j]) * (lower[, k] - x_mean[k]) +
                          2 * (upper[, j] - x_mean[j]) * (upper[, k] - x_mean[k])) / (6 * n)
      }
    }

    return(sym.c)

  }

  # S_xx
  S_xx <- symbolic.cov(x_L, x_U)

  # S_xy
  S_xy <- matrix(NA, nrow = p / 2, ncol = 1)
  for (j in 1:(p / 2)) {
    S_xy[j, ] <- sum(2*(x_L[, j] - mean_x[j])*(y_L - mean_y) +
                       (x_L[, j] - mean_x[j])*(y_U - mean_y) +
                       (x_U[, j] - mean_x[j])*(y_L - mean_y) +
                       2*(x_U[, j] - mean_x[j])*(y_U - mean_y)) / (6*n)
  }

  ### coefficients estimate STEP ###
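  # centered-model least squares with symbolic covariances:
  # slope vector beta = S_xx^{-1} S_xy, intercept = mean_y - mean_x' beta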
  beta <- t(solve(S_xx) %*% S_xy)
  intercept <- mean_y - sum(mean_x * beta)

  estcoef <- matrix(c(intercept, beta), ncol = 1)
  var.names = colnames(x)[(1:(p / 2)) * 2] # label coefficients with the lower-bound column names
  rownames(estcoef) = c("intercept", var.names)
  colnames(estcoef) = c("coefficients")

  # fitted values (Lower & Upper)
  fitted_L <- fitted_U <- NULL
  for (i in 1:n) {
    # predict with the lower-bound columns and the upper-bound columns of x,
    # then order the two predictions to obtain a valid interval
    pred_lower <- x[i, c(1, seq(2, p, by = 2))] %*% estcoef[, 1]
    pred_upper <- x[i, c(1, seq(3, p + 1, by = 2))] %*% estcoef[, 1]
    fitted_L[i] <- min(pred_lower, pred_upper)
    fitted_U[i] <- max(pred_lower, pred_upper)
  }
  fitted_values <- cbind(fitted_L, fitted_U)
  colnames(fitted_values) <- c("fitted.Lower", "fitted.Upper")

  # residuals (Lower & Upper)
  residual_L <- y[, 1] - fitted_L
  residual_U <- y[, 2] - fitted_U
  residuals <- cbind(residual_L, residual_U)
  colnames(residuals) <- c("resid.Lower", "resid.Upper")

  ### final STEP ###
  result <- list(call = match.call(),
                 response = y,
                 predictor = x,
                 symbolic.covariance.Sxx = S_xx,
                 symbolic.covariance.Sxy = S_xy,
                 coefficients = estcoef,
                 fitted.values = fitted_values,
                 residuals = residuals)

  return(result)

}