R/LRCoDa.R

#' @importFrom stats lm
#' @import dplyr
#' @import tidyr
#' @importFrom robustbase lmrob
#' @importFrom robustbase lmrob.control
#' @importFrom robCompositions pivotCoord
#' @importFrom sjmisc is_empty
#' @importFrom forcats fct_recode
NULL


#' Classical and robust regression of non-compositional (real) response on compositional and non-compositional predictors combined
#'
#' Delivers appropriate inference for regression of y on compositional and non-compositional combined input X.
#'
#' Compositional explanatory variables should not be directly used in a linear regression model because any inference statistic
#' can become misleading. While various approaches for this problem were proposed, here an approach based on the pivot coordinates
#' is used. Further these compositional explanatory variables can be supplemented with external non-compositional data
#' and factor variables.
#'
#' @aliases LRCoDa ilrregression robilrregression
#' @param y The response which should be non-compositional
#' @param X The compositional and/or non-compositional predictors as a matrix, data.frame or numeric vector
#' @param external Specify the columns name of the external variables. The name has to be introduced as follows:
#' external = c("variable_name"). Multiple selection is supported for the external variable. Factor variables are
#' automatically detected.
#' @param method If \dQuote{robust}, the fast MM-type linear regression is applied, while with method
#' \dQuote{classical}, the conventional least squares regression is applied.
#' @param pivot_norm if FALSE then the normalizing constant is not used, if TRUE sqrt((D-i)/(D-i+1))
#' is used (default). The user can also specify a self-defined constant
#' @param max_refinement_steps (for the fast-S algorithm): maximal number of refinement
#' steps for the “fully” iterated best candidates.
#' @return An object of class \sQuote{lmrob} or \sQuote{lm} and two summary
#' objects.
#' @author Roman Wiedemeier
#' @seealso \code{\link{lm}}, \code{\link{lmrob}} and \code{\link{pivotCoord}}
#' @keywords models compositional data linear regression
#' @export
#' @examples
#'
#' ## How the content of sand of the agricultural
#' ## and grazing land soils in Germany depend on
#' ## relative contributions of the main chemical trace elements,
#' ## their different soil types and the Annual mean temperature:
#' data("gemas")
#' gemas$COUNTRY <- as.factor(gemas$COUNTRY)
#' gemas_GER <- dplyr::filter(gemas, gemas$COUNTRY == 'GER')
#' y <- gemas_GER$sand
#' X <- dplyr::select(gemas_GER, c(MeanTemp, soilclass, Al:Zr))
#' LRCoDa(y, X, external = c('MeanTemp', 'soilclass'),
#' method='classical', pivot_norm = 'orthonormal')
#' LRCoDa(y, X, external = c('MeanTemp', 'soilclass'),
#' method='robust', pivot_norm = 'orthonormal')
LRCoDa <- function (y, X, external = NULL, method = "robust", pivot_norm = 'orthonormal', max_refinement_steps = 200) { # ltsReg mit lmrob ersetzen und dann sollte die Fehlermeldung verschwinden

  if (!is.null(external) & (typeof(external) != "character")) {
    stop("Invalid datatype for external")
  }
  if ((any(external == names(X %>% select_if(is.factor)))) == FALSE & (any(sapply(X, function(x) !is.numeric(x))))) {
    stop("Variable with datatype character or factor have to be defined as external")
  }
  if (any(is.na(y))){
    dat <- cbind(y, X)
    dat_missing <- dat %>% filter(is.na(y))
    n <- dim(dat_missing)[1]
    dat_new <- dat %>% filter(!is.na(y))

    X <- dat_new %>% select(-c(y))
    y <- dat_new %>% select(c(y))
  }
  if (!is.null(external) & (any(sapply(X, function(x) is.numeric(x))))){
    external_col <- X %>% select(all_of(external)) %>% select_if(is.numeric)
    if (all(sapply(external_col, function(x) is.numeric(x)))){
      n_externals <- length(external_col)
    } else {
      stop("Datatype of all external non-factor variables have to be numeric")
    }
  } else {
    external_col = NULL
  }
  if (!is.null(external) & (any(sapply(X, function(x) !is.numeric(x))))) {
    X <- X %>% mutate_if(sapply(X, is.character), as.factor)
    factor_var <- X %>% select(all_of(external)) %>% select_if(is.factor)
    factor_col <- factor_var[, 1]
    if (is.factor(factor_col) | is.character(factor_col)){
      n_levels <- length(unique(as.character(factor_col)))
    } else {
      stop("Datatype of factor variable has to be factor or character")
    }
    if (any(is_empty(unique(as.character(factor_col)), first.only = FALSE, all.na.empty = TRUE) == TRUE)){
      stop("Dataset contains levels with empty strings or missing values. Specify factor name or drop these observations.")
    }
  } else {
    factor_col = NULL
  }
  if (!is.null(factor_col) & length(factor_var) > 1) {
    stop("There are more than 1 factor variable defined")
  }

  ilrregression <- function(X, y, external, pivot_norm) {

    if (!is.null(factor_col) & !is.null(external_col)){
      X_selected <- X %>% select(-all_of(c(external)))
      ZV <- data.frame(cbind(y, X %>% relocate(all_of(c(names(factor_var), names(external_col)))) %>%
                               rename_with(.cols = colnames(X %>% select(all_of(external))), function(x){paste0("External.", x)}) %>%
                               rename_with(.cols = colnames(X %>% select(-all_of(c(external)))), function(x){paste0("Internal.", x)})))
      lmcla <- lm(y ~ ., data = ZV)
      lmcla.sum <- summary(lmcla)
      ilr.sum <- lmcla.sum

      for (j in 1:(ncol(X)-1-n_externals)) {
        Zj <- pivotCoord(cbind(X_selected[, j], X_selected[, -j]), method = pivot_norm)
        ZVj <- data.frame(Factor = factor_col, Externals = external_col, Z = Zj)
        dj <- data.frame(y = y, Z = ZVj)
        res <- lm(y ~ ., data = dj)
        res.sum <- summary(res)
        if (j == 1) {
          ilr.sum$coefficients[1:(n_levels+n_externals+1), ] <- res.sum$coefficients[1:(n_levels+n_externals+1), ]
          ilr.sum$residuals <- res.sum$residuals
          ilr.sum$sigma <- res.sum$sigma
          ilr.sum$r.squared <- res.sum$r.squared
          ilr.sum$adj.r.squared <- res.sum$adj.r.squared
          ilr.sum$fstatistic <- res.sum$fstatistic
        }
        else {
          ilr.sum$coefficients[j + n_levels + n_externals, ] <- res.sum$coefficients[(n_levels+n_externals+1), ]

        }
      }
    }
    if (is.null(factor_col) & is.null(external_col)){
      ZV <- data.frame(cbind(y, X %>% rename_with(.cols = everything(), function(x){paste0("Internal.", x)})))
      lmcla <- lm(y ~ ., data = ZV)
      lmcla.sum <- summary(lmcla)
      ilr.sum <- lmcla.sum

      for (j in 1:ncol(X)) {
        Zj <- pivotCoord(cbind(X[, j], X[, -j]), method = pivot_norm)
        dj <- data.frame(y = y, Z = Zj)
        res <- lm(y ~ ., data = dj)
        res.sum <- summary(res)
        if (j == 1) {
          ilr.sum$coefficients[1:2, ] <- res.sum$coefficients[1:2, ]
          ilr.sum$residuals <- res.sum$residuals
          ilr.sum$sigma <- res.sum$sigma
          ilr.sum$r.squared <- res.sum$r.squared
          ilr.sum$adj.r.squared <- res.sum$adj.r.squared
          ilr.sum$fstatistic <- res.sum$fstatistic
        }
        else {
          ilr.sum$coefficients[j + 1, ] <- res.sum$coefficients[2, ]
        }
      }
    }
    if (!is.null(factor_col) & is.null(external_col)){
      X_selected <- X %>% select(-all_of(c(external)))
      ZV <- data.frame(cbind(y, X %>% relocate(all_of(c(names(factor_var)))) %>%
                               rename_with(.cols = colnames(X %>% select(all_of(external))), function(x){paste0("External.", x)}) %>%
                               rename_with(.cols = colnames(X %>% select(-all_of(c(external)))), function(x){paste0("Internal.", x)})))
      lmcla <- lm(y ~ ., data = ZV)
      lmcla.sum <- summary(lmcla)
      ilr.sum <- lmcla.sum

      for (j in 1:(ncol(X)-1)) {
        Zj <- pivotCoord(cbind(X_selected[, j], X_selected[, -j]), method = pivot_norm)
        ZVj <- data.frame(Factor = factor_col, Z = Zj)
        dj <- data.frame(y = y, Z = ZVj)
        res <- lm(y ~ ., data = dj)
        res.sum <- summary(res)
        if (j == 1) {
          ilr.sum$coefficients[1:(n_levels+1), ] <- res.sum$coefficients[1:(n_levels+1), ]
          ilr.sum$residuals <- res.sum$residuals
          ilr.sum$sigma <- res.sum$sigma
          ilr.sum$r.squared <- res.sum$r.squared
          ilr.sum$adj.r.squared <- res.sum$adj.r.squared
          ilr.sum$fstatistic <- res.sum$fstatistic
        }
        else {
          ilr.sum$coefficients[j + n_levels, ] <- res.sum$coefficients[(n_levels+1), ]
        }
      }
    }
    if (is.null(factor_col) & !is.null(external_col)){
      X_selected <- X %>% select(-all_of(c(external)))
      ZV <- data.frame(cbind(y, X %>% relocate(all_of(c(names(external_col)))) %>%
                               rename_with(.cols = colnames(X %>% select(all_of(external))), function(x){paste0("External.", x)}) %>%
                               rename_with(.cols = colnames(X %>% select(-all_of(c(external)))), function(x){paste0("Internal.", x)})))
      lmcla <- lm(y ~ ., data = ZV)
      lmcla.sum <- summary(lmcla)
      ilr.sum <- lmcla.sum

      for (j in 1:(ncol(X)-n_externals)) {
        Zj <- pivotCoord(cbind(X_selected[, j], X_selected[, -j]), method = pivot_norm)
        ZVj <- data.frame(Externals = external_col, Z = Zj)
        dj <- data.frame(y = y, Z = ZVj)
        res <- lm(y ~ ., data = dj)
        res.sum <- summary(res)
        if (j == 1) {
          ilr.sum$coefficients[1:(n_externals+2), ] <- res.sum$coefficients[1:(n_externals+2), ]
          ilr.sum$residuals <- res.sum$residuals
          ilr.sum$sigma <- res.sum$sigma
          ilr.sum$r.squared <- res.sum$r.squared
          ilr.sum$adj.r.squared <- res.sum$adj.r.squared
          ilr.sum$fstatistic <- res.sum$fstatistic
        }
        else {
          ilr.sum$coefficients[j + n_externals + 1, ] <- res.sum$coefficients[(n_externals+2), ]
        }
      }
    }
    list(lm = lmcla, lm = lmcla.sum, ilr = ilr.sum)
  }

  robilrregression <- function(X, y, external, pivot_norm) {
    cont_lmrob <- lmrob.control(fast.s.large.n = Inf, k.max = max_refinement_steps)

    if (!is.null(factor_col) & !is.null(external_col)){
      X_selected <- X %>% select(-all_of(c(external)))
      ZV <- data.frame(cbind(y, X %>% relocate(all_of(c(names(factor_var), names(external_col)))) %>%
                               rename_with(.cols = colnames(X %>% select(all_of(external))), function(x){paste0("External.", x)}) %>%
                               rename_with(.cols = colnames(X %>% select(-all_of(c(external)))), function(x){paste0("Internal.", x)})))
      lmcla <- robustbase::lmrob(y ~ ., data = ZV, control = cont_lmrob)
      lmcla.sum <- summary(lmcla)
      ilr.sum <- lmcla.sum

      for (j in 1:(ncol(X)-1-n_externals)) {
        Zj <- pivotCoord(cbind(X_selected[, j], X_selected[, -j]), method = pivot_norm)
        ZVj <- data.frame(Factor = factor_col, Externals = external_col, Z = Zj)
        dj <- data.frame(y = y, Z = ZVj)
        res <- robustbase::lmrob(y ~ ., data = dj, control = cont_lmrob)
        res.sum <- summary(res)
        if (j == 1) {
          ilr.sum$coefficients[1:(n_levels+n_externals+1), ] <- res.sum$coefficients[1:(n_levels+n_externals+1), ]
          ilr.sum$residuals <- res.sum$residuals
          ilr.sum$sigma <- res.sum$sigma
          ilr.sum$r.squared <- res.sum$r.squared
          ilr.sum$adj.r.squared <- res.sum$adj.r.squared
        }
        else {
          ilr.sum$coefficients[j + n_levels + n_externals, ] <- res.sum$coefficients[(n_levels+n_externals+1), ]
        }
      }
    }
    if (is.null(factor_col) & is.null(external_col)){
      ZV <- data.frame(cbind(y, X %>% rename_with(.cols = everything(), function(x){paste0("Internal.", x)})))
      lmcla <- robustbase::lmrob(y ~ ., data = ZV, control = cont_lmrob)
      lmcla.sum <- summary(lmcla)
      ilr.sum <- lmcla.sum

      for (j in 1:ncol(X)) {
        Zj <- pivotCoord(cbind(X[, j], X[, -j]), method = pivot_norm)
        dj <- data.frame(y = y, Z = Zj)
        res <- robustbase::lmrob(y ~ ., data = dj, control = cont_lmrob)
        res.sum <- summary(res)
        if (j == 1) {
          ilr.sum$coefficients[1:2, ] <- res.sum$coefficients[1:2, ]
          ilr.sum$residuals <- res.sum$residuals
          ilr.sum$sigma <- res.sum$sigma
          ilr.sum$r.squared <- res.sum$r.squared
          ilr.sum$adj.r.squared <- res.sum$adj.r.squared
        }
        else {
          ilr.sum$coefficients[j + 1, ] <- res.sum$coefficients[2, ]
        }
      }
    }
    if (!is.null(factor_col) & is.null(external_col)){
      X_selected <- X %>% select(-all_of(c(external)))
      ZV <- data.frame(cbind(y, X %>% relocate(all_of(c(names(factor_var)))) %>%
                               rename_with(.cols = colnames(X %>% select(all_of(external))), function(x){paste0("External.", x)}) %>%
                               rename_with(.cols = colnames(X %>% select(-all_of(c(external)))), function(x){paste0("Internal.", x)})))
      lmcla <- robustbase::lmrob(y ~ ., data = ZV, control = cont_lmrob)
      lmcla.sum <- summary(lmcla)
      ilr.sum <- lmcla.sum

      for (j in 1:(ncol(X)-1)) {
        Zj <- pivotCoord(cbind(X_selected[, j], X_selected[, -j]), method = pivot_norm)
        ZVj <- data.frame(Factor = factor_col, Z = Zj)
        dj <- data.frame(y = y, Z = ZVj)
        res <- robustbase::lmrob(y ~ ., data = dj, control = cont_lmrob)
        res.sum <- summary(res)
        if (j == 1) {
          ilr.sum$coefficients[1:(n_levels+1), ] <- res.sum$coefficients[1:(n_levels+1), ]
          ilr.sum$residuals <- res.sum$residuals
          ilr.sum$sigma <- res.sum$sigma
          ilr.sum$r.squared <- res.sum$r.squared
          ilr.sum$adj.r.squared <- res.sum$adj.r.squared
        }
        else {
          ilr.sum$coefficients[j + n_levels, ] <- res.sum$coefficients[n_levels + 1, ]
        }
      }
    }
    if (is.null(factor_col) & !is.null(external_col)){
      X_selected <- X %>% select(-all_of(c(external)))
      ZV <- data.frame(cbind(y, X %>% relocate(all_of(c(names(external_col)))) %>%
                               rename_with(.cols = colnames(X %>% select(all_of(external))), function(x){paste0("External.", x)}) %>%
                               rename_with(.cols = colnames(X %>% select(-all_of(c(external)))), function(x){paste0("Internal.", x)})))
      lmcla <- robustbase::lmrob(y ~ ., data = ZV, control = cont_lmrob)
      lmcla.sum <- summary(lmcla)
      ilr.sum <- lmcla.sum

      for (j in 1:(ncol(X)-n_externals)) {
        Zj <- pivotCoord(cbind(X_selected[, j], X_selected[, -j]), method = pivot_norm)
        ZVj <- data.frame(Externals = external_col, Z = Zj)
        dj <- data.frame(y = y, Z = ZVj)
        res <- robustbase::lmrob(y ~ ., data = dj, control = cont_lmrob)
        res.sum <- summary(res)
        if (j == 1) {
          ilr.sum$coefficients[1:(n_externals+2), ] <- res.sum$coefficients[1:(n_externals+2), ]
          ilr.sum$residuals <- res.sum$residuals
          ilr.sum$sigma <- res.sum$sigma
          ilr.sum$r.squared <- res.sum$r.squared
          ilr.sum$adj.r.squared <- res.sum$adj.r.squared
        }
        else {
          ilr.sum$coefficients[j + n_externals + 1, ] <- res.sum$coefficients[(n_externals+2), ]
        }
      }
    }
    list(lm = lmcla, lm = lmcla.sum, ilr = ilr.sum)
  }

  if (method == "classical") {
    reg <- ilrregression(X, y, external, pivot_norm)
  }
  else if (method == "robust") {
    reg <- robilrregression(X, y, external, pivot_norm)
  }
  if (exists("dat_missing")) {
    message("There are ", n ," observations omitted due to missings in the target variable")
  }
  return(reg)
}
Wiedi97/LRCoDa-Package documentation built on June 30, 2022, 10:51 a.m.