R/anchor_regression_gam.R

#' @title anchor_regression_gam
#'
#' @description Perform an Generalized Additive Anchor Regression
#'
#' @param x is a dataframe containing the matrix x containing the independent variables
#' @param anchor is a dataframe containing the matrix anchor containing the anchor variable
#' @param gamma is the regularization parameter for the Anchor Regression
#' @param target_variable is the target variable name contained in the x dataframe
#' @param bin_factor binary variable that can be transformed to a factor to partial out effects
#'
#' @return A list with coefficient values and a list with the respective names \code{overview_print}. Additionally the transformed data as x and y plus the fixed lambda coefficient.
#' @export
#' @importFrom mgcv gam
#' @importFrom stats coef lm as.formula
#' @examples
#' x <- as.data.frame(matrix(data = rnorm(10000),nrow = 1000,ncol = 10))
#' x$bin <- sample(nrow(x),x = c(1,0),prob = c(0.5,0.5),replace = TRUE)
#' anchor <- as.data.frame(matrix(data = rnorm(2000),nrow = 1000,ncol = 2))
#' colnames(anchor) <- c('X1','X2')
#' gamma <- 2
#' target_variable <- 'V2'
#' anchor_regression_gam(x, anchor, gamma, target_variable,bin_factor =  "bin")


anchor_regression_gam <- function (x, anchor, gamma, target_variable, bin_factor = NULL) {
  if (ncol(x) < 3) {
    print("unsufficient number of columns")
  }
  # transform data
  x <- as.matrix(x)
  anchor <- as.matrix(anchor)
  fit_const <- lm(x ~ 1)
  fit <- lm(x ~ anchor)

  # slice data for model fitting
  anchor_data <- fit_const$fitted.values + fit$residuals +
    sqrt(gamma) * (fit$fitted.values - fit_const$fitted.values)
  indices <- 1:nrow(anchor_data)
  j <- match(target_variable, colnames(anchor_data))
  x <- anchor_data[indices, -c(j)]
  y <- anchor_data[indices, j]


  # extract binary columns
  x <- as.data.frame(anchor_data)
  uniq <- lapply(x, unique)
  nuniq <- as.data.frame(lengths(uniq))
  nuniq_names <- names(as.data.frame(nuniq))[as.vector(nuniq<4)]


  vars <- colnames(x)[!colnames(x) %in% target_variable]
  vars_non_bin <- vars[!vars %in% nuniq_names]

  # generate formula
  if(is.null(bin_factor) != TRUE){
    x[bin_factor] <- as.factor(round(x[,bin_factor]))
    vars_non_bin_fac <- vars_non_bin[!vars_non_bin %in% bin_factor]
    if(length(nuniq_names) !=0){
      col <- paste0(", by=",bin_factor, ")+s(")
      form <- paste(target_variable, "~ s(", paste(vars_non_bin_fac, collapse=col),")+",paste(nuniq_names, collapse=" + "))
    }else{
      col <- paste0(", by=",bin_factor, ")+s(")
      form <- paste(target_variable, "~ s(", paste(vars_non_bin_fac, collapse=col),")")
    }
  }else{
    if(length(nuniq_names) !=0){
      form <- paste(target_variable, "~ s(", paste(vars_non_bin, collapse=") + s("),")+",paste(nuniq_names, collapse=" + "))
    }else{
      form <- paste(target_variable, "~ s(", paste(vars_non_bin, collapse=") + s("),")")
    }
  }
  # estimate model
  model <-  gam(as.formula(form),data=x, method = "REML")

  return_list <- list(model = model)
  return(return_list)
}

Try the AnchorRegression package in your browser

Any scripts or data that you put into this service are public.

AnchorRegression documentation built on Jan. 8, 2021, 2:04 a.m.