################################################################################
################################################################################
################################### formula ####################################
################################################################################
################################################################################
#######################################################
# Function for reading the formula of NGeDS and GGeDS #
#######################################################
read.formula <- function(formula, data, weights, offset)
{
# Determine model terms based on 'formula', using 'data' if provided
mt <- if (missing(data)) {
terms(formula, "f")
} else terms(formula, "f", data = data)
# Remove intercept from model terms
if (attr(mt,"intercept") == 1) {
attr(mt,"intercept") <- 0
} else {
# In case the intercept was explicitly removed in the formula
warning("An intercept will be included in the basis functions")
}
data <- as.list(data)
data$f <- function(x,xx=NULL,...) {
if(!missing(...)) stop("Algorithm supports at most two variables in 'f'")
cbind(x,xx)
}
# Locate f(X)/f(X,Y) in model terms
spec <- attr(mt,"specials")$f
if(length(spec)!= 1) stop("Formula incorrectly specified. Read documentation for further information.")
# Generate a model matrix based on the model terms and data
mm <- model.matrix(mt, data)
# Create a model frame based on the model terms and data, omitting rows with NAs
mf <- model.frame(mt, data, na.action = na.omit)
# Extract response variable
Y <- model.response(mf, type="numeric")
attr(Y,"names")<- NULL
# Extract GeDS covariates
X <- mf[,spec]
# Extract linear covariates
if(ncol(mm)>ncol(X)) {
Z <- mf[, -c(spec, attr(mt, "response"), attr(mt, "offset")), drop = TRUE]
} else {
Z <- NULL
}
# Initialize an offset vector with zeros
offset <- rep(0, nrow(X))
# Sum up offset variables included in model terms
if (!is.null(off.num <- attr(mt, "offset"))) {
for (i in off.num) offset <- offset + eval(attr(mt,"variables")[[i + 1]], data)
}
out <- list("X" = X, "Y" = Y, "Z" = Z, "offset" = offset, "terms" = mt, "model.matrix" = mm)
return(out)
}
###############################################################
# Function for reading the formula of NGeDSboost and NGeDSgam #
###############################################################
read.formula.boost <- read.formula.gam <- function(formula, data){
formula <- as.formula(formula)
# Parse formula
terms <- all.vars(formula)
response <- terms[1]
predictors <- terms[-1]
# Split the formula into LHS and RHS
formula_string <- paste(deparse(formula), collapse = " ")
parts <- strsplit(formula_string, " ~ ")[[1]]
lhs <- parts[1]
rhs <- parts[2]
# Split the RHS into individual terms
rhs_parts <- trimws(unlist(strsplit(rhs, " \\+ ")))
# Reformat parts for validation: keep the first part as is, append others to LHS
parts_to_validate <- c(lhs, paste(lhs, "~", rhs_parts))[-1]
# Adjusted pattern for validation
pattern <- "^[a-zA-Z0-9_.]+ ~ (\\.|(f\\([^)]+\\)|[a-zA-Z0-9_.]+))$"
# Validate each part
valid_parts <- sapply(parts_to_validate, function(part) {
grepl(pattern, part)
})
# Check if all parts are valid
all_parts_valid <- all(valid_parts)
if (!all_parts_valid) {
stop("Formula incorrectly specified. Read documentation for further information.")
}
# Predictors terms
bl_part <- parts[2]
bl_elements <- unlist(strsplit(bl_part, "\\+"))
bl_elements <- trimws(bl_elements) # remove any leading or trailing white spaces from each element
if(any(predictors == ".")) {
# Handling "Y ~ ." case
predictors <- predictors[predictors != "."]
rest_predictors <- setdiff(names(data), c(response, predictors))
predictors <- c(predictors, rest_predictors)
bl_elements <- bl_elements[bl_elements != "."]
rest_bl_elements <- gsub("(x\\.\\d+)", "f(\\1)", rest_predictors)
bl_elements <- c(bl_elements, rest_bl_elements)
}
# Extract predictor variable names from each base-learner element
base_learners <- setNames(lapply(bl_elements, function(bl) {
# Determine the type of the base learner
type <- ifelse(grepl("f\\(.*\\)", bl), "GeDS", "linear")
# Extract the predictor variables
variables <- trimws(unlist(strsplit(gsub("f\\((.*?)\\)", "\\1", bl), ",")))
# Check if the f() has more than two predictors
if (type == "GeDS" && length(variables) > 2) {
stop("Formula incorrectly specified: f() can have at most two predictors. Read documentation for further information.")
}
# Return a list containing variable names and the type
return(list(variables = variables, type = type))
}), bl_elements)
return(list(terms = terms, response = response, predictors = predictors, base_learners = base_learners))
}
#' @title Formula for the predictor model
#' @name formula.GeDS
#' @description
#' A description of the structure of a predictor model to be fitted using
#' \code{\link{NGeDS}} and/or \code{\link{GGeDS}} and how this information can
#' be extracted from a \code{\link{GeDS-class}} object.
#' @param x Fitted \code{\link{GeDS-class}} object, tipically produced by
#' \code{\link{NGeDS}} or \code{\link{GGeDS}} from which the predictor model
#' \code{\link[stats]{formula}} should be extracted.
#' @param ... Unused in this case.
#'
#' @details
#' In the GeDS GNM (GLM) regression, implemented in \code{\link{NGeDS}} and
#' \code{\link{GGeDS}}, it is assumed that the mean of the response variable
#' transformed using an appropriate link function is modelled through a possibly
#' multivariate predictor model involving two components: a GeD variable knot
#' spline regression component involving up to two of the independent variables
#' and a parametric component with respect to the remaining independent
#' variables. The formula is used to specify the structure of such a possibly
#' multivariate predictor model.
#'
#' The formulae that are input in \code{\link{NGeDS}} and \code{\link{GGeDS}}
#' are similar to those input in \code{\link[stats]{lm}} or
#' \code{\link[stats]{glm}} except that the function \code{\link{f}} should be
#' specified in order to identify which of the covariates enter the GeD spline
#' regression part of the predictor model. For example, if the predictor model
#' is univariate and it links the transformed means of \code{y} to \code{x1},
#' the predictor has only a GeD spline component and the
#' \code{\link[stats]{formula}} should be in the form \code{y ~ f(x1)}.
#'
#' As noted, there may be additional independent variables, \code{x2},
#' \code{x3}, ... which may enter linearly into the parametric component of the
#' predictor model and not be part of the GeD spline regression component. For
#' example one may use the formula \code{y ~ f(x1) + x2 + x3} which assumes a
#' spline regression only between the transformed mean of \code{y} and \code{x1},
#' while \code{x2} and \code{x3} enter the predictor model just linearly.
#'
#' In the current version of the package, \code{\link{GGeDS}} is univariate,
#' therefore only one covariate which enters the spline regression component can
#' be specified.
#'
#' In contrast, the function \code{\link{NGeDS}}, generates also bivariate GeDS
#' regression models. Therefore, if the functional dependence of the mean of the
#' response variable \code{y} on \code{x1} and \code{x2} needs to be jointly
#' modelled and there are no other covariates, the formula for the corresponding
#' two dimensional predictor model should be specified as \code{y ~ f(x1,x2)}.
#'
#' Within the argument \code{formula}, similarly as in other \R functions, it is
#' possible to specify one or more offset variables, i.e. known terms with fixed
#' regression coefficients equal to 1. These terms should be identified via the
#' function \code{\link[stats]{offset}}.
#'
#' @export
#'
#' @aliases formula.GeDS
formula.GeDS <- function(x,...)
{
frm <- x$Formula
if(is.null(frm)) stop("Unable to extract the formula. \n
Please re-fit using 'NGeDS' or 'GGeDS'")
frm
}
#' @title Defining the covariates for the spline component in a GeDS formula.
#' @name f
#' @description
#' In general the GeDS predictor model may include a GeD spline regression
#' component with respect to part of the independent variables and a parametric
#' component in which the remaining covariates may enter as additive terms.
#'
#' The function \code{f} is to be used in the
#' \code{\link[=formula.GeDS]{formula}} argument of \code{\link{NGeDS}} or
#' \code{\link{GGeDS}} in order to specify which independent variables
#' (covariates) should be included in the GeD spline regression component of the
#' predictor model.
#' @param x numeric vector containing \eqn{N} sample values of the covariate
#' chosen to enter the spline
#' regression component of the predictor model.
#' @param xx numeric vector containing \eqn{N} sample values for the second
#' covariate (in case \code{\link{NGeDS}} is run for two dimensions). It has to
#' be either \code{NULL} (the default) or a vector of size \eqn{N}, same as
#' \code{x}.
#' @param ... further arguments. As GeDS currently allows for up to two
#' covariates, specification of further arguments will return an error.
#'
#' @examples
#' # Generate a data sample for the response variable Y and
#' # the covariates X, reg1, reg2 and off
#' set.seed(123)
#' N <- 500
#' f_1 <- function(x) (10*x/(1+100*x^2))*4+4
#' X <- sort(runif(N ,min = -2, max = 2))
#' reg1 <- runif(500, min = -0.1, max = 0.1)
#' reg2 <- runif(500, min = -0.2, max = 0.2)
#' off <- runif(500, min = -1, max = 1)
#' # Specify a model for the mean of Y to include a component non linear
#' # in X defined by the function f_1 and a linear one in the other covariates
#' means <- f_1(X) + 2*reg1 + 0.5*reg2 + off
#' # Add Normal noise to the mean of Y
#' Y <- rnorm(N, means, sd = 0.1)
#'
#' # Specify a formula that will be used to model Y as a
#' # function of X, reg1, reg2 and off.
#' # The covariate X is for the spline component modeled as GeDS,
#' # reg1 and reg2 enter linearly, off is an offset, i.e. no coefficient
#' # will be estimated for it
#' formula <- Y ~ f(X) + reg1 + reg2 + offset(off)
#'
#' # Fit a GeDS model specified in formula using NGeDS
#' (Gmod <- NGeDS(formula, beta = 0.6, phi = 0.995, Xextr = c(-2,2)))
#'
#'
#' @seealso \link{NGeDS}; \link{GGeDS}.
#'
#' @note This function is intended to be used only as part of the
#' \code{\link[=formula.GeDS]{formula}} in a GeDS regression via
#' \code{\link{NGeDS}} or \code{\link{GGeDS}} and not to be called in other
#' cases by the user.
f <- function(x,xx=NULL,...) {
if(!missing(...)) stop("Algorithm supports at most two variables in 'f'")
cbind(x,xx)
}
# this is to get the names of the Z variable(s)
getZnames <- function(out){
names <- colnames(out$model.matrix)
id <- attr(out$model.matrix,"assign")
spec <- attr(out$terms,"specials")$f-1
znames <- names[id!=spec]
znames
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.