R/helpers_NGeDSboost-NGeDSgam.R


################################################################################
################################################################################
############################# Auxiliary functions ##############################
################################################################################
################################################################################

########################
## 1. Rescale weights ##
########################
# Leave (numerically) integer-valued weights untouched; otherwise rescale so
# that the positive weights average to one (weights are assumed non-negative).
rescale_weights <- function(weights) {
  if (max(abs(weights - floor(weights))) < sqrt(.Machine$double.eps))
    return(weights)
  return(weights / sum(weights) * sum(weights > 0))
}
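# Illustrative usage (a minimal sketch, guarded so it never runs at load time):
if (FALSE) {
  rescale_weights(c(1, 2, 3))       # integer-valued: returned unchanged
  rescale_weights(c(0.2, 0.3, 0.5)) # rescaled to c(0.6, 0.9, 1.5); positives average to 1
}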

############################
## 2. validate_iterations ##
############################
validate_iterations <- function(iterations, default, name) {
  if (missing(iterations) || is.null(iterations)) {
    iterations <- default
  } else if (!is.numeric(iterations) || iterations %% 1 != 0 || iterations < 0) {
    warning(sprintf("%s must be a non-negative integer; using the default of %d", name, default))
    iterations <- default
  } else {
    iterations <- as.integer(iterations)
  }
  return(iterations)
}
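# Illustrative usage (a minimal sketch, guarded so it never runs at load time);
# the argument values are hypothetical:
if (FALSE) {
  validate_iterations(10, default = 50L, name = "max_iterations")   # returns 10L
  validate_iterations(-1, default = 50L, name = "max_iterations")   # warns and returns 50L
  validate_iterations(NULL, default = 50L, name = "max_iterations") # returns 50L
}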

##########################
## 3. get_mboost_family ##
##########################
#' @import mboost
#' @import stats
get_mboost_family <- function(family) {
  if (!is.character(family)) {
    stop("Family must be a character string.")
  }
  
  switch(family,
         # Gaussian
         "Squared Error (Regression)" = stats::gaussian(link = "identity"),
         # Gamma
         "Negative Gamma Likelihood" = stats::Gamma(link = "inverse"),
         # Binomial 
         "Negative Binomial Likelihood (logit link)" = stats::binomial(link = "logit"),
         "Negative Binomial Likelihood -- probit link" = stats::binomial(link = "probit"),
         "Negative Binomial Likelihood -- cauchit link" = stats::binomial(link = "cauchit"),
         "Negative Binomial Likelihood -- log link" = stats::binomial(link = "log"),
         "Negative Binomial Likelihood -- cloglog link" = stats::binomial(link = "cloglog"),
         # Poisson
         "Poisson Likelihood" = stats::poisson(link = "log"),
         stop("Invalid distribution type")
  )
}
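# Illustrative usage (a minimal sketch, guarded so it never runs at load time):
# each supported label maps to the corresponding stats family object.
if (FALSE) {
  get_mboost_family("Poisson Likelihood")$family                       # "poisson"
  get_mboost_family("Negative Binomial Likelihood (logit link)")$link  # "logit"
}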


#######################################################################################################
## 4. Function to get which is the last row with at least one non-NA in Coefficients/Stored matrices ##
#######################################################################################################
last_row_with_value <- function(matrix) {
  non_na_rows <- which(!apply(is.na(matrix), 1, all))
  if(length(non_na_rows) == 0) {
    return(NULL)
  } else {
    return(max(non_na_rows))
  }
}
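# Illustrative usage (a minimal sketch, guarded so it never runs at load time):
if (FALSE) {
  m <- rbind(c(1, 2), c(3, NA), c(NA, NA))
  last_row_with_value(m) # 2: row 3 is all-NA, row 2 still holds a value
}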

###########################################
## 5. Function to get the internal knots ##
###########################################
get_internal_knots <- function(knots, depth = 1) {
  # Helper function to extract internal knots
  extract_knots <- function(k) {
    if (length(k) > 2) {
      return(k[-c(1, length(k))])
    } else {
      return(NULL)
    }
  }
  # Recursive call based on depth
  if (depth > 1) {
    knots <- get_internal_knots(knots, depth - 1)
  }
  # Bivariate
  if (is.list(knots)) {
    # Get internal knots for each component and store them in a list
    ikX <- extract_knots(knots$Xk)
    ikY <- extract_knots(knots$Yk)
    return(list(ikX = ikX, ikY = ikY))
  } else { 
    # Univariate
    return(extract_knots(knots))
  }
}
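# Illustrative usage (a minimal sketch, guarded so it never runs at load time):
# depth > 1 strips one extra knot from each end per level, and a list input is
# treated as bivariate with components Xk and Yk.
if (FALSE) {
  get_internal_knots(c(0, 0.25, 0.5, 0.75, 1))             # c(0.25, 0.50, 0.75)
  get_internal_knots(c(0, 0.25, 0.5, 0.75, 1), depth = 2)  # 0.5
  get_internal_knots(list(Xk = 0:3, Yk = c(0, 5)))         # list(ikX = c(1, 2), ikY = NULL)
}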

###################################################################################
## 6. Function for computing GeDS-class object linear predictions (i.e. stage A) ##
###################################################################################
#' @importFrom stats na.omit
predict_GeDS_linear <- function(Gmod, X, Y, Z){
  
  terms <- all.vars(Gmod$Formula)
  num_predictors <- length(terms[-1])
  
  max.intknots <- Gmod$Args$max.intknots
  q <- Gmod$Args$q
  
  ## Univariate GeDS
  if (num_predictors == 1) {
    
    X <- as.matrix(X)
    j <- NROW(Gmod$Stored) # alternatively: last_row_with_value(Gmod$Stored), last_row_with_value(Gmod$Coefficients)
    
    # Knots (an exit from stage A is performed with the spline fit l = k + 1 - q)
    if (j > q && j != max.intknots + 1) {
      knt <- na.omit(Gmod$Stored[j - q, ])
    } else {
      knt <- na.omit(Gmod$Stored[j, ])
    }
    
    # Trim the outer knots so the remaining ones pair one-to-one with the
    # coefficients, then extract the internal knots
    knt <- knt[-c(1, length(knt))]
    int.knt <- knt[-c(1, length(knt))]
    if (length(knt) == 2) int.knt <- NULL
    
    # Coefficients
    if (j > q && j != max.intknots + 1) {
      coeff <- na.omit(Gmod$Coefficients[j - q, ])
    } else {
      coeff <- na.omit(Gmod$Coefficients[j, ])
    }
    
    # Initialize output vector
    Y_hat <- numeric(nrow(X))
    # Number of intervals
    n_intervals <- length(knt) - 1
    # Add first and last intervals to cover all possible X values
    intervals <- c(-Inf, int.knt, Inf)
    
    # Initialize lists for coefficients
    b0_list <- numeric(n_intervals)
    b1_list <- numeric(n_intervals)
    
    # Calculate predictions for the current predictor
    for (int in 1:n_intervals) {
      # Find X values within current interval
      idx <- X >= intervals[int] & X < intervals[int+1]
      # Calculate predicted Y values for current interval
      epsilon <- 1e-10 # to avoid b1 = Inf in the rare occasions where knt[int+1] = knt[int];
      # check 'cross_validation\HPC\GeDS_initial_learner\ex5\additive\scripts\cv_ex5_1160.R' (seed=6967) for an example.
      b1 <- (coeff[int+1] - coeff[int]) / max(knt[int+1] - knt[int], epsilon)
      b0 <- coeff[int] - b1 * knt[int]
      Y_hat[idx] <- b0 + b1 * X[idx]
      # Store coefficients
      b0_list[int] <- b0
      b1_list[int] <- b1
    }
    return(list(Y_hat = Y_hat, knt = knt, int.knt = int.knt, b0 = b0_list, b1 = b1_list, theta = coeff))
    
  ## Bivariate GeDS
  } else if (num_predictors == 2){
    
    X <- as.matrix(X)
    Y <- as.matrix(Y)
    j <- NROW(Gmod$Stored$previousX) # alternatively: last_row_with_value(Gmod$Stored$previousX)
    
    # Knots and Coefficients
    if (j > q && j != max.intknots + 1) {
      # an exit from stage A is performed with the spline fit l = k + 1 - q
      Xknt  <- na.omit(Gmod$Stored$previousX[j - q, ])
      Yknt  <- na.omit(Gmod$Stored$previousY[j - q, ])
      theta <- as.numeric(na.omit(Gmod$Coefficients[j - q, ]))
    } else {
      Xknt  <- na.omit(Gmod$Stored$previousX[j, ])
      Yknt  <- na.omit(Gmod$Stored$previousY[j, ])
      theta <- as.numeric(na.omit(Gmod$Coefficients[j, ]))
    }
    
    # Trim the outer knots so they align with the coefficients, then extract
    # the internal knots
    Xknt <- Xknt[-c(1, length(Xknt))]
    Yknt <- Yknt[-c(1, length(Yknt))]
    Xint.knt <- Xknt[-c(1, length(Xknt))]
    Yint.knt <- Yknt[-c(1, length(Yknt))]
    
    if (length(Xknt) == 2) Xint.knt <- NULL
    if (length(Yknt) == 2) Yint.knt <- NULL
    
    # SplineReg_biv
    lin <- SplineReg_biv(X, Y, Z, InterKnotsX=Xint.knt, InterKnotsY=Yint.knt,
                         Xextr=Gmod$Args$Xextr, Yextr=Gmod$Args$Yextr, n=2,
                         coefficients = theta)
    Y_hat <- lin$Predicted
    
    return(list(Y_hat = Y_hat, knt = list("Xknt" = Xknt, "Yknt" = Yknt),
                int.knt = list("Xint.knt" = Xint.knt, "Yint.knt" = Yint.knt),
                "theta" = lin$Theta))
    
  } else {
    stop("Only 1 (i.e. Y ~ f(X)) or 2 (i.e. Z ~ f(X, Y)) predictors are allowed for NGeDS models.")
  }
}
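# The slope/intercept recovery above exploits the fact that, for an order-2
# (linear) B-spline, each coefficient equals the fitted value at the
# corresponding knot. A minimal standalone sketch of that arithmetic, with
# hypothetical knots and coefficients (guarded so it never runs at load time):
if (FALSE) {
  knt   <- c(0, 0.4, 1) # boundary knots plus one internal knot
  coeff <- c(1, 3, 2)   # spline values at the knots
  b1 <- diff(coeff) / diff(knt)                         # slopes:     c(5, -5/3)
  b0 <- coeff[-length(coeff)] - b1 * knt[-length(knt)]  # intercepts: c(1, 11/3)
  x <- 0.2
  int <- findInterval(x, knt, rightmost.closed = TRUE)  # interval containing x: 1
  b0[int] + b1[int] * x                                 # 1 + 5 * 0.2 = 2
}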

############################################################################
## 7. Function for computing piecewise multivariate additive linear model ##
############################################################################
piecewise_multivar_linear_model <- function(X, model, base_learners = NULL) {
  
  X <- data.frame(X)
  
  # Initialize output vector
  Y_hat <- numeric(nrow(X))
  
  # To compute the boosted prediction made by specific base learners:
  if(!is.null(base_learners)){
    model$base_learners <- model$base_learners[names(model$base_learners) %in% names(base_learners)]
  }
  
  # For each base learner (and its corresponding model)
  for (base_learner in names(model$base_learners)) {
    # Get the model for the current base learner
    model_bl <-  model$base_learners[[base_learner]]
    # Get the predictor variable
    predictor <- base_learners[[base_learner]]$variables
    # Add first and last intervals to cover all possible X values
    intervals <- c(-Inf, model_bl$linear.int.knots, Inf)
    # Number of intervals
    n_intervals <- length(intervals)-1
    
    # Calculate predictions for the current predictor
    for (int in 1:n_intervals) {
      # Find X values within current interval
      idx <- X[,predictor] >= intervals[int] & X[,predictor] < intervals[int+1]
      # Add predicted Y values for current interval to Y_hat
      Y_hat[idx] <- Y_hat[idx] + model_bl$coefficients$b0[int] + model_bl$coefficients$b1[int] * X[idx,predictor]
    }
  }
  return(Y_hat)
}
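# Illustrative call with toy objects mirroring the fields the function reads
# ($linear.int.knots, $coefficients$b0/$b1, $variables); the learner name
# "f(x1)" is only a placeholder (guarded so it never runs at load time):
if (FALSE) {
  toy_model <- list(base_learners = list(
    "f(x1)" = list(linear.int.knots = 0.5,
                   coefficients = list(b0 = c(0, 1), b1 = c(2, 0)))
  ))
  toy_bls <- list("f(x1)" = list(variables = "x1"))
  piecewise_multivar_linear_model(data.frame(x1 = c(0.2, 0.8)), toy_model, toy_bls)
  # x1 = 0.2 lies in (-Inf, 0.5): 0 + 2 * 0.2 = 0.4
  # x1 = 0.8 lies in [0.5,  Inf): 1 + 0 * 0.8 = 1.0
}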

#####################################################
## 8. Function for computing additive linear model ##
#####################################################
# 8.1. Univariate base-learners
univariate_bl_linear_model <- function(pred_vars, model, shrinkage, base_learners = NULL) {
  
  # Initialize output vector
  Y_hat <- numeric(nrow(pred_vars))
  
  # To compute the boosted prediction made by specific base learners:
  if(!is.null(base_learners)){
    model$base_learners <- model$base_learners[names(model$base_learners) %in% names(base_learners)]
  }
  
  # For each base learner (and its corresponding model)
  for (base_learner in names(model$base_learners)) {
    
    X <- pred_vars[, base_learners[[base_learner]]$variables[1]]
    
    bl      <- model$base_learners[[base_learner]]
    int.knt <- bl$linear.int.knots
    theta   <- bl$coefficients
    
    lin <- SplineReg_LM(X, InterKnots=int.knt, extr=range(X), n=2,
                        coefficients = theta)
    
    Y_hat <- Y_hat + lin$Predicted
    
  }
  return(Y_hat)
}

# 8.2. Bivariate base-learners
bivariate_bl_linear_model <- function(pred_vars, model, shrinkage, base_learners = NULL, type = "boost") {
  
  # Initialize output vector
  Y_hat <- numeric(nrow(pred_vars))
  
  # To compute the boosted prediction made by specific base learners:
  if(!is.null(base_learners)){
    model$base_learners <- model$base_learners[names(model$base_learners) %in% names(base_learners)]
  }
  
  # For each base learner (and its corresponding model)
  for (base_learner in names(model$base_learners)) {
    
    X <- pred_vars[, base_learners[[base_learner]]$variables[1]]
    Y <- pred_vars[, base_learners[[base_learner]]$variables[2]]
    
    bl <- model$base_learners[[base_learner]]
    # (I) Bivariate boosted base learners
    if(type == "boost"){
      for (mod in names(bl$iterations)){
        Xint.knt <- bl$iterations[[mod]]$int.knt$Xint.knt
        Yint.knt <- bl$iterations[[mod]]$int.knt$Yint.knt
        theta <- bl$iterations[[mod]]$coef
        lin <- SplineReg_biv(X, Y, InterKnotsX=Xint.knt, InterKnotsY=Yint.knt,
                             Xextr=range(X), Yextr=range(Y), n=2,
                             coefficients = theta)
        if (mod=="model0") {Y_hat <- Y_hat + lin$Predicted}
        else {Y_hat <- Y_hat + shrinkage*lin$Predicted}
      }
    # (II) Bivariate gam
    } else if (type == "gam"){
      Xint.knt <- bl$linear.int.knots$ikX
      Yint.knt <- bl$linear.int.knots$ikY
      theta <- bl$coefficients
      lin <- SplineReg_biv(X, Y, InterKnotsX=Xint.knt, InterKnotsY=Yint.knt,
                           Xextr=range(X), Yextr=range(Y), n=2,
                           coefficients = theta)
      Y_hat <- Y_hat + lin$Predicted
    }
  }
  return(Y_hat)
}


################################
### 9. compute_avg_int.knots ###
################################
# Function for computing the internal knots of base learners (averaging knot location, stage B.1.)
compute_avg_int.knots <- function(final_model, base_learners, X_sd, X_mean, normalize_data, n) {
  warning_displayed <- FALSE  # Track whether any knot-count message was shown
  # Select GeDS base-learners
  base_learners <- base_learners[sapply(base_learners, function(x) x$type == "GeDS")]
  # Check if base_learners is empty
  if (length(base_learners) == 0) {
    cat(paste0("No GeDS base-learners found. Returning NULL when computing averaging knot location for n = ", n, ".\n"))
    return(NULL)
  }
  
  kk_list <- lapply(names(base_learners), function(bl) {
    
    pred_vars <- base_learners[[bl]]$variables
    
    # Univariate
    if (length(pred_vars) == 1) {
      intknt <- get_internal_knots(final_model$base_learners[[bl]]$knots)
      if (normalize_data){
        if (!is.null(intknt)) intknt <- intknt * X_sd[[pred_vars[1]]] + X_mean[[pred_vars[1]]]
      }
      if (length(intknt) >= n - 1) return(makenewknots(intknt, n))
      cat(paste0(bl, " has fewer than ", n - 1, " linear internal knots. "))
      warning_displayed <<- TRUE  # Update the warning flag
      return(intknt)
      # Bivariate
    } else if (length(pred_vars) == 2) {
      intknt <- list(X = get_internal_knots(final_model$base_learners[[bl]]$knots$Xk), 
                     Y = get_internal_knots(final_model$base_learners[[bl]]$knots$Yk))
      if (normalize_data){
        if (!is.null(intknt$X)) {intknt$X <- intknt$X * X_sd[[pred_vars[1]]] + X_mean[[pred_vars[1]]]}
        if (!is.null(intknt$Y)) {intknt$Y <- intknt$Y * X_sd[[pred_vars[2]]] + X_mean[[pred_vars[2]]]}
      }
      # Number of linear knots warning (X direction)
      if (length(intknt$X) >= n - 1) {
        ikX <- makenewknots(intknt$X, n)
      } else {
        cat(paste0(bl, " has fewer than ", n - 1, " linear internal knots for ", pred_vars[1], ". "))
        warning_displayed <<- TRUE  # Update the warning flag
        ikX <- intknt$X
      }
      
      # Number of linear knots warning (Y direction)
      if (length(intknt$Y) >= n - 1) {
        ikY <- makenewknots(intknt$Y, n)
      } else {
        cat(paste0(bl, " has fewer than ", n - 1, " linear internal knots for ", pred_vars[2], ". "))
        warning_displayed <<- TRUE  # Update the warning flag
        ikY <- intknt$Y
      }
      return(list(ikX = ikX, ikY = ikY))
    }
  })
  # Check if any warning was displayed and show the appropriate follow-up message
  if (warning_displayed) {
    if (n == 3) {
      cat("Quadratic averaging knot location is not computed for base learners with less than 2 linear internal knots.\n")
    } else if (n == 4) {
      cat("Cubic averaging knot location is not computed for base learners with less than 3 linear internal knots.\n")
    }
  }
  # Naming the elements of the list with the base-learner names
  names(kk_list) <- names(base_learners)
  return(kk_list)
}