R/gwl_bw_estimation.R

Defines functions gwl_bw_estimation

Documented in gwl_bw_estimation

#' Bandwidth estimation for Geographically Weighted Lasso
#'
#' This function performs a bruteforce selection of the optimal bandwidth for the selected kernel to perform a geographically weighted lasso.
#' The user should be aware that this function could be really long to run depending of the settings.
#' We recommend starting with `nbw = 5` and `nfolds = 5` at first to ensure that the function is running properly and producing the desired output.
#'
#' @param x.var input matrix, of dimension nobs x nvars; each row is an observation vector. `x` should have 2 or more columns.
#' @param y.var response variable for the lasso
#' @param dist.mat a distance matrix. can be generated by [compute_distance_matrix()]
#' @param adaptive TRUE or FALSE Whether to perform an adaptive bandwidth search or not. A fixed bandwidth means that than samples are selected if they fit a determined fixed radius around a location.
#' in a aptative bandwidth , the radius around a location varies to gather a fixed number of samples around the investigated location
#' @param adptbwd.thresh the lowest fraction of samples to take into account for local regression. Must be 0 < `adptbwd.thresh` < 1
#' @param kernel the geographical kernel shape to compute the weight. passed to [GWmodel::gw.weight()]
#' Can be `gaussian`, `exponential`, `bisquare`, `tricube`, `boxcar`
#' @param alpha the elasticnet mixing parameter. set 1 for lasso, 0 for ridge. see [glmnet::glmnet()]
#' @param progress if TRUE, print a progress bar
#' @param nbw the number of bandwidth to test
#' @param nfolds the number f folds for the glmnet cross validation
#'
#' @return a `gwlest` object. It is a list with `rmspe` (the RMSPE of the model with the associated badwidth), `NA` (the number of NA in the dataset), `bw` (the optimal bandwidth), `bwd.vec` (the vector of tested bandwidth)
#' 
#' @references 
#' A. Comber and P. Harris. \emph{Geographically weighted elastic net logistic regression (2018).
#' Journal of Geographical Systems, vol. 20, no. 4, pages 317–341}.
#' \doi{10.1007/s10109-018-0280-7}.\cr
#' 
#' @export
#'
#' @examples
#' 
#' predictors <- matrix(data = rnorm(2500), 50,50)
#' y_value <- sample(1:1000, 50)
#' coords <- data.frame("Lat" = rnorm(50), "Long" = rnorm(50))
#' distance_matrix <- compute_distance_matrix(coords)
#' 
#' \donttest{
#'   myst.est <- gwl_bw_estimation(x.var = predictors, 
#'                                 y.var = y_value,
#'                                 dist.mat = distance_matrix,
#'                                 adaptive = TRUE,
#'                                 adptbwd.thresh = 0.5,
#'                                 kernel = "bisquare",
#'                                 alpha = 1,
#'                                 progress = TRUE,
#'                                 n=10,
#'                                 nfolds = 5)
#'   
#'   
#'   myst.est
#'  } 
#' 

gwl_bw_estimation <- function(x.var,
                      y.var,
                      dist.mat,
                      adaptive = TRUE,
                      adptbwd.thresh = 0.1,
                      kernel = "bisquare",
                      alpha = 1,
                      progress = TRUE,
                      nbw = 100,
                      nfolds = 5){

  # some tests
  coords <- NULL
  
  if(is.list(dist.mat)){  
    coords <- dist.mat$coords
    dist.mat <- dist.mat$dist.mat
    }

  x.var <- as.matrix(x.var)
  
  stopifnot(is.numeric(y.var),
            nrow(x.var) == length(y.var),
            nrow(dist.mat) == ncol(dist.mat),
            nrow(dist.mat) == nrow(x.var),
            adptbwd.thresh > 0 & adptbwd.thresh < 1,
            alpha >=0 & alpha <= 1,
            is.logical(progress),
            nrow(x.var) == length(y.var)
            )

  # prepare bwd vector to test
  if(adaptive) bwd.vec <- round(c(seq(adptbwd.thresh, 1, length.out = nbw))*nrow(x.var), 0)
  if(!adaptive) bwd.vec <- round(seq(max(dist.mat)*0.01, max(dist.mat)*1.01, length.out = nbw))

  mean.bw.error <- na.count <- rep(NA, length(bwd.vec))
  yhat <- rep(NA, nrow(x.var))

  if(progress){
    pb <- progress::progress_bar$new(format = "  Computing [:bar] :percent eta: :eta", total = length(bwd.vec)*nrow(x.var),
                                     clear = FALSE, width = 60)
  }
  for (i in 1:length(bwd.vec)) {
    bw <- bwd.vec[i]
    na.count.j = 0

    for (j in 1:nrow(x.var)) {
      #pt.j <- x.var[j, ]

      # dist.mat contains the distances
      # select the neighborhood corresponding to the bw and removing obs j
      if(adaptive) index.j <- order(dist.mat[j, ])[1:bw]
      if(!adaptive) index.j <- which(dist.mat[j, ] < bw)

      # removing observation j from its neighborhood
      index.j <- index.j[-1]

      # data.j <- data[index.j, ]
      y <- as.vector(y.var[index.j])
      x <- as.matrix(x.var[index.j, ])

      # NA check (remove NA values)
      if (sum(is.na(y)) > 0) {
        index.na <- as.vector(is.na(y))
        # data.j <- data.j[!index.na, ]
        y <- y[!index.na]
        x <- as.matrix(x[!index.na, ])
      }

      # distance matrix between one observation line and the rest of the observations
      # dists.j <- gw.dist(coordinates(pt.j), coordinates(data.j))
      dist.j <- dist.mat[index.j, j]
      # create weight matrix according to distances
      # gaussian: wgt = exp(-.5*(vdist/bw)^2)
      weight.j <- as.vector(GWmodel::gw.weight(c(dist.j), bw, kernel, adaptive))
      

      # run glm cross-validation, returns a value for lambda
      lasso.mod <- glmnet::cv.glmnet(x = x, y = y, family = "gaussian", weights = weight.j, nfolds = nfolds,
                             standardize = T, alpha = alpha, type.measure = "mse", offset = NULL, lambda = NULL)


      # predict yhat values using the smallest lambda
      # predictions = predict(lasso.mod, newx = pred.vals, s = "lambda.min")
      yhat[j] <- stats::predict(lasso.mod, newx = x.var[j, ], s = "lambda.min")
      # mean cv error

      if(progress){
        pb$tick()
      }

    }

    # mean prediction error according to bw
    mean.bw.error[i] <- sqrt(mean((y.var - yhat)^2))
    na.count[i] <- na.count.j
    # if(progress) cat("\t", i)
  }

  # bandwidth corresponding to the smallest cv value
  bw <- bwd.vec[which.min(mean.bw.error)]
  
  out <- list(rmspe = mean.bw.error, 'NA' = na.count, bw = bw, bwd.vec = bwd.vec, kernel = kernel,alpha = alpha, adaptive = adaptive)
  class(out) <- "gwlest"

  return(out)
}

Try the GWlasso package in your browser

Any scripts or data that you put into this service are public.

GWlasso documentation built on April 3, 2025, 7:08 p.m.