Nothing
#' Bandwidth estimation for Geographically Weighted Lasso
#'
#' This function performs a bruteforce selection of the optimal bandwidth for the selected kernel to perform a geographically weighted lasso.
#' The user should be aware that this function could be really long to run depending of the settings.
#' We recommend starting with `nbw = 5` and `nfolds = 5` at first to ensure that the function is running properly and producing the desired output.
#'
#' @param x.var input matrix, of dimension nobs x nvars; each row is an observation vector. `x` should have 2 or more columns.
#' @param y.var response variable for the lasso
#' @param dist.mat a distance matrix. can be generated by [compute_distance_matrix()]
#' @param adaptive TRUE or FALSE Whether to perform an adaptive bandwidth search or not. A fixed bandwidth means that than samples are selected if they fit a determined fixed radius around a location.
#' in a aptative bandwidth , the radius around a location varies to gather a fixed number of samples around the investigated location
#' @param adptbwd.thresh the lowest fraction of samples to take into account for local regression. Must be 0 < `adptbwd.thresh` < 1
#' @param kernel the geographical kernel shape to compute the weight. passed to [GWmodel::gw.weight()]
#' Can be `gaussian`, `exponential`, `bisquare`, `tricube`, `boxcar`
#' @param alpha the elasticnet mixing parameter. set 1 for lasso, 0 for ridge. see [glmnet::glmnet()]
#' @param progress if TRUE, print a progress bar
#' @param nbw the number of bandwidth to test
#' @param nfolds the number f folds for the glmnet cross validation
#'
#' @return a `gwlest` object. It is a list with `rmspe` (the RMSPE of the model with the associated badwidth), `NA` (the number of NA in the dataset), `bw` (the optimal bandwidth), `bwd.vec` (the vector of tested bandwidth)
#'
#' @references
#' A. Comber and P. Harris. \emph{Geographically weighted elastic net logistic regression (2018).
#' Journal of Geographical Systems, vol. 20, no. 4, pages 317–341}.
#' \doi{10.1007/s10109-018-0280-7}.\cr
#'
#' @export
#'
#' @examples
#'
#' predictors <- matrix(data = rnorm(2500), 50,50)
#' y_value <- sample(1:1000, 50)
#' coords <- data.frame("Lat" = rnorm(50), "Long" = rnorm(50))
#' distance_matrix <- compute_distance_matrix(coords)
#'
#' \donttest{
#' myst.est <- gwl_bw_estimation(x.var = predictors,
#' y.var = y_value,
#' dist.mat = distance_matrix,
#' adaptive = TRUE,
#' adptbwd.thresh = 0.5,
#' kernel = "bisquare",
#' alpha = 1,
#' progress = TRUE,
#' n=10,
#' nfolds = 5)
#'
#'
#' myst.est
#' }
#'
gwl_bw_estimation <- function(x.var,
y.var,
dist.mat,
adaptive = TRUE,
adptbwd.thresh = 0.1,
kernel = "bisquare",
alpha = 1,
progress = TRUE,
nbw = 100,
nfolds = 5){
# some tests
coords <- NULL
if(is.list(dist.mat)){
coords <- dist.mat$coords
dist.mat <- dist.mat$dist.mat
}
x.var <- as.matrix(x.var)
stopifnot(is.numeric(y.var),
nrow(x.var) == length(y.var),
nrow(dist.mat) == ncol(dist.mat),
nrow(dist.mat) == nrow(x.var),
adptbwd.thresh > 0 & adptbwd.thresh < 1,
alpha >=0 & alpha <= 1,
is.logical(progress),
nrow(x.var) == length(y.var)
)
# prepare bwd vector to test
if(adaptive) bwd.vec <- round(c(seq(adptbwd.thresh, 1, length.out = nbw))*nrow(x.var), 0)
if(!adaptive) bwd.vec <- round(seq(max(dist.mat)*0.01, max(dist.mat)*1.01, length.out = nbw))
mean.bw.error <- na.count <- rep(NA, length(bwd.vec))
yhat <- rep(NA, nrow(x.var))
if(progress){
pb <- progress::progress_bar$new(format = " Computing [:bar] :percent eta: :eta", total = length(bwd.vec)*nrow(x.var),
clear = FALSE, width = 60)
}
for (i in 1:length(bwd.vec)) {
bw <- bwd.vec[i]
na.count.j = 0
for (j in 1:nrow(x.var)) {
#pt.j <- x.var[j, ]
# dist.mat contains the distances
# select the neighborhood corresponding to the bw and removing obs j
if(adaptive) index.j <- order(dist.mat[j, ])[1:bw]
if(!adaptive) index.j <- which(dist.mat[j, ] < bw)
# removing observation j from its neighborhood
index.j <- index.j[-1]
# data.j <- data[index.j, ]
y <- as.vector(y.var[index.j])
x <- as.matrix(x.var[index.j, ])
# NA check (remove NA values)
if (sum(is.na(y)) > 0) {
index.na <- as.vector(is.na(y))
# data.j <- data.j[!index.na, ]
y <- y[!index.na]
x <- as.matrix(x[!index.na, ])
}
# distance matrix between one observation line and the rest of the observations
# dists.j <- gw.dist(coordinates(pt.j), coordinates(data.j))
dist.j <- dist.mat[index.j, j]
# create weight matrix according to distances
# gaussian: wgt = exp(-.5*(vdist/bw)^2)
weight.j <- as.vector(GWmodel::gw.weight(c(dist.j), bw, kernel, adaptive))
# run glm cross-validation, returns a value for lambda
lasso.mod <- glmnet::cv.glmnet(x = x, y = y, family = "gaussian", weights = weight.j, nfolds = nfolds,
standardize = T, alpha = alpha, type.measure = "mse", offset = NULL, lambda = NULL)
# predict yhat values using the smallest lambda
# predictions = predict(lasso.mod, newx = pred.vals, s = "lambda.min")
yhat[j] <- stats::predict(lasso.mod, newx = x.var[j, ], s = "lambda.min")
# mean cv error
if(progress){
pb$tick()
}
}
# mean prediction error according to bw
mean.bw.error[i] <- sqrt(mean((y.var - yhat)^2))
na.count[i] <- na.count.j
# if(progress) cat("\t", i)
}
# bandwidth corresponding to the smallest cv value
bw <- bwd.vec[which.min(mean.bw.error)]
out <- list(rmspe = mean.bw.error, 'NA' = na.count, bw = bw, bwd.vec = bwd.vec, kernel = kernel,alpha = alpha, adaptive = adaptive)
class(out) <- "gwlest"
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.