R/bw_avg.R

Defines functions bw.CV.F bw.CV.A bw.avg

Documented in bw.avg bw.CV.A bw.CV.F

#' A function for bandwidth selection to calibrate a GWPR model, based on the mean over time of the data.
#' The formula for calculating the CV value for Panel is based on the study of:
#' YU, Danlin. 2010. Exploring spatiotemporally varying regressed relationships: the geographically weighted panel regression analysis.
#' In : Proceedings of the Joint International Conference on Theory, data Handling and modeling in GeoSpatial Information Science. p. 134-
#'
#' @param formula      Regression model formula : Y ~ X1 + ... + Xk
#' @param data         dataFrame for the Panel data
#' @param SDF          large SpatialPolygonsdataFrame on which is based the data
#' @param index        List for the indexes : (c(" ID, Time"))
#' @param approach     score used to optimize the bandwidth (see GWmodel::bw.gwr)
#' @param kernel       gaussian, exponential, bisquare, tricube, boxcar (see GWmodel::gw.weight)
#' @param adaptive     TRUE or FALSE (see GWmodel::gw.weight)
#' @param p	           the power of the Minkowski distance, default is 2, i.e. the Euclidean distance (see GWmodel::bw.gwr)
#' @param longlat      if TRUE, great circle distances will be calculated (see GWmodel::bw.gwr)
#' @param dMat         a distance matrix or vector (Optional parameter, see GWmodel::gw.weight)
#'
#' @return double
#'
#' @export
#' @examples
#' data(USStates)
#' USStates@data$id <- c(1:length(unique(USStates@data[,"state"])))
#' data <- merge(USStates@data, Produc, by="state", all=True)
#' dMat <- GWmodel::gw.dist(sp::coordinates(USStates), p=2, longlat=FALSE)
#' Equation <- log(gsp) ~ log(pcap) + log(pc) + log(emp) + unemp
#' bwAVG.A <- bw.avg(formula=Equation, data=data, SDF=USStates, index=c("id","year"), approach="AICc", kernel="bisquare", adaptive=T, p=2, longlat=FALSE, dMat=dMat)
bw.avg <- function(formula, data, SDF, index, approach=c("CV","AICc"), kernel="bisquare", adaptive=FALSE, p=2, longlat=FALSEALSE, dMat){

  #Data preparation
  pdata <- plm::pdata.frame(data, index = index, drop.index = FALSE, row.names = FALSE,
                            stringsAsFactors = default.stringsAsFactors())
  panelModel <- plm::plm(formula = formula, model="pooling", data=pdata, index=index)
  n <- length(unique(pdata[,index[1]]))
  t <- length(unique(pdata[,index[2]]))
  K <- length(all.vars(formula)[-1])
  y <- plm::pmodel.response(panelModel)
  x <- model.matrix(panelModel)[,-1]

  #Verification that the panel dataset is balanced
  if(dim(pdata)[1]!=n*t) stop("Averaging process available only for balanced panels")

  #Averaging the dataset over time
  idx <- as.numeric(rep(seq(1,n),each=t))
  yAVG <- tapply(y,idx,mean)
  tpms <- function(q) tapply(q,idx,mean)
  xAVG <- apply(x,2,tpms)
  dataAVG <- as.data.frame(cbind(yAVG,xAVG))
  colnames(dataAVG) <- c(formula[[2]], colnames(x))
  SDF@data <- dataAVG

  #Optimization process with transformed data
  formulaAVG <- formula(dataAVG)
  bw <- GWmodel::bw.gwr(formula=formulaAVG, data=SDF, approach=approach, kernel=kernel,
                        adaptive=adaptive, p=p, longlat=longlat, dMat=dMat)
  return(bw)
}

#' A function for bandwidth selection to calibrate a GWPR model, based on the mean over time of the data.

#' Arguments of the function
#' @param formula Regression model formula : Y ~ X1 + ... + Xk
#' @param data dataFrame for the Panel data
#' @param index List for the indexes : (c(" ID, Time"))
#' @param effect the effects introduced in the model, one of "individual", "time", or "twoways" (see plm::plm)
#' @param model one of "pooling", "within", "between", "random", "fd", or "ht" (see plm::plm)
#' @param kernel gaussian,exponential, bisquare, tricube, boxcar (see GWmodel::gw.weight)
#' @param dMat a distance matrix or vector (Optional parameter, see GWmodel::gw.weight)
#' @param bws bandwidths to be used for calculations of CV-score
#'
#' @return double
#'
#' @export
#'
#' @examples
#' data(USStates)
#' USStates@data$id <- c(1:length(unique(USStates@data[,"state"])))
#' data <- merge(USStates@data, Produc, by="state", all=True)
#' dMat <- GWmodel::gw.dist(sp::coordinates(USStates), p=2, longlat=FALSE)
#' Equation <- log(gsp) ~ log(pcap) + log(pc) + log(emp) + unemp
#' bwCV.A <-  bw.CV.A(formula=Equation, data=data, index=c("id","year"), effect='individual', model="within", kernel="bisquare", dMat=dMat, bws=c(30:40))
bw.CV.A <- function(formula, data, index, effect=c("individual", "time", "twoways", "nested"),
                    model=c("within", "random", "ht", "between", "pooling", "fd"),
                    kernel="bisquare", dMat, bws){

  #Data preparation
  Pdata <- plm::pdata.frame(data, index = index, drop.index = FALSE, row.names = FALSE,
                            stringsAsFactors = default.stringsAsFactors())
  n <- length(unique(Pdata[,index[1]]))
  t <- length(unique(Pdata[,index[2]]))
  CVsMat <- matrix(NA, nrow = length(bws), ncol = 2)
  colnames(CVsMat) <- c("Bandwidth", "CV-score")
  resid <- c()

  for(i in 1:length(bws)){
    bw <- bws[i]

    for(j in 1:n){
      W.j <- GWmodel::gw.weight(as.numeric(dMat[j,]), bw=bw, kernel=kernel, adaptive=T)
      W.j[j] <- 0
      W.j <- diag(W.j)
      W.j <- kronecker(W.j,diag(t))
      W.j <- diag(W.j)
      Pdata[['wgt']] <- W.j
      plm.j <- plm::plm(formula=formula, model=model, data=Pdata, index=index, weights=wgt)

      for(l in 1:t){
        resid[(j-1)*t+l] <- plm::pmodel.response(plm.j)[(j-1)*t+l] - predict(plm.j)[(j-1)*t+l]
      }
    }
    CVsMat[i,1] <- bw
    CVsMat[i,2] <- t(resid)%*%(resid)
    Pdata[['wgt']] <- NULL
    resid <- NULL
  }

  bw <- subset(CVsMat, CVsMat[,2]==min(CVsMat[,2], na.rm=TRUE))

  return(as.numeric(bw[,'Bandwidth']))
}

#' A function for bandwith selection to calibrate a GWPR model, based on the mean over time of the data.
#'
#' @param formula       Regression model formula : Y ~ X1 + ... + Xk
#' @param data          dataFrame for the Panel data
#' @param index         List for the indexes : (c(" ID, Time"))
#' @param effect        the effects introduced in the model, one of "individual", "time", or "twoways" (see plm::plm)
#' @param model         one of "pooling", "within", "between", "random", "fd", or "ht" (see plm::plm)
#' @param kernel        gaussian,exponential, bisquare, tricube, boxcar (see GWmodel::gw.weight)
#' @param dMat          a distance matrix or vector (Optional parameter, see GWmodel::gw.weight)
#' @param interval      vector of lowest and highest distances to be used in the bw optimization process
#'
#' @return double
#'
#' @export
#' @examples
#' USStates@data$id <- c(1:length(unique(USStates@data[,"state"])))
#' data <- merge(USStates@data, Produc, by="state", all=True)
#' dMat <- GWmodel::gw.dist(sp::coordinates(USStates), p=2, longlat=FALSE)
#' Equation <- log(gsp) ~ log(pcap) + log(pc) + log(emp) + unemp
#' bwCV.F <-  bw.CV.F(formula=Equation, data=data, index=c("id","year"), effect='individual', model="within", kernel="bisquare", dMat=dMat, interval=c(1500000, 2500000))
bw.CV.F <- function(formula, data, index, effect=c("individual", "time", "twoways", "nested"),
                    model=c("within", "random", "ht", "between", "pooling", "fd"),
                    kernel="bisquare", dMat, interval){

  #Data preparation
  Pdata <- plm::pdata.frame(data, index = index, drop.index = FALSE, row.names = FALSE,
                            stringsAsFactors = default.stringsAsFactors())
  n <- length(unique(Pdata[,index[1]]))
  t <- length(unique(Pdata[,index[2]]))
  resid <- c()

  #Optimization Process
  CV <- function(bw, n, kernel, adaptive){
    for(j in 1:n){
      W.j <- GWmodel::gw.weight(as.numeric(dMat[j,]), bw=bw, kernel=kernel, adaptive=adaptive)
      W.j[j] <- 0
      W.j <- diag(W.j)
      W.j <- kronecker(W.j,diag(t))
      W.j <- diag(W.j)
      Pdata[['wgt']] <- W.j
      plm.j <- plm::plm(formula=formula, model=model, data=Pdata, index=index, weights = wgt)

      for(l in 1:t){
        resid[(j-1)*t+l] <- plm::pmodel.response(plm.j)[(j-1)*t+l] - predict(plm.j)[(j-1)*t+l]
      }
    }
    CVscore <- t(resid)%*%(resid)
    return(CVscore)
  }

  bw.cv <- optimize(CV, interval=interval, n=n, kernel=kernel, adaptive=F, maximum=F)
  bw <- as.numeric(bw.cv)
  names(bw) <- c('Bandwidth', 'CV-score')

  return(bw[['Bandwidth']])
}
LAEQ/gwpr documentation built on June 28, 2020, 8:23 p.m.