R/QGI.R

Defines functions QGI

library(foreach)
library(doParallel)
library(MatchIt)
library(devtools)

#' Plot Propensity Model
#'
#' Matches samples of groups of treated and control observations that have a spatial distribution (i.e., latitude and longitude coordinates), so that matched groups have (1) similar covariate distributions [via a propensity score matching], and (2) do not result in paired matches being geographically proximate.  The package also provides a distance-decay estimate of the treatment effect (i.e., the impact a treatment had on a defined outcome) following the procedure outlined in Runfola and Batra et al. (2020) <https://doi.org/10.3390/su12083225>.
#' @param df A R dataframe containing your observations
#' @param distanceCol The column in your dataframe containing the distance in kilometers between each observation and the intervention locations
#' @param controlVars List of all independent variables to be used to model propensity and outcome
#' @param outcomeVar The column in your dataframe containing the outcome variable 
#' @param density Total number of estimations to be made, equally spaced between the lower and upper distance bound. Defaults to 50.
#' @param lowerDistBound Smallest distance (in kilometers) to calculate results for. Defaults to 0km.
#' @param upperDistBound Largest distance (in kilometers) to calculate results for. Defaults to the median of distance in the data.
#' @param enforcedMinimumDistance Minimum distance (in kilometers) between a location considered treated and untreated, across the full dataset.  Defaults to 5km.
#' @param maximum_match_diff From 0 to 1, the largest difference in match quality allowed. 
#' @param cores The number of computer cores to use to fit the QGI model.  Defaults to number of cores on machine - 1.
#' @keywords
#' @export
#' @examples
#' QGI(read.csv("../df.csv"), independent_vars, x,x,x, 30*16)

QGI <- function(df, 
                distanceCol = "distance",
                controlVars, 
                outcomeVar = "outcome", 
                density = 50, 
                lowerDistBound=0.0, 
                upperDistBound="Default", 
                maximum_match_diff=0.9, 
                enforcedMinimumDistance=5,
                strategyNA = "Omit",
                pScoreMethod = "linear",
                pScoreDistance = "logit",
                matchDiscardStrategy = "both",
                pScoreReEstimate = TRUE,
                minN = 10,
                matchQualityWeighting = TRUE,
                cores = "Default") 
  {
  
  if(cores == "Default")
    {
      cores=parallel::detectCores()
    }

  if(strategyNA == "Omit")
    {
      df <- na.omit(df)
    }

  cl <- parallel::makeForkCluster(cores[1]-1) 
  doParallel::registerDoParallel(cl)

  if(upperDistBound == "Default")
    {
    upperDistBound = as.numeric(summary(df[,distanceCol])[3])
    }
  
  mc = 0
  #define sampling frame, with as much coverage as possible
  #given a defined density
  
  if(density <= 1)
    {
    density = 2
    print("Sample Density set to minimum allowed (2)")
    }

  #trials = seq(upperDistBound, lowerDistBound, (lowerDistBound-upperDistBound)/(density-1))
  trials = seq(lowerDistBound, upperDistBound, (upperDistBound-lowerDistBound)/(density-1))
  
  mc <- foreach(i = 1:length(trials), .combine=rbind) %dopar% {    
    
  #Rename the distance column to avoid later confusion with p-score distances
  df$geogDistance <- df[,distanceCol]
  
  #Set initial treatment value to 999 (representative of a null)
  df$Treatment <- 999
  
  #Identify the distance threshold for this iteration
  dist_thresh <- trials[i] 

  #Create a binary indicating if the location is considered "treated" this iteration
  #Or control.
  df$Treatment[df$geogDistance <= (0 + dist_thresh)] <- 1
  df$Treatment[df$geogDistance >= (enforcedMinimumDistance + dist_thresh)] <- 0
  
  #Count the number of treatment and control cases for output.
  df$Control = 0
  df$Control[df$Treatment == 0] <- 1

  treatCount = sum(df$Treatment)
  controlCount = sum(df$Control)

  #Remove any cases that were not assigned to treatment or control
  #Can happen if there are data errors (i.e., NAs in distance)
  df <- df[!df$Treatment == 999,]
  
  #Remove the distance column in prep for propensity matching
  df[, !(names(df) == "distance")]
  
  #Pscore variables
  pVars <- c(controlVars,"Treatment")

  #Analysis variables
  aVars <- c(controlVars,"Treatment","outcome")
  
  #construct propensity score formula
  f1 <- as.formula(paste("Treatment", paste(controlVars, collapse = " + "), sep = " ~ "))
  
  #Calculate our propensity scores using matchit
  pscore.Calc <- matchit(f1, 
                          data=df[pVars], 
                          method=pScoreMethod, 
                          distance=pScoreDistance,
                          discard=matchDiscardStrategy,
                          reestimate=pScoreReEstimate)
  
  matched.df <- match.data(pscore.Calc)
    
  if(nrow(matched.df) > minN)
  {
    #construct outcome modeling formula
    idx = as.numeric(rownames(matched.df))
    matched.df$outcome <- df[idx,][[outcomeVar]]
    f2 <- as.formula(paste("outcome", paste(c("Treatment", controlVars), collapse = " + "), sep = " ~ "))
    
    trtModel <- lm(f2, data=matched.df[aVars])
    
    
    match_diff = abs(summary(pscore.Calc)$sum.matched[[1]][1] - summary(pscore.Calc)$sum.matched[[2]][1])
    
    return( list(i, dist_thresh, trtModel$coef["Treatment"][[1]], nrow(df), match_diff, summary(trtModel)$r.squared, coef(summary(trtModel))[2,4], coef(summary(trtModel))[2,2], nrow(matched.df), treatCount, controlCount))
  } 

  }


  #Create visualizations and outputs
  tDF <- data.frame(t(matrix(unlist(mc), nrow=11, byrow=T)))
  print("col")
  colnames(tDF) <- c("ItId", "thresh", "coef", "obs", "match_diff", "R2", "TreatSig", "StdError", "ItSampleSize", "ItTreatmentCount", "ItControlCount")
  print("out")
  
  # #Scale for data visualization and (optionally) weighting
  tDF$matchWeight = 1 - ((tDF$match_diff - min(tDF$match_diff)) / (max(tDF$match_diff) - min(tDF$match_diff)))

  # #Third-order polynomial for the distance-decay
  tDF$b = tDF$thresh_km**2
  tDF$c = tDF$thresh_km**3
  tDF$d = tDF$thresh_km**4
  
  tDF <- tDF[order(tDF$thresh_km),]

  if(matchQualityWeighting == TRUE)
  {
    mean_mdl = lm(coef ~ thresh_km + b + c, data = tDF, weights =matchWeight)
    std_mdl = lm(StdError ~ thresh_km + b + c, data = tDF, weights =matchWeight)
  } else {
    mean_mdl = lm(coef ~ thresh_km + b + c, data = tDF)
    std_mdl = lm(StdError ~ thresh_km + b + c, data = tDF)
  }

  makeVisualization <- function (tDF){
  par(mfrow = c(1, 1),     # 2x2 layout
    oma = c(2, 2, 0, 0), # two rows of text at the outer left and bottom margin
    mar = c(3, 3, 0, 0), # space for one row of text at ticks and to separate plots
    mgp = c(2, 1, 0),    # axis label at 2 rows distance, tick labels at 1 row
    xpd = FALSE)   
  ylim_upper <- max(tDF$coef + 1.96*tDF$StdError, na.rm = TRUE)
  ylim_lower <- min(tDF$coef - 1.96*tDF$StdError, na.rm = TRUE)
  plot(tDF$thresh_km, tDF$coef, cex = tDF$size, xlab="Distance (km)", ylab=outcomeVar, ylim=c(min(0,ylim_lower), max(ylim_upper,0)))

  newx <- seq(min(tDF$thresh_km), max(tDF$thresh_km), length.out=1000)

  preds <- predict(mean_mdl, newdata=data.frame(thresh_km=newx, b=newx**2, c=newx**3), interval='confidence')
  preds_std <- predict(std_mdl, newdata=data.frame(thresh_km=newx, b=newx**2, c=newx**3), interval='confidence')
  polygon(c(rev(newx), newx), c(rev( preds[ ,3] + 1.96*preds_std[1:1000]),preds[ ,2] - 1.96*preds_std[1:1000]), col=rgb(0.2, 0.2, 0.25,0.25), border = NA)
  
  #polygon(c(rev(newx), newx), c(rev(preds[ ,3]), preds[ ,2]), col=rgb(0, 1, 0,0.25), border = NA)
  lines(newx, preds[ ,3] + 1.96*preds_std[1:1000], lty = 'dashed', col = 'black')
  lines(newx, preds[ ,2] - 1.96*preds_std[1:1000], lty = 'dashed', col = 'black')
  lines(newx, preds[ ,3], lty = 'dashed', col = 'yellow')
  lines(newx, preds[ ,2], lty = 'dashed', col = 'yellow')
  legend(5, 400, legend=c("Best Fit", "Lower Match Quality", "Higher Match Quality"), col=c("blue","black", "black"), lty=c(1,NA, NA), pch=c(NA,1,1), cex=0.8, pt.cex=c(NA, 0.5, 1))
  lines(tDF$thresh_km, fitted(mean_mdl), col="blue")
  abline(h = 0, lty = 2)
  }

 
  # #Overall impact 
  print(mean(unlist(tDF[tDF$thresh_km >=2.66 & tDF$thresh_km <=6,]["coef"][1])))
  upper_std = preds[ ,3] + 1.96*preds_std[1:1000]
  print(mean(unlist(tDF[tDF$thresh_km >=min(newx[which(upper_std<0)]) & tDF$thresh_km <=max(newx[which(upper_std<0)]),]["coef"][1])))
  print(paste('significant distance intervel: ',min(newx[which(upper_std<0)]),max(newx[which(upper_std<0)])))
  lower_std = preds[ ,2] - 1.96*preds_std[1:1000]
  print(mean(unlist(tDF[tDF$thresh_km >=min(newx[which(lower_std>0)]) & tDF$thresh_km <=max(newx[which(lower_std>0)]),]["coef"][1])))
  print(paste('significant distance intervel: ',min(newx[which(lower_std>0)]),max(newx[which(lower_std>0)])))
  print(paste('plotted distance range: ',min(tDF$thresh_km),' ',max(tDF$thresh_km)))
  return(makeVisualization(tDF))

}

  #plot(tDF$thresh_km, tDF$coef, cex = tDF$size, xlab="Distance (km)", ylab=outcome_label, ylim=c(-0.5,0.5))
  #return(c(tDF, min(newx[which(lower_std>0)]),max(newx[which(lower_std>0)])))
  
wmgeolab/QGI documentation built on Oct. 13, 2020, 1:28 a.m.