R/plot.rd.R
In rddapp: Regression Discontinuity Design Application

Documented in plot.rd

#' Plot the Regression Discontinuity
#' 
#' \code{plot.rd} plots the relationship between the running variable and the outcome.
#' It is based on the \code{plot.RD} function in the "rdd" package.
#' 
#' @method plot rd
#'
#' @param x An \code{rd} object, typically the result of \code{\link{rd_est}}.
#' @param preds An optional vector of predictions generated by \code{\link{predict.rd}}.
#'   If not supplied, prediction is completed within the \code{plot.rd} function. 
#' @param fit_line A character vector specifying models to be shown as fitted lines.
#'   Options are \code{c("linear", "quadratic", "cubic", "optimal", "half", "double")}.
#' @param fit_ci A string specifying whether and how to plot prediction confidence intervals
#'   around the fitted lines. Options are \code{c("area", "dot", "hide")}.
#' @param fit_ci_level A numeric value between 0 and 1 specifying the confidence level of prediction CIs. The default is 0.95. 
#' @param bin_n An integer specifying the number of bins for binned data points. If \code{bin_n} is 0, raw data points are plotted.  
#'   If \code{bin_n} is < 0, data points are suppressed. The default is 20.
#' @param bin_level A numeric value between 0 and 1 specifying the confidence level for CIs around binned data points. The default is 0.95.
#' @param bin_size A string specifying how to plot the number of observations in each bin, by \code{"size"} or \code{"shape"}.
#' @param quant_bin A logical value indicating whether the data are binned by quantiles. The default is \code{TRUE}.
#' @param xlim An optional numeric vector containing the x-axis limits.
#' @param ylim An optional numeric vector containing the y-axis limits.
#' @param include_rugs A logical value indicating whether to include the 1d plot for both axes. The default is \code{FALSE}.
#' @param ... Additional graphic arguments passed to \code{plot}.
#' 
#' @references Drew Dimmery (2016). rdd: Regression Discontinuity Estimation. R package
#'    version 0.57. https://CRAN.R-project.org/package=rdd
#'
#' @importFrom stats qnorm qt quantile na.omit aggregate
#' @importFrom graphics lines arrows legend rug strwidth abline
#' 
#' @include rd_est.R
#' @include predict.rd.R
#' @include treat_assign.R
#'
#' @export
#'
#' @examples 
#' set.seed(12345) 
#' dat <- data.frame(x = runif(1000, -1, 1), cov = rnorm(1000))
#' dat$tr <- as.integer(dat$x >= 0)
#' dat$y <- 3 + 2 * dat$x + 3 * dat$cov + 10 * (dat$x >= 0) + rnorm(1000)
#' rd <- rd_est(y ~ x + tr | cov, data = dat, cutpoint = 0, t.design = "geq") 
#' plot(rd)

plot.rd <- function(x, preds = NULL, 
  fit_line = c("linear", "quadratic", "cubic", "optimal", "half", "double"), 
  fit_ci = c("area", "dot", "hide"), fit_ci_level = .95, bin_n = 20, bin_level = .95, 
  bin_size = c("shade", "size"), quant_bin = TRUE, xlim = NULL, ylim = NULL, 
  include_rugs = FALSE, ...) {
  
  if (!inherits(x, "rd"))
    stop("Not an object of class rd.")
  
  if (is.factor(x$frame[, x$cov]) == TRUE)
    stop("Plotting is currently not supported for covariates that are factors.")
  
  if ("cutpoint" %in% names(x$call)) 
    cut <- eval.parent(x$call$cutpoint) 
  else cut <- 0
  
  if ("t.design" %in% names(x$call)) 
    t.design <- eval.parent(x$call$t.design) 
  else t.design <- "l"

  if (is.null(preds))
    preds <- predict(x)
  
  if (!"Z" %in% names(x$frame))
    x$frame$Z <- treat_assign(x$frame$X, cut, t.design)
  
  d <- as.data.frame(x$frame)
  if (length(x$na.action) > 0) 
    d <- d[-x$na.action, ] 
  
  # CALCULATE CI
  preds$Yhat.linear.ub    = preds$Yhat.linear + qnorm((1 - fit_ci_level) / 2)    * preds$YSE.linear
  preds$Yhat.quadratic.ub = preds$Yhat.quadratic + qnorm((1 - fit_ci_level) / 2) * preds$YSE.quadratic
  preds$Yhat.cubic.ub     = preds$Yhat.cubic + qnorm((1 - fit_ci_level) / 2)     * preds$YSE.cubic
  preds$Yhat.optimal.ub   = preds$Yhat.optimal + qnorm((1 - fit_ci_level) / 2)   * preds$YSE.optimal
  preds$Yhat.half.ub      = preds$Yhat.half + qnorm((1 - fit_ci_level) / 2)      * preds$YSE.half
  preds$Yhat.double.ub    = preds$Yhat.double + qnorm((1 - fit_ci_level) / 2)    * preds$YSE.double
  preds$Yhat.linear.lb    = preds$Yhat.linear - qnorm((1 - fit_ci_level) / 2)    * preds$YSE.linear
  preds$Yhat.quadratic.lb = preds$Yhat.quadratic - qnorm((1 - fit_ci_level) / 2) * preds$YSE.quadratic
  preds$Yhat.cubic.lb     = preds$Yhat.cubic - qnorm((1 - fit_ci_level) / 2)     * preds$YSE.cubic
  preds$Yhat.optimal.lb   = preds$Yhat.optimal - qnorm((1 - fit_ci_level) / 2)   * preds$YSE.optimal
  preds$Yhat.half.lb      = preds$Yhat.half - qnorm((1 - fit_ci_level) / 2)      * preds$YSE.half
  preds$Yhat.double.lb    = preds$Yhat.double - qnorm((1 - fit_ci_level) / 2)    * preds$YSE.double

  covs <- setdiff(names(d), c("X", "Y", "Z"))
  
  ## RESIDUALIZE DATA
  # if (residualize & length(covs) > 0){
  #   d$Y <- d$Y + predict(lm(sprintf("Y ~ 1+%s", paste(c("", covs), collapse = " + ")), d), 
  #     terms = covs, type = "term")
  #   # d$X <- resid(lm(sprintf("X ~ 1+%s", paste(c("", covs), collapse = " + ")), d))
  # }
  
  ## BIN DATA
  if (is.null(bin_n)) 
    bin_n <- -1
  
  if (bin_n > 0) {
    if (quant_bin){
      cut_ptile <- mean(d$X < cut)
      
      ptiles_l <- seq(0, 1, length.out = ceiling((bin_n + 1) * cut_ptile))
      ptiles_r <- seq(0, 1, length.out = ceiling((bin_n + 1) * (1 - cut_ptile)))
      
      b_l <- quantile(d$X[d$X < cut], ptiles_l[-length(ptiles_l)], na.rm = TRUE, type = 1)
      b_r <- quantile(d$X[d$X > cut], ptiles_r[-1], na.rm = TRUE, type = 1)
      b <- c(b_l, cut, b_r)
      bins <- within(data.frame(Xmid = b, bcode = 1:length(b)), {
        Xmid = Xmid + c(diff(Xmid) / 2, 0)
      })
      bins <- bins[-nrow(bins), ]
      
    } else {
      bin_ratio <- (cut - min(d$X, na.rm = TRUE)) / (max(d$X, na.rm = TRUE) - 
        min(d$X, na.rm = TRUE))
      bin_n_l <- ceiling((bin_n + 1) * bin_ratio) 
      bin_n_r <- ceiling((bin_n + 1) * (1 - bin_ratio)) 
      
      b <- c(seq(min(d$X, na.rm = TRUE), cut, length.out = bin_n_l),  # left
        seq(cut, max(d$X, na.rm = TRUE), length.out = bin_n_r)[-1])
      bins <- within(data.frame(Xmid = b[-1], bcode = 1:length(b[-1])), {
        Xmid = Xmid - (Xmid[1] - b[1]) / 2
      })
      
    }

    d$bcode <- .bincode(d$X, b, F, T)
    
    bin_summ <- aggregate(Y ~ bcode + Z, d, 
      function(x) c(Ymean = mean(x), Ysd = sd(x), N = sum(!is.na(x))), na.action = na.omit)
    
    bin_summ <- cbind(bin_summ[, 1:2], bin_summ[, 3])
    bin_summ <- merge(bin_summ, bins, by = c("bcode"), all = TRUE)
    
    bin_summ$Yse <- bin_summ$Ysd / sqrt(bin_summ$N)
    bin_summ$Yub <- qt(1 - (1 - bin_level) / 2, bin_summ$N - 1) * bin_summ$Yse + bin_summ$Ymean
    bin_summ$Ylb <- qt((1 - bin_level) / 2, bin_summ$N - 1) * bin_summ$Yse + bin_summ$Ymean
    
    bin_summ <- bin_summ[!is.na(bin_summ$Ymean),]
  } else {
    bin_summ <- d
    
    bin_summ$Xmid <- bin_summ$X
    bin_summ$Ymean <- bin_summ$Y
    
    bin_size <- setdiff(bin_size, "shade")
    bin_level <- 0
  }
  
  ## PLOT
  cols <- c("red", "blue", "green4", "red4", "blue4", "darkgray")[1:length(fit_line)]
  plot.new()
  if(is.null(xlim)) 
    xlim <- range(d$X)
  if(is.null(ylim))
    ylim <- range(d$Y)
    dy <- 0.1
    ylim <- c((1-dy)*ylim[1], (1+dy)*ylim[2])
  plot.window(xlim = xlim, ylim = ylim)
  
  # POINTS
  if (bin_level > 0 & bin_level < 100)
    arrows(bin_summ$Xmid, bin_summ$Ylb, y1 = bin_summ$Yub, angle = 90, code = 3, length = .05)
  
  if (bin_n >= 0)
    points(bin_summ$Xmid, bin_summ$Ymean, type = "p", 
      pch = 21,
      # col = ifelse(bin_summ$Z, "white", "black"),
      bg = ifelse(bin_summ$Z, "black", "white"),
      cex = if (bin_n > 0 & "size" %in% bin_size & min(bin_summ$N) != max(bin_summ$N))
        (bin_summ$N - min(bin_summ$N)) / (max(bin_summ$N) - min(bin_summ$N)) * 2 + 1 
        else .9
      # bg = if("shade" %in% bin_size & min(bin_summ$N) != max(bin_summ$N)) 
      #   gray(1 - (bin_summ$N - min(bin_summ$N)) / (max(bin_summ$N) - min(bin_summ$N))) 
      #   else "black"
    )
  
  # LINEAR
  if ("linear" %in% fit_line) {
    by(preds, preds$Tr, 
      function(df_p) {
        color <- cols[which(fit_line == "linear")]
        # fit line
        lines(df_p$X, df_p$Yhat.linear, type = "l", lwd = 2, lty = 2, col = color)
        # ci area
        if ("area" %in% fit_ci) {
          ci_area <- rbind(setNames(df_p[order(df_p$X), c("X", "Yhat.linear.ub")], c("X", "Y")),
            setNames(df_p[order(df_p$X, decreasing = TRUE), c("X", "Yhat.linear.lb")], c("X", "Y")))
          polygon(ci_area[,1], ci_area[,2], border = NA, col = adjustcolor(color, alpha.f = .2))
        } 
        if("dot" %in% fit_ci) {
          lines(df_p$X, df_p$Yhat.linear.lb, type = "l", lwd = 2, lty = 3, col = color)
          lines(df_p$X, df_p$Yhat.linear.ub, type = "l", lwd = 2, lty = 3, col = color)
        }
      }
    )
  }
  
  # QUADRATIC
  if ("quadratic" %in% fit_line) {
    by(preds, preds$Tr, 
      function(df_p) {
        color <- cols[which(fit_line == "quadratic")]
        # fit line
        lines(df_p$X, df_p$Yhat.quadratic, type = "l", lwd = 2, lty = 2, col = color)
        # ci area
        if("area" %in% fit_ci) {
          ci_area <- rbind(setNames(df_p[order(df_p$X), c("X", "Yhat.quadratic.ub")], c("X", "Y")), 
            setNames(df_p[order(df_p$X, decreasing = TRUE), c("X", "Yhat.quadratic.lb")], c("X", "Y")))
          polygon(ci_area[,1], ci_area[,2], border = NA, col = adjustcolor(color, alpha.f = .2))
        } 
        if("dot" %in% fit_ci) {
          lines(df_p$X, df_p$Yhat.quadratic.lb, type = "l", lwd = 2, lty = 3, col = color)
          lines(df_p$X, df_p$Yhat.quadratic.ub, type = "l", lwd = 2, lty = 3, col = color)
        }
      }
    )
  }
  
  # CUBIC
  if ("cubic" %in% fit_line) {
    by(preds, preds$Tr, 
      function(df_p){
        color <- cols[which(fit_line == "cubic")]
        # fit line
        lines(df_p$X, df_p$Yhat.cubic, type = "l", lwd=2, lty = 2, col = color)
        # ci area
        if ("area" %in% fit_ci) {
          ci_area <- rbind(setNames(df_p[order(df_p$X), c("X", "Yhat.cubic.ub")], c("X", "Y")), 
            setNames(df_p[order(df_p$X, decreasing = TRUE), c("X", "Yhat.cubic.lb")], c("X", "Y")))
          polygon(ci_area[,1], ci_area[,2], border = NA, col = adjustcolor(color, alpha.f = .2))
        } 
        if ("dot" %in% fit_ci) {
          lines(df_p$X, df_p$Yhat.cubic.lb, type = "l", lwd = 2, lty = 3, col = color)
          lines(df_p$X, df_p$Yhat.cubic.ub, type = "l", lwd = 2, lty = 3, col = color)
        }
      }
    )
  }
  
  # OPTIMAL
  if ("optimal" %in% fit_line) {
    by(preds, preds$Tr, 
      function(df_np) {
        color <- cols[which(fit_line == "optimal")]
        # fit line
        points(Yhat.optimal ~ X, 
          data = subset(df_np, !(is.na(df_np$Yhat.optimal) | is.na(df_np$X))), 
          type = "l", lwd = 2, pch = 17, cex = .8, col = color)
        # ci area
        if ("area" %in% fit_ci) {
          ci_area <- rbind(setNames(df_np[order(df_np$X), c("X", "Yhat.optimal.lb")], c("X", "Y")), 
            setNames(df_np[order(df_np$X, decreasing = TRUE), c("X", "Yhat.optimal.ub")], c("X", "Y")))
          ci_area <- na.omit(ci_area)
          polygon(ci_area[,1], ci_area[,2], border = NA, col = adjustcolor(color, alpha.f = .2))
        } 
        if ("dot" %in% fit_ci) {
          lines(Yhat.optimal.ub ~ X, 
            data = subset(df_np, !(is.na(df_np$Yhat.optimal.ub) | is.na(df_np$X))), 
            type = "l", lwd = 2, lty = 3, col = color)
          lines(Yhat.optimal.lb ~ X, 
            data = subset(df_np, !(is.na(df_np$Yhat.optimal.lb) | is.na(df_np$X))), 
            type = "l", lwd = 2, lty = 3, col = color)
        }
      }
    )
  }
  
  # HALF
  if ("half" %in% fit_line) {
    by(preds, preds$Tr, 
      function(df_np) {
        color <- cols[which(fit_line == "half")]
        # fit line
        points(Yhat.half ~ X, 
          data = subset(df_np, !(is.na(df_np$Yhat.half) | is.na(df_np$X))), 
          type = "l", lwd = 2, pch = 3, cex = .8, col = color)
        # ci area
        if ("area" %in% fit_ci) {
          ci_area <- rbind(setNames(df_np[order(df_np$X), c("X", "Yhat.half.lb")], c("X", "Y")), 
            setNames(df_np[order(df_np$X, decreasing = TRUE), c("X", "Yhat.half.ub")], c("X", "Y")))
          ci_area <- na.omit(ci_area)
          polygon(ci_area[,1], ci_area[,2], border = NA, col = adjustcolor(color, alpha.f = .2))
        } 
        if ("dot" %in% fit_ci) {
          lines(Yhat.half.ub ~ X, 
            data = subset(df_np, !(is.na(df_np$Yhat.half.ub) | is.na(df_np$X))), 
            type = "l", lwd = 2, lty = 3, col = color)
          lines(Yhat.half.lb ~ X, 
            data = subset(df_np, !(is.na(df_np$Yhat.half.lb) | is.na(df_np$X))), 
            type = "l", lwd = 2, lty = 3, col = color)
        }
      }
    )
  }
  
  # DOUBLE
  if ("double" %in% fit_line) {
    by(preds, preds$Tr, 
      function(df_np) {
        color <- cols[which(fit_line == "double")]
        # fit line
        points(Yhat.double ~ X, 
          data = subset(df_np, !(is.na(df_np$Yhat.double) | is.na(df_np$X))), 
          type = "l", lwd = 2, pch = 4, cex = .8, col = color)
        # ci area
        if ("area" %in% fit_ci) {
          ci_area <- rbind(setNames(df_np[order(df_np$X), c("X", "Yhat.double.lb")], c("X", "Y")), 
            setNames(df_np[order(df_np$X, decreasing = TRUE), c("X", "Yhat.double.ub")], c("X", "Y")))
          ci_area <- na.omit(ci_area)
          polygon(ci_area[,1], ci_area[,2], border = NA, col = adjustcolor(color, alpha.f = .2))
        } 
        if ("dot" %in% fit_ci) {
          lines(Yhat.double.ub ~ X, 
            data = subset(df_np, !(is.na(df_np$Yhat.double.ub) | is.na(df_np$X))), 
            type = "l", lwd = 2, lty = 3, col = color)
          lines(Yhat.double.lb ~ X, 
            data = subset(df_np, !(is.na(df_np$Yhat.double.lb) | is.na(df_np$X))), 
            type = "l", lwd = 2, lty = 3, col = color)
        }
      }
    )
  }
  
  # LEGEND
  pos <- legend(
    x = "bottomright",
    legend = c("raw data:", "treated", "untreated"),
    text.width = strwidth("raw data:"),
    pch = c(NA, 19, 1),
    horiz = TRUE,
    bty = "n"
  )
  
  if (bin_n > 0 & length(bin_size) > 0 & min(bin_summ$N) != max(bin_summ$N)) {
    legend(x = pos$rect$left, y = pos$rect$top, yjust = .5,
      pch = c(NA,21,21), 
      legend = as.expression(c("bin size:", bquote(italic(n) == .(min(bin_summ$N))), 
        bquote(italic(n) == .(max(bin_summ$N))))),
      text.width = strwidth("raw data:"),
      horiz = TRUE,
      col = "black",
      bty = "n", 
      # pt.bg = if("shade" %in% bin_size & min(bin_summ$N) != max(bin_summ$N)) 
      #   c(NA,gray(1:0)) 
      #   else "black"
      pt.cex = if ("size" %in% bin_size & min(bin_summ$N) != max(bin_summ$N)) 
        c(NA, 0, 1) * 2 + 1 
        else c(NA, .9, .9)
    )
  }
  
  if (length(fit_line) > 0) {
    pos0 <- legend(x = "topleft", legend = "", bty = "n")
    
    if (any(fit_line %in% c("linear", "quadratic", "cubic"))) {
      pos1 <- legend(x = pos0$rect$left + pos0$rect$w + strwidth("parametric"), y = pos0$rect$top, 
        legend = rep("", sum(fit_line %in% c("linear", "quadratic", "cubic")) + 1),
        col = c(NA, cols[which(fit_line %in% c("linear", "quadratic", "cubic"))]),
        lty = c(NA, rep(2,sum(fit_line %in% c("linear", "quadratic", "cubic")))),
        lwd = 2, bty = "n", merge = FALSE)
      
      text(pos1$rect$left, pos1$text$y, adj = 1, 
        labels = c("parametric:", fit_line[which(fit_line %in% c("linear", "quadratic", "cubic"))]))
    } else {
      pos1 <- pos0
    }
    
    if (any(fit_line %in% c("optimal", "half", "double"))) {
      pos2 <- legend(x = pos1$rect$left + pos1$rect$w + strwidth("non-parametric"), y = pos0$rect$top, 
        legend = rep("", sum(fit_line %in% c("optimal", "half", "double")) + 1),
        col = c(NA, cols[which(fit_line %in% c("optimal", "half", "double"))]),
        # pch = c(NA, c(17,3,4)[which(c("optimal", "half", "double") %in% fit_line)]),
        lty = c(NA, rep(1, sum(fit_line %in% c("optimal", "half", "double")))),
        lwd = 2, bty = "n")
      
      text(pos2$rect$left, pos2$text$y, adj = 1, 
        labels = c("non-parametric:", fit_line[which(fit_line %in% c("optimal", "half", "double"))]))
    }    
  }
  
  ## OTHER ELEMENTS
  if (include_rugs) {
    rug(x = d$X, side = 1, ticksize = .01)
    rug(x = d$Y, side = 2, ticksize = .01)
  }

  # plot the cutoff
  abline(v = cut, col = "black", lty = 2)
  
  box()
  axis(1)
  axis(2)
  
}