diftrans: Compute the Differences-in-Transports Estimator

### Header ---------------------------
###
### Title: reproduce_results.R
###
### Description: R script to reproduce results of Daljord, Pouliot, Hu, and Xiao (2021).
###
### Author: Omkar A. Katta
###
### Notes: User must change the information in the Preliminaries Section
###       as needed. In particular, the `version` variable, the file paths,
###       and the plotting parameters can be changed to fit the user's preferences.
###
###       User needs to change file paths to source functions used to prepare and
###       plot the data.
###
###       Due to privacy restrictions, the original data set cannot be distributed.
###       Hence, an anonymized version of the data set will be provided.
###       The results of this script will therefore differ slightly from the paper's results.
###

### Preliminaries ---------------------------

# Install Packages (as needed)
if (!require(diftrans, quietly = T)){
  if (!require(devtools, quietly = T)){
    install.packages("devtools")
  }
  devtools::install_github("omkarakatta/diftrans")
}
if (!require(magrittr, quietly = T)){
  install.packages("magrittr")
}
if (!require(ggplot2, quietly = T)){
  install.packages("ggplot2")
}
if (!require(gridExtra, quietly = T)){
  install.packages("gridExtra")
}
if (!require(dplyr, quietly = T)){
  install.packages("dplyr")
}
if (!require(tidyr, quietly = T)){
  install.packages("tidyr")
}
if (!require(stargazer, quietly = T)){
  install.packages("stargazer")
}
if (!require(here, quietly = T)){
  install.packages("here")
}

library(diftrans)
library(magrittr)
library(ggplot2)
library(gridExtra)
library(dplyr)
library(tidyr)
library(stargazer)
library(latex2exp)

# source functions: CHANGE FILE PATH
source(here::here("data-raw", "prepare_data.R"))
source(here::here("data-raw", "plotting_functions.R"))


# What data do you want to use?
# Option 1: "original" i.e. use entire data set
# Option 2: "v1" i.e. exclude December 2010
# Option 3: "v2" i.e. exclude December 2010, January 2011, and February 2012
version <- "original"

# Do you want to see figures?
show_fig <- TRUE
show_fig1 <- FALSE
show_fig2 <- FALSE
show_fig3 <- FALSE
show_footnote8 <- FALSE
show_fig4 <- FALSE
show_fig5 <- FALSE
show_fig6 <- FALSE
show_fig7 <- FALSE
show_fig8 <- FALSE
show_fig9 <- FALSE
show_fig10 <- FALSE

# Do you want to save figures?
save_fig <- FALSE
save_fig1 <- FALSE
save_fig2 <- FALSE
save_fig3 <- FALSE
save_fig4 <- FALSE
save_fig5 <- FALSE
save_fig6 <- FALSE
save_fig7 <- FALSE
save_fig8 <- FALSE
save_fig9 <- FALSE
save_fig10 <- FALSE

# Do you want to run diff-in-diff analysis
show_diff_in_diff <- FALSE

# Plotting Preferences

grayscale <- FALSE # toggle to TRUE for black-and-white plots
temp <- ifelse(grayscale, "grayscale", "color")

fontsize <- 20 # change font size of plot, axis, and legend titles
fontsizeaxis <- 15 # change font size of axis labels
linetype0 <- "solid" # main line type
linetype1 <- "dashed" # secondary line type
linetype2 <- "dotted" # tertiary line type
linetype3 <- "twodash"
linetype4 <- "longdash"

# Where will you store plots?
# img_path <- ... # uncomment and replace ... with destination file path
# suffix <- "_"  # naming convention

# How large do you want your plots to be? (inches)
# default_width <- 7
# default_height <- 3

### Prepare Data ---------------------------

if (version == "original") {
  Beijing <- Beijing_sample
  Tianjin <- Tianjin_sample
  Shijiazhuang <- Shijiazhuang_sample
  seedplus <- 0
  message(paste("Version: ", version, sep = ""))
} else if (version == "v1") {
  # filter out Dec 2010
  Beijing <- Beijing_sample %>%
    filter(!(year == 2010 & month == 12))
  Tianjin <- Tianjin_sample %>%
    filter(!(year == 2010 & month == 12))
  Shijiazhuang <- Shijiazhuang_sample %>%
    filter(!(year == 2010 & month == 12))
  seedplus <- 1
  message(paste("Version: ", version, sep = ""))
} else if (version == "v2") {
  # filter out Dec 2010, Jan 2011, and Feb 2011
  Beijing <- Beijing_sample %>%
    filter(!(year == 2010 & month == 12)) %>%
    filter(!(year == 2011 & month %in% c(1, 2)))
  Tianjin <- Tianjin_sample %>%
    filter(!(year == 2010 & month == 12)) %>%
    filter(!(year == 2011 & month %in% c(1, 2)))
  Shijiazhuang <- Shijiazhuang_sample %>%
    filter(!(year == 2010 & month == 12)) %>%
    filter(!(year == 2011 & month %in% c(1, 2)))
  seedplus <- 2
  message(paste("Version: ", version, sep = ""))
} else {
  stop("Specify a valid value for `version`.")
}

### Figure 1 ---------------------------

fignum <- 1
if (show_fig | show_fig1) {

  BTS <- do.call("rbind", list(Beijing, Tianjin, Shijiazhuang))
  fig1_prep <- BTS %>%
    select(ym, city, swtprice) %>%
    group_by(ym, city) %>%
    summarize(swtprice = mean(swtprice, na.rm = T)) %>%
    ungroup()

  fig1_plot <- ggplot() +
    bmp_vline(xint = as.Date("2011-01-01")) +
    bmp_vline(xint = as.Date("2014-01-01")) +
    bmp_point(x = ym,
              y = swtprice,
              data = fig1_prep,
              color = factor(city,
                             levels = c("Beijing", "Tianjin", "Shijiazhuang")),
              size = 1.5,
              alpha = 0.5,
              show.legend = F) +
    bmp_plot(data = fig1_prep,
             color = city,
             size = 0.5,
             xlab = "Date (months)", ylab = "Price (RMB 1000)",
             xtype = "ym",
             ytype = "continuous",
             ybreaks = seq(100, 225, 20),
             ylabels = seq(100, 225, 20),
             legend.position = c(0.1, 0.85),
             legend.direction = "vertical",
             sizefont = (fontsize - 8),
             axissizefont = (fontsizeaxis - 5)) +
    geom_line(data = fig1_prep, aes(x = ym, y = swtprice,
                                    color = factor(city,
                                                   levels = c("Beijing", "Tianjin", "Shijiazhuang")),
                                    linetype = factor(city,
                                                      levels = c("Beijing", "Tianjin", "Shijiazhuang"))),
              size = 0.5) +
    scale_linetype_manual(values = c(linetype0, linetype1, linetype2),
                          name = "")

  if (save_fig | save_fig1){
    ggsave(paste("fig", fignum, suffix, "OK.jpg", sep = ""), path = img_path,
           width = default_width, height = default_height+1, units = "in")
    message(paste("fig", fignum, " is saved in ", img_path, " as fig", fignum, suffix, "OK.jpg", sep = ""))
  }

  message(paste("Figure ", fignum, " is complete.", sep = ""))
}

### Figure 2 ---------------------------

fignum <- 2
if (show_fig | show_fig2) {

  pre <- prep_data(Beijing, prep = "pmf",
                   lowerdate = "2010-01-01", upperdate = "2011-01-01")
  # find prices of new cars in 2011
  post <- Beijing %>%
    filter(ym < as.Date("2012-01-01")) %>%
    pivot_wider(id_cols = c(month, color, noticenum),
                       names_from = year,
                       values_from = MSRP) %>%
    filter(is.na(`2010`)) %>%
    rename(MSRP = `2011`) %>%
    select(MSRP)

  fig2_plot <- ggplot() +
    bmp_twohist(data1 = pre, data2 = post,
                x = log(MSRP), scale = 1, binwidth = 1/6) +
    stat_function(data = pre,
                  mapping = aes(x = log(MSRP)),
                  color = get_color_palette(2, grayscale)[1],
                  fun = dnorm,
                  args = list(mean = mean(log(pre$MSRP)),
                              sd = sd(log(pre$MSRP)))) +
    stat_function(data = post,
                  mapping = aes(x = log(MSRP)),
                  color = get_color_palette(2, grayscale)[2],
                  fun = dnorm,
                  args = list(mean = mean(log(post$MSRP)),
                              sd = sd(log(post$MSRP)))) +
    bmp_plot(data = Beijing,
             color = policy_dummy,
             fill = policy_dummy,
             legendlabels = c("pre-lottery", "post-lottery"),
             xlab = "log MSRP (log RMB)",
             ylab = "Density",
             sizefont = (fontsize - 8),
             axissizefont = (fontsizeaxis - 5))

  if (save_fig | save_fig2) {
    ggsave(paste("fig", fignum, suffix, "OK.jpg", sep = ""), path = img_path,
           width = default_width, height = default_height+1, units = "in")
    message(paste("fig", fignum, " is saved in ", img_path, " as fig", fignum, suffix, "OK.jpg", sep = ""))
  }

  message(paste("Figure ", fignum, " is complete.", sep = ""))
}

### Figure 3 ---------------------------

fignum <- 3
if (show_fig | show_fig3) {

  pre <- prep_data(Beijing, prep = "dist",
                   lowerdate = "2010-01-01", upperdate = "2011-01-01")
  post <- prep_data(Beijing, prep = "dist",
                    lowerdate = "2011-01-01", upperdate = "2012-01-01")

  fig3_plot <- ggplot() +
    bmp_twohist(data1 = pre, data2 = post,
                scale = 1000,
                x = MSRP, binwidth = 20) +
    bmp_plot(data = Beijing,
             color = policy_dummy,
             fill = policy_dummy,
             legendlabels = c("pre-lottery", "post-lottery"),
             xtype = "continuous",
             xbreaks = seq(0, 1400, by = 100),
             ytype = "continuous",
             ybreaks = seq(0, 0.008, by = 0.002),
             xlab = "MSRP (RMB 1000)",
             ylab = "Density",
             sizefont = (fontsize - 8),
             axissizefont = (fontsizeaxis - 5))

  if (save_fig | save_fig3) {
    ggsave(paste("fig", fignum, suffix, "OK.jpg", sep = ""), path = img_path,
           width = default_width, height = default_height+1, units = "in")
    message(paste("fig", fignum, " is saved in ", img_path, " as fig", fignum, suffix, "OK.jpg", sep = ""))
  }

  message(paste("Figure ", fignum, " is complete.", sep = ""))
}

### Footnote 8 ---------------------------

if (show_fig | show_footnote8) {
  support <- prep_data(data = Beijing, prep = "support")
  pre <- prep_data(Beijing, prep = "pmf",
                   support = support,
                   lowerdate = "2010-01-01", upperdate = "2011-01-01")
  post <- prep_data(Beijing, prep = "pmf",
                    support = support,
                    lowerdate = "2011-01-01", upperdate = "2012-01-01")

  # create L1 penalty cost
  Lpcost <- function(min_pre_support, min_post_support, p) {
    costm <- matrix(NA_real_, nrow = length(min_pre_support), ncol = length(min_post_support))
    for (i in seq_along(min_pre_support)) {
        costm[i, ] <- abs(min_pre_support[i] - min_post_support)^p
    }
    costm
  }


  # initialize values
  bandwidth_seq <- seq(0, 100000, 1000)
  lambda_seq <- c(0, 0.01)
  results <- matrix(NA_real_,
                    nrow = length(bandwidth_seq), ncol = length(lambda_seq))


  d <- 25000
  lambda <- 0.01
  l1_cost <- Lpcost(support, support, 1)
  l0_cost <- build_costmatrix(support = support, bandwidth = d)
  cost_mat <- l0_cost + lambda + l1_cost
  OT <- transport(
      as.numeric(sum(post$count) / sum(pre$count) * pre$count),
      as.numeric(post$count),
      cost_mat
  )

  pre_support <- unique(pre$MSRP[pre$count != 0 & !is.na(pre$MSRP)])
  post_support <- unique(post$MSRP[post$count != 0 & !is.na(post$MSRP)])
  OT_revised <- OT %>%
    dplyr::mutate(from = pre_support[from]) %>%
    dplyr::mutate(to = post_support[to])
  common <- unique(sort(c(OT_revised$from, OT_revised$to)))
  OT_final <- OT_revised %>%
    dplyr::mutate(abs_diff = abs(from - to)) %>%
    dplyr::group_by(abs_diff) %>%
    dplyr::summarise(total = sum(mass))

  #~ running sum of penalized optimal transport at d* = 25000
  total_mass <- sum(OT_final$total[OT_final$abs_diff > 25000])
  OT_running_sum <- OT_final %>%
    dplyr::arrange(desc(abs_diff)) %>%
    dplyr::mutate(total_lag = dplyr::lag(total)) %>%
    tidyr::replace_na(list(total_lag = 0)) %>%
    dplyr::mutate(cumulative = cumsum(total_lag)) %>%
    dplyr::mutate(percentage = cumulative / total_mass * 100) %>%
    dplyr::arrange(abs_diff) %>%
    dplyr::filter(abs_diff >= 25000)

  # plot
  ggplot(OT_running_sum, aes(x = abs_diff, y = percentage)) +
    geom_line() +
    theme_bmp(sizefont = (fontsize - 8), axissizefont = (fontsizeaxis - 5)) +
    xlab("d") +
    ylab("Mass above d (% of total mass > 25000)") +
    scale_y_continuous(breaks = seq(0, max(OT_running_sum$percentage), 10)) +
    scale_x_continuous(breaks = seq(25000, max(OT_running_sum$abs_diff), 100000))

  # table
  OT_running_sum %>%
    dplyr::filter(abs_diff %% 5000 < 1000) %>%
    dplyr::select(abs_diff, cumulative, percentage) %>%
    knitr::kable(., format = "rst", booktabs = T, linesep = "", longtable = T, align = 'c')
}


### Figure 4 ---------------------------

fignum <- 4
if (show_fig | show_fig4) {

  pre <- prep_data(Beijing, prep = "pmf",
                   lowerdate = "2010-01-01", upperdate = "2011-01-01")
  post <- prep_data(Beijing, prep = "pmf",
                    lowerdate = "2011-01-01", upperdate = "2012-01-01")

  scalex <- 1000

  color_fig4a <- ifelse(grayscale,
                        get_color_palette(2, grayscale)[[1]],
                        get_color_palette(2, grayscale)[[1]])
  fig4a_OK <- ggplot() +
    geom_segment(data = pre,
                 mapping = aes(x = MSRP / scalex, xend = MSRP / scalex,
                               y = 0, yend = count),
                 alpha = 0.5,
                 color = color_fig4a) +
    bmp_plot(data = Beijing,
             xtype = "continuous",
             xbreaks = seq(0, max(Beijing$MSRP, na.rm = T)/scalex, by = 200000/scalex),
             ytype = "continuous",
             ybreaks = seq(0, max(pre$count, post$count), by = 5000),
             ylims = c(0, max(pre$count, post$count)),
             ylab = "Probability Mass Function",
             xlab = "MSRP (RMB 1000)",
             ggtitle = "Pre-Lottery",
             sizefont = (fontsize - 8),
             axissizefont = (fontsizeaxis - 5))

  color_fig4b <- ifelse(grayscale,
                        get_color_palette(3, grayscale)[[2]],
                        get_color_palette(2, grayscale)[[2]])
  fig4b_OK <- ggplot() +
    geom_segment(data = post,
                 mapping = aes(x = MSRP / scalex, xend = MSRP / scalex,
                               y = 0, yend = count),
                 alpha = 0.5,
                 color = color_fig4b) +
    bmp_plot(data = Beijing,
             xtype = "continuous",
             xbreaks = seq(0, max(Beijing$MSRP, na.rm = T)/scalex, by = 200000/scalex),
             ytype = "continuous",
             ybreaks = seq(0, max(pre$count, post$count), by = 5000),
             ylims = c(0, max(pre$count, post$count)),
             # ylab = "Probability Mass Function",
             xlab = "MSRP (RMB 1000)",
             ggtitle = "Post-Lottery",
             sizefont = (fontsize - 8),
             axissizefont = (fontsizeaxis - 5))

  # fig4_plot_show <- grid.arrange(fig4a_OK, fig4b_OK, ncol = 2)
  fig4_plot <- arrangeGrob(fig4a_OK, fig4b_OK, ncol = 2)

  if (save_fig | save_fig4) {
    ggsave(paste("fig", fignum, suffix, "OK.jpg", sep = ""), fig4_plot, path = img_path,
           width = default_width, height = default_height+1, units = "in")
    message(paste("fig", fignum, " is saved in ", img_path, " as fig", fignum, suffix, "OK.jpg", sep = ""))
  }

  message(paste("Figure ", fignum, " is complete.", sep = ""))
}

### Figure 5 ---------------------------

fignum <- 5
if (show_fig | show_fig5) {

  pre <- prep_data(Beijing, prep = "dist",
                   lowerdate = "2010-01-01", upperdate = "2011-01-01")
  post <- prep_data(Beijing, prep = "dist",
                    lowerdate = "2011-01-01", upperdate = "2012-01-01")

  upperxlim <- 1400000
  scale <- 1000

  da <- 30000 / scale
  fig5a_OK <- ggplot() +
    bmp_twohist(data1 = pre, data2 = post,
                x = MSRP, scale = scale, binwidth = da) +
    bmp_plot(data = Beijing,
             color = policy_dummy,
             fill = policy_dummy,
             legendlabels = c("pre-lottery", "post-lottery"),
             xtype = "continuous",
             xbreaks = seq(0, upperxlim / scale, by = 100000 / scale),
             xlims = c(0, upperxlim/scale),
             ytype = "continuous",
             ybreaks = seq(0, 0.008, by = 0.002),
             # xlab = "MSRP (RMB 1000)",
             ylab = "Density",
             xangle = 45,
             sizefont = (fontsize - 8),
             axissizefont = (fontsizeaxis - 5)) +
    ggtitle(TeX(paste("\\textit{d} = ", da*scale, sep = "")))
  # fig5a_OK

  db <- 50000 / scale
  fig5b_OK <- ggplot() +
    bmp_twohist(data1 = pre, data2 = post,
                x = MSRP, scale = scale, binwidth = db) +
    bmp_plot(data = Beijing,
             color = policy_dummy,
             fill = policy_dummy,
             legendlabels = c("pre-lottery", "post-lottery"),
             xtype = "continuous",
             xbreaks = seq(0, upperxlim / scale, by = 100000 / scale),
             xlims = c(0, upperxlim/scale),
             ytype = "continuous",
             ybreaks = seq(0, 0.008, by = 0.002),
             # xlab = "MSRP (RMB 1000)",
             # ylab = "Density",
             xangle = 45,
             ggtitle = paste("d = ", db*scale, sep = ""),
             sizefont = (fontsize - 8),
             axissizefont = (fontsizeaxis - 5)) +
    ggtitle(TeX(paste("\\textit{d} = ", db*scale, sep = "")))
  # fig5b_OK

  dc <- 70000 / scale
  fig5c_OK <- ggplot() +
    bmp_twohist(data1 = pre, data2 = post,
                x = MSRP, scale = scale, binwidth = dc) +
    bmp_plot(data = Beijing,
             color = policy_dummy,
             fill = policy_dummy,
             legendlabels = c("pre-lottery", "post-lottery"),
             xtype = "continuous",
             xbreaks = seq(0, upperxlim / scale, by = 100000 / scale),
             xlims = c(0, upperxlim/scale),
             ytype = "continuous",
             ybreaks = seq(0, 0.008, by = 0.001),
             xlab = "MSRP (RMB 1000)",
             ylab = "Density",
             xangle = 45,
             ggtitle = paste("d = ", dc*scale, sep = ""),
             sizefont = (fontsize - 8),
             axissizefont = (fontsizeaxis - 5)) +
    ggtitle(TeX(paste("\\textit{d} = ", dc*scale, sep = "")))
  # fig5c_OK

  dd <- 90000 / scale
  fig5d_OK <- ggplot() +
    bmp_twohist(data1 = pre, data2 = post,
                x = MSRP, scale = scale, binwidth = dd) +
    bmp_plot(data = Beijing,
             color = policy_dummy,
             fill = policy_dummy,
             legendlabels = c("pre-lottery", "post-lottery"),
             xtype = "continuous",
             xbreaks = seq(0, upperxlim / scale, by = 100000 / scale),
             xlims = c(0, upperxlim/scale),
             ytype = "continuous",
             ybreaks = seq(0, 0.008, by = 0.001),
             xlab = "MSRP (RMB 1000)",
             # ylab = "Density",
             xangle = 45,
             ggtitle = paste("d = ", dd*scale, sep = ""),
             sizefont = (fontsize - 8),
             axissizefont = (fontsizeaxis - 5)) +
    ggtitle(TeX(paste("\\textit{d} = ", dd*scale, sep = "")))
  # fig5d_OK

  # fig5_plot_show <- grid.arrange(fig5a_OK, fig5b_OK,
  #                                fig5c_OK, fig5d_OK,
  #                                nrow = 2)
  fig5_plot <- arrangeGrob(fig5a_OK, fig5b_OK,
                           fig5c_OK, fig5d_OK,
                           nrow = 2)

  if (save_fig | save_fig5) {
    ggsave(paste("fig", fignum, suffix, "OK.jpg", sep = ""), fig5_plot, path = img_path,
           width = default_width, height = default_height+5, units = "in")
    message(paste("fig", fignum, " is saved in ", img_path, " as fig", fignum, suffix, "OK.jpg", sep = ""))
  }

  message(paste("Figure ", fignum, " is complete.", sep = ""))
}

### Figure 6 ---------------------------

fignum <- "6"
if (show_fig | show_fig6) {

  set.seed(11 + seedplus)
  support <- prep_data(data = Beijing, prep = "support")

  pre <- prep_data(Beijing, prep = "pmf",
                   support = support,
                   lowerdate = "2010-01-01", upperdate = "2011-01-01")
  post <- prep_data(Beijing, prep = "pmf",
                    support = support,
                    lowerdate = "2011-01-01", upperdate = "2012-01-01")
  bandwidth_seq <- seq(0, 100000, 1000)

  real <- diftrans(pre, post, bandwidth_seq = bandwidth_seq, conservative = F)

  numsims <- 500
  placebo_results <- matrix(NA, nrow = numsims, ncol = length(bandwidth_seq))
  for (i in seq_len(numsims)) {
    print(paste("Simulation Number", i, "out of", numsims, sep = " "))
    # draw n_B,2010 samples from Beijing, 2010
    synth_pre1 <- data.frame(MSRP = pre$MSRP,
                             count = rmultinom(1, sum(pre$count), pre$count))
    # draw n_B,2011 samples from Beijing, 2010
    synth_pre2 <- data.frame(MSRP = pre$MSRP,
                             count = rmultinom(1, sum(post$count), pre$count))
    placebo <- diftrans(synth_pre1, synth_pre2,
                        bandwidth_seq = bandwidth_seq, conservative = F)
    placebo_results[i, ] <- placebo$main
  }

  placebo_mean <- apply(placebo_results, 2, mean) * 100
  placebo_sd <- apply(placebo_results, 2, sd) * 100
  probs <- c(0.90, 0.95, 0.99)
  placebo_quantiles <- apply(placebo_results, 2, quantile, prob = probs) * 100

  plot_mat <- do.call(rbind, list(placebo_mean, placebo_sd, placebo_quantiles))
  rownames(plot_mat) <- c("mean", "sd", paste("quantile", probs, sep = ""))
  plot_table <- t(plot_mat) %>%
    as.data.frame() %>%
    mutate(bandwidth = bandwidth_seq,
           real = real$main * 100) %>%
    select(bandwidth, real, everything())

  # close to mean placebo = 0.05%
  lower <- 0.04
  upper <- 0.06
  valid <- plot_table$bandwidth[plot_table$mean < upper & plot_table$mean > lower]

  # table 4
  d_table <- plot_table %>%
    filter(bandwidth %in% c(valid,
                            4000, 5000, 6000, 7000, 8000, 9000, 10000,
                            15000, 20000, 25000, 30000, 35000, 40000, 45000,
                            50000, 70000, 90000))
  knitr::kable(d_table, format = "latex", booktabs = T, linesep = "")

  # plot real cost and mean placebo cost - fig6
  fig6_plot <- ggplot(data = plot_table, aes(x = bandwidth)) +
    geom_line(aes(y = real, color = "0", linetype = "0")) +
    geom_line(aes(y = mean, color = "5", linetype = "5")) +
    scale_color_manual(values = get_color_palette(2, grayscale),
                       labels = c("real", "placebo"),
                       name = "") +
    scale_linetype_manual(values = c(linetype0, linetype1, linetype2),
                          labels = c("real", "placebo"),
                          name = "") +
    bmp_vline(xint = 25000) +
    xlab(TeX("\\textit{d}")) +
    ylab("Transport Cost (%)") +
    theme_bmp(sizefont = (fontsize - 8),
              axissizefont = (fontsizeaxis - 5)) +
    scale_x_continuous(breaks = seq(0, 100000, 10000))

  # simulation results only, no real cost (additional figure)
  fig6_plot2 <- ggplot(data = plot_table, aes(x = bandwidth)) +
    geom_line(aes(y = quantile0.99, color = "1", linetype = "1")) +
    geom_line(aes(y = quantile0.95, color = "2", linetype = "2")) +
    geom_line(aes(y = quantile0.9, color = "3", linetype = "3")) +
    geom_line(aes(y = mean, color = "5", linetype = "5")) +
    geom_line(aes(y = sd, color = "6", linetype = "6")) +
    scale_color_manual(values = get_color_palette(5, grayscale),
                       labels = c("99%ile", "95%ile", "90%ile", "mean", "sd"),
                       name = "") +
    scale_linetype_manual(values = c(linetype0, linetype1, linetype2, linetype3, linetype4),
                          labels = c("99%ile", "95%ile", "90%ile", "mean", "sd"),
                          name = "") +
    bmp_vline(xint = 25000) +
    xlab(TeX("\\textit{d}")) +
    ylab("Transport Cost (%)") +
    theme_bmp(sizefont = (fontsize - 8),
              axissizefont = (fontsizeaxis - 5),
              legend.position = c(0.8, 0.7)) +
    scale_y_continuous(breaks = seq(0, 1.6, 0.1)) +
    scale_x_continuous(breaks = seq(0, 100000, 10000))

  if (save_fig | save_fig6) {
    ggsave(paste("fig", fignum, suffix, "OK.jpg", sep = ""),
           path = img_path, fig6_plot,
           width = default_width, height = default_height, units = "in")
    message(paste("fig", fignum, " is saved in ", img_path, " as fig",
                  fignum, suffix, "OK.jpg", sep = ""))

    ggsave(paste("fig", fignum, suffix,"zoomed", suffix, "OK.jpg", sep = ""),
           path = img_path, fig6_plot2,
           width = default_width, height = default_height, units = "in")
    message(paste("fig", fignum, "_zoomed", " is saved in ", img_path, " as fig",
                  fignum, suffix, "OK.jpg", sep = ""))
  }

  message(paste("Figure ", fignum, " is complete.", sep = ""))


}

### Figure 7 ---------------------------

fignum <- 7
if (show_fig | show_fig7) {

  pre <- prep_data(Tianjin, prep = "dist",
                   lowerdate = "2010-01-01", upperdate = "2011-01-01")
  post <- prep_data(Tianjin, prep = "dist",
                    lowerdate = "2011-01-01", upperdate = "2012-01-01")

  fig7_plot <- ggplot() +
    bmp_twohist(data1 = pre,
                data2 = post,
                x = MSRP,
                scale = 1000,
                binwidth = 20) +
    bmp_plot(data = Tianjin,
             color = postBeijing,
             fill = postBeijing,
             legendlabels = c("pre-lottery", "post-lottery"),
             xtype = "continuous",
             xbreaks = seq(0, 1400, by = 100),
             ytype = "continuous",
             ybreaks = seq(0, 0.008, by = 0.002),
             xlab = "MSRP (RMB 1000)",
             ylab = "Density",
             sizefont = (fontsize - 8),
             axissizefont = (fontsizeaxis - 5))

  if (save_fig | save_fig7) {
    ggsave(paste("fig", fignum, suffix, "OK.jpg", sep = ""), path = img_path,
           width = default_width, height = default_height, units = "in")
    message(paste("fig", fignum, " is saved in ", img_path, " as fig", fignum, suffix, "OK.jpg", sep = ""))
  }

  message(paste("Figure ", fignum, " is complete.", sep = ""))
}


### Figure 8 ---------------------------

fignum <- "8"
if (show_fig | show_fig8) {

  set.seed(11 + seedplus)
  support <- prep_data(data = Beijing, prep = "support")

  preB <- prep_data(Beijing, prep = "pmf",
                 support = support,
                 lowerdate = "2010-01-01", upperdate = "2011-01-01")
  postB <- prep_data(Beijing, prep = "pmf",
                  support = support,
                  lowerdate = "2011-01-01", upperdate = "2012-01-01")
  preT <- prep_data(Tianjin, prep = "pmf",
                 support = support,
                 lowerdate = "2010-01-01", upperdate = "2011-01-01")
  postT <- prep_data(Tianjin, prep = "pmf",
                  support = support,
                  lowerdate = "2011-01-01", upperdate = "2012-01-01")
  bandwidth_seq <- seq(0, 100000, 1000)

  realB <- diftrans(preB, postB, bandwidth_seq = bandwidth_seq, conservative = F)
  realT <- diftrans(preT, postT, bandwidth_seq = bandwidth_seq, conservative = F)
  realB_2d <- diftrans(preB, postB, bandwidth_seq = bandwidth_seq, conservative = T)

  numsims <- 500
  placebo_resultsB <- matrix(NA, nrow = numsims, ncol = length(bandwidth_seq))
  placebo_resultsT <- matrix(NA, nrow = numsims, ncol = length(bandwidth_seq))
  for (i in seq_len(numsims)) {
    print(paste("Simulation Number", i, "out of", numsims, sep = " "))
    # draw n_B,2010 samples from Beijing, 2010
    synth_pre1B <- data.frame(MSRP = preB$MSRP,
                               count = rmultinom(1, sum(preB$count), preB$count))
    # draw n_B,2011 samples from Beijing, 2010
    synth_pre2B <- data.frame(MSRP = preB$MSRP,
                               count = rmultinom(1, sum(postB$count), preB$count))
    placeboB <- diftrans(synth_pre1B, synth_pre2B,
                         bandwidth_seq = bandwidth_seq, conservative = F)
    placebo_resultsB[i, ] <- placeboB$main

    # draw n_T,2010 samples from Tianjin, 2010
    synth_pre1T <- data.frame(MSRP = preT$MSRP,
                               count = rmultinom(1, sum(preT$count), preT$count))
    # draw n_T,2011 samples from Tianjin, 2010
    synth_pre2T <- data.frame(MSRP = preT$MSRP,
                               count = rmultinom(1, sum(postT$count), preT$count))
    placeboT <- diftrans(synth_pre1T, synth_pre2T,
                         bandwidth_seq = bandwidth_seq, conservative = F)
    placebo_resultsT[i, ] <- placeboT$main
  }

  placebo_meanB <- apply(placebo_resultsB, 2, mean) * 100
  placebo_sdB <- apply(placebo_resultsB, 2, sd) * 100
  probs <- c(0.90, 0.95, 0.99)
  placebo_quantilesB <- apply(placebo_resultsB, 2, quantile, prob = probs) * 100

  placebo_meanT <- apply(placebo_resultsT, 2, mean) * 100
  placebo_sdT <- apply(placebo_resultsT, 2, sd) * 100
  probs <- c(0.90, 0.95, 0.99)
  placebo_quantilesT <- apply(placebo_resultsT, 2, quantile, prob = probs) * 100

  plot_matB <- do.call(rbind, list(placebo_meanB, placebo_sdB, placebo_quantilesB))
  rownames(plot_matB) <- c("mean", "sd", paste("quantile", probs, sep = ""))
  plot_tableB <- t(plot_matB) %>%
    as.data.frame() %>%
    mutate(bandwidth = bandwidth_seq,
           real = realB$main * 100) %>%
    select(bandwidth, real, everything())

  plot_matT <- do.call(rbind, list(placebo_meanT, placebo_sdT, placebo_quantilesT))
  rownames(plot_matT) <- c("mean", "sd", paste("quantile", probs, sep = ""))
  plot_tableT <- t(plot_matT) %>%
    as.data.frame() %>%
    mutate(bandwidth = bandwidth_seq,
           real = realT$main * 100) %>%
    select(bandwidth, real, everything())


  # close to mean placebo = 0.05%
  lower <- 0.04
  upper <- 0.06
  validB <- plot_tableB$bandwidth[plot_tableB$mean < upper & plot_tableB$mean > lower]
  validT <- plot_tableT$bandwidth[plot_tableT$mean < upper & plot_tableT$mean > lower]

#~   # replicate table 4 -- Beijing
#~   d_tableB <- plot_tableB %>%
#~   filter(bandwidth %in% c(validB,
#~                           0,
#~                           4000, 5000, 6000, 7000, 8000, 9000, 10000,
#~                           15000, 20000, 25000, 30000, 35000, 40000, 45000,
#~                           50000, 70000, 90000))
#~   knitr::kable(d_tableB, format = "latex", booktabs = T, linesep = "")
#~ 
#~ 
#~   # replicate table 4 -- Tianjin
#~   d_tableT <- plot_tableT %>%
#~   filter(bandwidth %in% c(validT,
#~                           0,
#~                           4000, 5000, 6000, 7000, 8000, 9000, 10000,
#~                           15000, 20000, 25000, 30000, 35000, 40000, 45000,
#~                           50000, 70000, 90000))
#~   knitr::kable(d_tableT, format = "latex", booktabs = T, linesep = "")


  # average of absolute difference of placebos, with 2d --- table 5
  max_bw <- max(bandwidth_seq)
  half_max_bw <- which(bandwidth_seq == max_bw / 2)
  diff_2d <- matrix(NA_real_, nrow = half_max_bw, ncol = 6)
  for (i in seq_len(half_max_bw)) {
    bw <- bandwidth_seq[i]
    B_2d <- placebo_resultsB[, which(bandwidth_seq == 2 * bw)]
    T_d <- placebo_resultsT[, which(bandwidth_seq == bw)]
    diff <- abs(B_2d - T_d)
    diff_2d[i, ] <- c(bw,
                      mean(diff) * 100,
                      sd(diff) * 100,
                      quantile(diff, prob = probs) * 100)
  }

  knitr::kable(diff_2d[which(bandwidth_seq %in%
                             c(validB,
                               0,
                               4000, 5000, 6000, 7000, 8000, 9000, 10000,
                               15000, 20000, 25000, 30000, 35000, 40000, 45000,
                               50000)), ],
               format = "latex", booktabs = T, linesep = "")

  # empirical costs with 2d --- table 5
  valid_d <- c(0,
               4000, 5000, 6000, 7000, 8000, 9000, 10000,
               15000, 20000,
               23000, 24000,
               25000, 30000, 35000, 40000, 45000,
               50000)
  real_costs <- diftrans(pre_main = preB, post_main = postB,
                         pre_control = preT, post_control = postT, bandwidth_seq = valid_d,
                         conservative = T)

}

### Figure 9 ---------------------------

fignum <- 9
if (show_fig | show_fig9) {

  BTS <- do.call("rbind", list(Beijing, Tianjin, Shijiazhuang))
  supportB <- prep_data(BTS %>% filter(city == "Beijing"),
                        prep = "support")
  supportT <- prep_data(BTS %>% filter(city == "Tianjin"),
                        prep = "support")


  post_Bpmf <- prep_data(Beijing, prep = "pmf",
                         support = supportB,
                         lowerdate = "2015-01-01", upperdate = "2016-01-01")
  pre_Bpmf <- prep_data(Beijing, prep = "pmf",
                        support = supportB,
                        lowerdate = "2014-01-01", upperdate = "2015-01-01")
  post_Tpmf <- prep_data(Tianjin, prep = "pmf",
                         support = supportT,
                         lowerdate = "2015-01-01", upperdate = "2016-01-01")
  pre_Tpmf <- prep_data(Tianjin, prep = "pmf",
                        support = supportT,
                        lowerdate = "2014-01-01", upperdate = "2015-01-01")

  bandwidth_seq = seq(0, 40000, 1000)

  temp <- diftrans(pre_Bpmf, post_Bpmf, pre_Tpmf, post_Tpmf,
                      bandwidth_seq = bandwidth_seq,
                      conservative = F,
                      quietly = T)
  bandwidth_selection <- temp %>%
    pivot_longer(cols = c(main, control, diff),
                 names_to = "type",
                 values_to = "diffprop") %>%
    mutate(type = case_when(type == "main" ~ "Beijing",
                            type == "control" ~ "Tianjin",
                            type == "diff" ~ "a"))
  # bandwidth_selection %>% filter(type == "a") %>% mutate(percent_diffprop = 100*diffprop) %>% View()

  fig9_plot <- ggplot(data = bandwidth_selection,
                       aes(x = bandwidth)) +
    geom_line(aes(y = diffprop*100, color = type, linetype = type)) +
    bmp_vline(xint = 2000) +
    bmp_plot(data = bandwidth_selection,
             color = type,
             legendlabels = c("Difference", "Beijing placebo", "Tianjin placebo"),
             xlab = TeX("\\textit{d}"),
             ylab = "Transport Cost (%)",
             ytype = "continuous",
             ybreaks = seq(-50, 100, 10),
             xtype = "continuous",
             xbreaks = seq(0, 40000, 5000),
             sizefont = (fontsize - 8),
             axissizefont = (fontsizeaxis - 5)) +
    scale_linetype_manual(values = c(linetype0, linetype1, linetype2),
                          labels = c("Difference", "Beijing placebo", "Tianjin placebo"),
                          name = "")

  if (save_fig | save_fig9) {
    ggsave(paste("fig", fignum, suffix, "OK.jpg", sep = ""), path = img_path,
           width = default_width, height = default_height, units = "in")
    message(paste("fig", fignum, " is saved in ", img_path, " as fig", fignum, suffix, "OK.jpg", sep = ""))
  }

  message(paste("Figure ", fignum, " is complete.", sep = ""))
}

### Figure 10 ---------------------------

fignum <- 10
if (show_fig | show_fig10) {

  # BTS <- do.call("rbind", list(Beijing, Tianjin, Shijiazhuang))
  supportB <- prep_data(Beijing,
                        prep = "support")
  supportT <- prep_data(Tianjin,
                        prep = "support")
  post_Bpmf <- prep_data(Beijing, prep = "pmf",
                         support = supportB,
                         lowerdate = "2011-01-01", upperdate = "2012-01-01")
  pre_Bpmf <- prep_data(Beijing, prep = "pmf",
                        support = supportB,
                        lowerdate = "2010-01-01", upperdate = "2011-01-01")
  post_Tpmf <- prep_data(Tianjin, prep = "pmf",
                         support = supportT,
                         lowerdate = "2011-01-01", upperdate = "2012-01-01")
  pre_Tpmf <- prep_data(Tianjin, prep = "pmf",
                        support = supportT,
                        lowerdate = "2010-01-01", upperdate = "2011-01-01")

  bandwidth_seq = seq(0, 40000, 1000)

  cons_dit <- diftrans(pre_Bpmf, post_Bpmf, pre_Tpmf, post_Tpmf,
                          bandwidth_seq = bandwidth_seq,
                          conservative = T,
                          save_dit = T)
  dit <- diftrans(pre_Bpmf, post_Bpmf, pre_Tpmf, post_Tpmf,
                     bandwidth_seq = bandwidth_seq,
                     conservative = F,
                     save_dit = T)

  bandwidth_selection <- cons_dit$out %>%
    select(-main2d)  %>%
    pivot_longer(cols = c(main,
                          control,
                          diff,
                          diff2d),
                 names_to = "type",
                 values_to = "diffprop") %>%
    mutate(type = case_when(type == "main" ~ "Beijing",
                            type == "control" ~ "Tianjin",
                            type == "diff" ~ "a",
                            type == "diff2d" ~ "b"))

  mostinform <- dit$optimal_bandwidth
  mostinform2 <- cons_dit$optimal_bandwidth

  fig10_plot <- ggplot(data = bandwidth_selection %>%
                         filter(type != "a"), # control d-d or 2d-d
                       aes(x = bandwidth,
                           linetype = type)) +
    bmp_vline(xint = mostinform) +
    bmp_vline(xint = mostinform2) +
    geom_line(aes(y = diffprop*100, color = type)) +
    bmp_plot(data = bandwidth_selection %>%
               filter(type != "a"),
             color = type,
             legendlabels = c("Difference", "Beijing", "Tianjin"),
             xlab = TeX("\\textit{d}"),
             ylab = "Transport Cost (%)",
             ytype = "continuous",
             ybreaks = seq(-50, 100, 10),
             xtype = "continuous",
             xbreaks = seq(0, 40000, 5000),
             sizefont = (fontsize - 8),
             axissizefont = (fontsizeaxis - 5)) +
    scale_linetype_manual(values = c(linetype0, linetype1, linetype2),
                          labels = c("Difference", "Beijing", "Tianjin"),
                          name = "")

  if (save_fig | save_fig10) {
    ggsave(paste("fig", fignum, suffix, "OK.jpg", sep = ""), path = img_path,
           width = default_width, height = default_height, units = "in")
    message(paste("fig", fignum, " is saved in ", img_path, " as fig", fignum, suffix, "OK.jpg", sep = ""))
  }

  message(paste("Figure ", fignum, " is complete.", sep = ""))
}


### Miscellaneous ---------------------------

# Original Figure 3 (Comparing prices of car models between 2010 and 2011)
fig3_prep <- Beijing %>%
  filter(ym < as.Date("2012-01-01")) %>%
  pivot_wider(id_cols = c(month, color, noticenum),
                     names_from = year,
                     values_from = MSRP) %>%
  filter(!(is.na(`2010`) | is.na(`2011`))) %>%
  mutate(diff = `2011` - `2010`)
nrow(fig3_prep)
summary(fig3_prep$diff)

# Diff-in-diff
if (show_diff_in_diff) {
  did_BT <- do.call("rbind", list(Beijing, Tianjin)) %>%
    filter(ym < "2012-01-01") %>%
    filter(city == "Beijing" | city == "Tianjin") %>%
    mutate(Beijing = ifelse(city == "Beijing", 1, 0)) %>%
    group_by(Beijing, postBeijing, MSRP) %>%
    summarise(sum = sum(sales)) %>%
    uncount(sum)
  did_BT_reg <- lm(log(MSRP) ~ postBeijing + Beijing + postBeijing*Beijing, data = did_BT)

  did_BS <- do.call("rbind", list(Beijing, Shijiazhuang)) %>%
    filter(ym < "2012-01-01") %>%
    filter(city == "Beijing" | city == "Shijiazhuang") %>%
    mutate(Beijing = ifelse(city == "Beijing", 1, 0)) %>%
    group_by(Beijing, postBeijing, MSRP) %>%
    summarise(sum = sum(sales)) %>%
    uncount(sum)
  did_BS_reg <- lm(log(MSRP) ~ postBeijing + Beijing + postBeijing*Beijing, data = did_BS)

  did_BST <- do.call("rbind", list(Beijing, Tianjin, Shijiazhuang)) %>%
    filter(ym < "2012-01-01") %>%
    filter(city == "Beijing" | city == "Shijiazhuang" | city == "Tianjin") %>%
    mutate(Beijing = ifelse(city == "Beijing", 1, 0)) %>%
    group_by(Beijing, postBeijing, MSRP) %>%
    summarise(sum = sum(sales)) %>%
    uncount(sum)
  did_BST_reg <- lm(log(MSRP) ~ postBeijing + Beijing + postBeijing*Beijing, data = did_BST)

  stargazer(did_BT_reg, did_BS_reg, did_BST_reg,
            label = "tab:eight",
            type = "latex",
            order = c(2, 1, 3, 4),
            covariate.labels = c("Beijing", "post", "Beijing $\\times$ post", "Constant"),
            keep.stat = c("rsq","n"),
            align = T,
            p.auto = FALSE,
            # se = list(summary(did_BT_reg)$coefficients[,"t value"],
            #           summary(did_BS_reg)$coefficients[,"t value"],
            #           summary(did_BST_reg)$coefficients[,"t value"]),
            # notes = c("t-statistics are in parentheses.")
            # single.row = TRUE,
            column.labels = c("Tianjin", "Shijiazhuang", "Both"),
            model.numbers = FALSE,
            dep.var.caption = "",
            dep.var.labels.include = FALSE,
            report = "vcs"
  )
}
omkarakatta/diftrans documentation built on Feb. 24, 2023, 9:06 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
omkarakatta/diftrans
Compute the Differences-in-Transports Estimator

data-raw/reproduce_results.R
In omkarakatta/diftrans: Compute the Differences-in-Transports Estimator

R Package Documentation

Browse R Packages

We want your feedback!

omkarakatta/diftrans Compute the Differences-in-Transports Estimator

data-raw/reproduce_results.R In omkarakatta/diftrans: Compute the Differences-in-Transports Estimator

R Package Documentation

Browse R Packages

We want your feedback!

omkarakatta/diftrans
Compute the Differences-in-Transports Estimator

data-raw/reproduce_results.R
In omkarakatta/diftrans: Compute the Differences-in-Transports Estimator