R/arima-spikes-onegeo.R

Defines functions get_rawcounts arima_plot arima_ciplot line_plot run_arima

Documented in arima_ciplot arima_plot get_rawcounts line_plot run_arima

#' Run this first. It applies the ARIMA to predict future values and then produces a dataset with the actual and fitted values.
#'
#' @param df A dataframe including time as \code{timestamp} and searches for your given geography in one column.
#' @param interrupt The date where things change. ARIMA will be predicted on all days before the interrupt.
#' @param geo The column name of the geography of searches that you want.
#' @param polycolor What color do you want the polygon to be?.
#' @keywords
#' @export
#' @examples
#' run_arima(df = data, interrupt = ymd("2019-12-19"), geo = "US")

run_arima <- function(
  df,
  interrupt,
  begin = T,
  end = T,
  geo = "US",
  kalman = F,
  bootstrap = F,
  bootnum = 1000,
  linear = F,
  rsv = F,
  periods = NA,
  maxK = 5,
  first_differencing = NA,
  seasonal_differencing = NA
  ){


  # For data periods larger than 1 day, Google gives data with the date equal to
  # the first date in each period. This doesn't look good in plots. So we need to
  # make it so that the dates in the dataset are the last date in each period. To
  # complicate things, Google will report preliminary data for the most recent
  # period even if that period is not complete.

  # First we figure out how many dates are in each period
  df$timestamp <- ymd(df$timestamp)
  freq <- min(as.numeric(diff.Date(df$timestamp)), na.rm = T)

  # If the end of the last period hasn't even occurred yet, we remove it from the dataset
  maxdate <- max(df$timestamp, na.rm = T)
  if(Sys.Date() < maxdate + freq) df <- df %>% filter(timestamp != maxdate)

  # Finally, we move the dates to be the end of the period. Note that if it
  # is daily data, freq is 1 and so the dates do not actually move.
  df$timestamp <- ymd(df$timestamp) + freq - 1



  # We take only the data that we need. This is the timestamp and the one
  # geography.
  tmpdf <- df[, c("timestamp", geo)]
  names(tmpdf) <- c("timestamp", "geo")

  # We need timestamp to be a date object.
  tmpdf$timestamp <- ymd(tmpdf$timestamp)

  # This allows us to use T as a default for begin and after.
  # If either is T, it just takes the beginning/end of the data
  if(begin == T) begin <- min(ymd(tmpdf$timestamp))
  if(end == T) end <- max(ymd(tmpdf$timestamp))


  # We need to select a begin and end point that is a datapoint. Therefore, we
  # use the closest_date function to select something before or equal to begin...
  begin <- closest_date(data = tmpdf, date = begin, type = "beforeequal")
  # ... and after or equal to end
  end <- closest_date(data = tmpdf, date = end, type = "afterequal")


  # There are two possibilities. Either interupt occurs on a date that we have
  # data for or it occurs between two data points.

  # Case 1: If the interrupt date is actually in the data frame, then this is fairly simple.
  # This keeps interrupt as the same date, because it is equal. The interrupt line
  # looks best on the date immediately before the interruption -- this way it is
  # looks like the fitted line diverges from the actual line right at the interruption
  # line.

  # Case 2: If our interruption is between two dates in the dataset, it's a bit more
  # complicated. We assume that the period including your interruption should
  # be part of the "post-period". Google data is formatted such that the date
  # corresponds to the last date in the period. Therefore, the date after your
  # interruption corresponds to the week that your interruption happened.
  # We use this as the interruption. We still need the line to come at the date
  # before the interruption, so we use "before" to set our interruption.

  # All together, we correct the interrupt to be either the same as it was
  # (case 1) or the next date (case 2). From there, we use the date before then
  # for the interuption line.

  interrupt <- closest_date(data = df, date = interrupt, type = "afterequal")
  interrupt_line <- closest_date(data = df, date = interrupt, type = "before")


  # We want to ensure that all of these are date objectss
  begin <- ymd(begin)
  end <- ymd(end)
  interrupt <- ymd(interrupt)


  # Now we limit the data to only the data between the begin and end period.
  tmpdf <- tmpdf %>% filter(timestamp %within% interval(begin, end))


  # This is an option to change query fractions from searches per 10M to
  # Relative Search Values (RSV), i.e., everything is relative to the max value,
  # which is set to 100.

  if(rsv){
    rsvmaxval <- max(tmpdf$geo, na.rm = T)
    tmpdf$geo <- tmpdf$geo / rsvmaxval * 100
  }


  # We need to know what kind of data this is. We infer with diff.date.
  # If the minimum difference is 1 day, its daily. If it's 7, it's weekly. Etc.
  freq <- min(as.numeric(diff.Date(tmpdf$timestamp)), na.rm = T)


  if(!linear){

    period_values <- 365.25/freq

    # Here we put the searches into time series objects.
    # time_series is the entire timeline
    time_series <- ts(tmpdf$geo, freq = period_values, start = decimal_date(begin))

    # ts_training is just the pre-period
    ts_training <- ts(tmpdf %>% filter(timestamp < interrupt) %>% pull(geo), freq = period_values, start = decimal_date(begin))

    # ts_test is just the post-period
    ts_test <- ts(tmpdf %>% filter(timestamp >= interrupt) %>% pull(geo), freq = period_values, start = decimal_date(interrupt))

    if(all(is.na(periods))){

      # na_kalman lets us impute missing data in the time series objects if the
      # kalman object is set to T
      if(kalman){
        time_series <- na_kalman(time_series, model="auto.arima")
        ts_training <- na_kalman(ts_training, model="auto.arima")
        ts_test <- na_kalman(ts_test, model="auto.arima")
        tmpdf$geo <- as.numeric(time_series)
      }

      mod <- auto.arima(ts_training, approximation  = F, d = first_differencing, D = seasonal_differencing, trace = T)

      if(bootstrap){
        set.seed(1234)
        # h (horizon) should be equal to the length of the after-period
        # bootnum is the number of paths it's allowed to take
        fitted_values <- forecast(mod, h = length(ts_test), bootstrap = TRUE, npaths = bootnum)
      } else{
        fitted_values <- forecast(mod, h = length(ts_test))
      }


    } else {


      # periods is an argument to use multi-season time series and a Fourier model
      # in your ARIMA

      # Create an empty vector
      period_values <- c()
      # If you set periods to "all", it will make it year, month, and week
      if("all" %in% periods) periods <- c("year", "month", "week")


      # This is how many observations you have per...

      #year
      yearval <- 365.25/freq
      #month
      monthval <- (365.25/12)/freq
      # week
      weekval <- 7/freq

      # If year, month, or week is one of your periods it will make sure you have enough data
      # and then add the appropriate value to the empty vector
      if("year" %in% periods & length(ts_training) > yearval * 2) period_values <- c(yearval, period_values)
      if("month" %in% periods & length(ts_training) > monthval * 2) period_values <- c(monthval, period_values)
      if("week" %in% periods & length(ts_training) > weekval * 2) period_values <- c(weekval, period_values)

      # We can only use periods that occur over multiple observations. Periods of each intervention
      # makes no sense.
      period_values <- period_values[period_values > 1]
      # assert("You used your own periods, but none of your periods are valid.", {length(period_values) > 0})
      stopifnot(length(period_values) > 0)

      # Here we put the searches into multi-season time series objects.
      # time_series is the entire timeline
      time_series <- msts(tmpdf$geo, period_values, start = decimal_date(begin))

      # ts_training is just the pre-period
      ts_training <- msts(tmpdf %>% filter(timestamp < interrupt) %>% pull(geo), period_values, start = decimal_date(begin))

      # ts_test is just the post-period
      ts_test <- msts(tmpdf %>% filter(timestamp >= interrupt) %>% pull(geo), period_values, start = decimal_date(interrupt))


      # na_kalman lets us impute missing data in the time series objects if the
      # kalman object is set to T
      if(kalman){
        time_series <- na_kalman(time_series, model="auto.arima")
        ts_training <- na_kalman(ts_training, model="auto.arima")
        ts_test <- na_kalman(ts_test, model="auto.arima")
        tmpdf$geo <- as.numeric(time_series)
      }

      # This is the order of the Fourier. It needs to be less than 1/2 the period.
      K <- floor(c(period_values / 2) - 1e-5)
      # The higher the K, the longer it takes to process. Therefore, we have a maxK
      # option so you can control the maximum order for your Fourier
      K <- sapply(K, function(x) min(c(x, maxK)))

      # Give you some info and remind you that multi-seasonality takes a long time.
      # We don't use multi-seasons for multigeo and multiterms bc it would take too long.
      print(sprintf("[%s] Using multiple periods in the ARIMA... this may take a while", Sys.time()))
      print(sprintf("[%s] Max K value is %s (larger maxK -> longer processing time)", Sys.time(), maxK))
      print(sprintf("[%s] Periods: %s", Sys.time(), paste(period_values, collapse = ", ")))
      print(sprintf("[%s] Maximum orders of Fourier: %s", Sys.time(), paste(K, collapse = ", ")))
      print(sprintf("[%s] Estimating model...", Sys.time()))
      flush.console()


      # We set the model using a Fourier series as an explanatory variable
      mod <- auto.arima(ts_training, approximation  = F, seasonal = F, xreg = fourier(ts_training, K=K, length(ts_training)))

      print(sprintf("[%s] Model Estimation Complete! Calculating forecasts...", Sys.time()))
      flush.console()

      if(bootstrap){
        set.seed(1234)
        # h (horizon) should be equal to the length of the after-period
        # bootnum is the number of paths it's allowed to take
        # fitted_values <- forecast(mod, h = length(ts_test), bootstrap = TRUE, npaths = bootnum)

        # We use the Fourier in the forecast, as well.
        fitted_values <- forecast(mod, bootstrap = T, npaths = 1000, h = length(ts_test), xreg = fourier(ts_test, K=K, length(ts_test)))

      } else{

        # Same as above but not bootstrapped
        fitted_values <- forecast(mod, h = length(ts_test), xreg = fourier(ts_test, K=K, length(ts_test)))
      }

      print(sprintf("[%s] Forecasts Estimation Complete!", Sys.time()))
      flush.console()




    }
  }


  # If linear is True, we have to create that model.
  if(linear){


    # First, we create a dataframe of train data and fit the model.
    x_train <- 1:length(ts_training)
    y_train <- as.numeric(ts_training)
    train <- data.frame("x" = x_train, "y" = y_train)
    mod <- lm(y ~ x, data = train)

    # Next, we create a data frame for the test data.
    x_test <- (length(ts_training) + 1):(length(ts_training) + length(ts_test))
    ft <- data.frame("x" = x_test, "y" = NA)


    if(bootstrap){

      # If bootstrap, we use boot_predict to forecast the values for the test data
      # based upon the model fit on the train data
      conf95 <- boot_predict(mod, newdata = ft, R=1000, condense = F)
      conf95 <- data.frame(conf95)
      names(conf95)[3:5] <- c("fit", "lwr", "upr")
      fitted_values <- data.frame("mean" = conf95$fit, "lower" = conf95$lwr, "upper" = conf95$upr)

    } else{

      # If not bootstrap, we use predict.lm to forecast the values for the test data
      # based upon the model fit on the train data.
      conf95 <- predict(mod, newdata = ft, interval = "confidence", level = 0.95)
      conf95 <- data.frame(conf95)
      fitted_values <- data.frame("mean" = conf95$fit, "lower" = conf95$lwr, "upper" = conf95$upr)
    }



  }


  # Now we need to add the fitted data to the data.frame.
  # tmp2 is a data frame of forecasts on the test data
  tmp2 <- data.frame(fitted=fitted_values$mean, lo=fitted_values$lower, hi=fitted_values$upper)

  # tmp1 is just an empty data.frame for the train data
  tmp1 <- data.frame(matrix(NA, nrow=nrow(tmpdf) - length(fitted_values$mean), ncol=ncol(tmp2)))
  names(tmp1) <- names(tmp2)

  # By binding them together, we get a data.frame with the same number of rows
  # as the tmpdf
  df_to_cbind <- rbind.data.frame(tmp1, tmp2)


  # Depending upon which fitted values we use, the names can come out strange. This
  # fixes that so we at least have a `lo95` and `hi95` column.
  names(df_to_cbind) <- gsub("[.]", "", names(df_to_cbind))
  names(df_to_cbind) <- gsub("^lo$", "lo95", names(df_to_cbind))
  names(df_to_cbind) <- gsub("^hi$", "hi95", names(df_to_cbind))


  # finaldf combines the actual data with the fitted values
  finaldf <- cbind.data.frame(tmpdf, df_to_cbind)

  # For convenience, we set "fitted" to be the same as "actual" for all rows that
  # were originally in the train data.
  finaldf$fitted <- ifelse(is.na(finaldf$fitted), finaldf$geo, finaldf$fitted)

  # We rename "geo" to the actual name for the geography
  names(finaldf) <- gsub("geo", geo, names(finaldf))

  return(finaldf)
}


#' line_plot: Creates a simple line plot of searches over time
#'
#' @param df The dataframe as outputted by \code{run_arima}.
#' @param geo The column name of the geography of searches that you want.
#' @param beginplot The date you want the plot to start. Default is beginning of the time series.
#' @param endplot The date you want the plot to end. Default is end of the time series.
#' @param interrupt The date of the interruption (should be the same as \code{run_arima})
#' @param linelabel Label next to the vertical interruption line
#' @param title The title of the figure. The default is no title.
#' @param xlab The label for the x axis
#' @param ylab The label for the y axis
#' @param lbreak The distance between tick marks in the x axis, i.e., "one year" or "3 month"
#' @param lwd Line width
#' @param width Width of the plot in inches
#' @param height Height of the plot in inches
#' @param save Default is True. If False, the plot is not saved.
#' @param outfn Where to save the plot.
#' @keywords
#' @export
#' @examples
#' line_plot(
#'   df,
#'   beginplot = T,
#'   endplot = T,
#'   interrupt = ymd("2019-12-19")
#'   linelabel = "Interruption",
#'   title = NULL,
#'   xlab = "Date",
#'   ylab = "Query Fraction (Per 10 Million Searches)",
#'   lbreak = "1 year",
#'   lwd = 0.3,
#'   width = 6,
#'   height = 3,
#'   save = T,
#'   outfn = './output/panA.png'
#' )


line_plot <- function(
  df,
  geo = 'US',
  beginplot = T,
  endplot = T,
  interrupt,
  linelabel = "Interruption",
  linelabelpos = 0.02,
  title = NULL,
  xlab = "Date",
  ylab = "Query Fraction\n(Per 10 Million Searches)",
  lbreak = "1 year",
  ylim = NULL,
  hicol = NA,
  locol = NA,
  nucol = NA,
  opcol = NA,
  colorscheme = "red",
  xfmt = date_format("%b %Y"),
  lwd = 0.3,
  extend = F,
  width = 6,
  height = 3,
  save = T,
  outfn
  ){


  # Set a colorscheme
  colorschemer(colorscheme)

  # This allows us to use T as a default for begin and after.
  # If either is T, it just takes the beginning/end of the data
  if(beginplot == T) beginplot <- min(ymd(df$timestamp), na.rm = T)
  if(endplot == T) endplot <- max(ymd(df$timestamp), na.rm = T)


  # If extend is true, you make the plot fit around existing data points.
  if(!extend){
    # We need to select a begin and end point that is a datapoint. Therefore, we
    # use the closest_date function to select something before or equal to begin...
    # ... and after or equal to end
    beginplot <- closest_date(data = df, date = beginplot, type = "beforeequal")
    # This gives us an option where
    endplot <- closest_date(data = df, date = endplot, type = "afterequal")
  }


  # There are two possibilities. Either interupt occurs on a date that we have
  # data for or it occurs between two data points.

  # Case 1: If the interrupt date is actually in the data frame, then this is fairly simple.
  # This keeps interrupt as the same date, because it is equal. The interrupt line
  # looks best on the date immediately before the interruption -- this way it is
  # looks like the fitted line diverges from the actual line right at the interruption
  # line.

  # Case 2: If our interruption is between two dates in the dataset, it's a bit more
  # complicated. We assume that the period including your interruption should
  # be part of the "post-period". Google data is formatted such that the date
  # corresponds to the last date in the period. Therefore, the date after your
  # interruption corresponds to the week that your interruption happened.
  # We use this as the interruption. We still need the line to come at the date
  # before the interruption, so we use "before" to set our interruption.

  # All together, we correct the interrupt to be either the same as it was
  # (case 1) or the next date (case 2). From there, we use the date before then
  # for the interuption line.

  interrupt <- closest_date(data = df, date = interrupt, type = "afterequal")
  interrupt_line <- closest_date(data = df, date = interrupt, type = "before")


  # We need to know what kind of data this is. We infer with diff.date.
  # If the minimum difference is 1 day, its daily. If it's 7, it's weekly. Etc.
  freq <- min(as.numeric(diff.Date(df$timestamp)), na.rm = T)


  # We want to ensure that all of these are date objectss
  beginplot <- ymd(beginplot)
  endplot <- ymd(endplot)
  interrupt <- ymd(interrupt)



  # Change the name of the geography to something that's easier to use
  names(df) <- gsub(geo, "geo", names(df))


  # freq <- min(as.numeric(diff.Date(df$timestamp)), na.rm = T)
  # interrupt <- ymd(interrupt)
  # if(freq == 1){
  #   interrupt_line <- interrupt -1
  # } else{
  #   interrupt_line <- interrupt
  # }
  #
  #
  # if(beginplot==T) beginplot <- ymd(min(ymd(df$timestamp), na.rm = T))
  # if(endplot==T) endplot <- ymd(max(ymd(df$timestamp), na.rm = T))
  #
  #
  # interrupt <- closest_date(data = df, date = interrupt, type = "beforeequal")





  # We just want to set the maximum time and search value occuring after the interruption.
  # We use these to place a point right at the peak.
  maxval <- df %>% filter(timestamp >= interrupt) %>% filter(geo == max(geo, na.rm = T)) %>% pull(geo)
  maxtime <- df %>% filter(timestamp >= interrupt) %>% filter(geo == max(geo, na.rm = T)) %>% pull(timestamp)



  # Here we create the figure.
  p <- ggplot(df)

  # The first step is putting in the vertical line and vertical line label in the right spot.
  # Here, taking the difference between endplot and beginplot standardizes the x axis and linelabelpos
  # can be used to change how close or far away the label is.
  p <- p + annotate("text", x = interrupt_line - as.numeric(as.numeric(endplot - beginplot) * linelabelpos), y = maxval*0.98, label = linelabel, hjust=1, vjust = 1)
  p <- p + geom_vline(xintercept=interrupt_line, linetype="dashed", color="grey74")

  # Next, we draw the line of actual values
  p <- p + geom_line(aes(x=timestamp, y=geo, group=1), color=hicol, linetype="solid", size=lwd)

  # We put a point at the peak
  p <- p + geom_point(aes(x = maxtime, y = maxval), size=2, color=opcol)


  # We pass options along to the axes
  p <- p + scale_y_continuous(limits = ylim)
  p <- p + scale_x_date(date_breaks = lbreak,
                   labels=xfmt,
                   limits = ymd(c(beginplot, endplot)))
  p <- p + labs(
    title= title,
    x = xlab,
    y = ylab
  )

  # And use the default theme
  p <- p + theme_classic()


  # If save, we save the figure
  if(save) ggsave(p, width=width, height=height, dpi=300, filename=outfn)


  # We change the name of the geography back in the dataframe.
  names(df) <- gsub("geo", geo, names(df))


  # Return the ggplot
  return(p)

}


#' arima_ciplot This uses the output from \code{run_arima} to create a figure showing the
#' confidence interval of the ARIMA
#'
#' @param df The dataframe as outputted by \code{run_arima}.
#' @param geo The column name of the geography of searches that you want.
#' @param beginplot The date you want the plot to start
#' @param endplot The date you want the plot to end.
#' @param interrupt The date of the interruption (should be the same as \code{run_arima})
#' @param linelabel Label next to the vertical interruption line
#' @param title The title of the figure. The default is no title.
#' @param xlab The label for the x axis
#' @param ylab The label for the y axis
#' @param ylim length-2 vector with ymin and ymax, Default NULL
#' @param lbreak The distance between tick marks in the x axis, i.e., "one year" or "3 month"
#' @param lwd Line width
#' @param width Width of the plot in inches
#' @param height Height of the plot in inches
#' @param save Default is True. If False, the plot is not saved.
#' @param outfn Where to save the plot.
#' @keywords
#' @export
#' @examples
#' arima_plot(
#'            df,
#'            title = "Searches to Purchase Cigarettes - US",
#'            xlab = "Date",
#'            ylab = "Query Fraction",
#'            outfn = './output/fig.pdf',
#'            beginplot = "2019-09-01",
#'            endplot = "2020-01-15",
#'            lbreak = "1 year",
#'            linelabel = "Tobacco 21 Signed",
#'            interrupt = ymd("2019-12-19"),
#'            width = 6,
#'            height = 3,
#'            lwd = 0.3,
#'            save = T
#'            )
arima_ciplot <- function(
  df,
  geo = 'US',
  title = NULL,
  xlab = "Date",
  xfmt = date_format("%b %Y"),
  ylab = "Greater Than Expected (%)",
  ylim = NULL,
  outfn = './output/fig.pdf',
  vlinelwd = 1,
  vlinecol = "grey74",
  beginplot = T,
  endplot = T,
  hline = T,
  vline = T,
  lbreak = "1 month",
  hicol = NA,
  locol = NA,
  nucol = NA,
  opcol = NA,
  colorscheme = "red",
  polyalpha = 0.9,
  interrupt,
  width = 6,
  height = 3,
  lwd = 0.3,
  save = T,
  extend = F
  ){

  # Set a colorscheme
  colorschemer(colorscheme)

  # This allows us to use T as a default for begin and after.
  # If either is T, it just takes the beginning/end of the data
  if(beginplot == T) begin <- min(ymd(df$timestamp))
  if(endplot == T) end <- max(ymd(df$timestamp))

  # If extend is true, you make the plot fit around existing data points.
  if(!extend){
    # We need to select a begin and end point that is a datapoint. Therefore, we
    # use the closest_date function to select something before or equal to begin...
    # ... and after or equal to end
    beginplot <- closest_date(data = df, date = beginplot, type = "beforeequal")
    # This gives us an option where
    endplot <- closest_date(data = df, date = endplot, type = "afterequal")
  }


  # There are two possibilities. Either interupt occurs on a date that we have
  # data for or it occurs between two data points.

  # Case 1: If the interrupt date is actually in the data frame, then this is fairly simple.
  # This keeps interrupt as the same date, because it is equal. The interrupt line
  # looks best on the date immediately before the interruption -- this way it is
  # looks like the fitted line diverges from the actual line right at the interruption
  # line.

  # Case 2: If our interruption is between two dates in the dataset, it's a bit more
  # complicated. We assume that the period including your interruption should
  # be part of the "post-period". Google data is formatted such that the date
  # corresponds to the last date in the period. Therefore, the date after your
  # interruption corresponds to the week that your interruption happened.
  # We use this as the interruption. We still need the line to come at the date
  # before the interruption, so we use "before" to set our interruption.

  # All together, we correct the interrupt to be either the same as it was
  # (case 1) or the next date (case 2). From there, we use the date before then
  # for the interuption line.

  interrupt <- closest_date(data = df, date = interrupt, type = "afterequal")
  interrupt_line <- closest_date(data = df, date = interrupt, type = "before")


  # We need to know what kind of data this is. We infer with diff.date.
  # If the minimum difference is 1 day, its daily. If it's 7, it's weekly. Etc.
  freq <- min(as.numeric(diff.Date(df$timestamp)), na.rm = T)


  # We want to ensure that all of these are date objectss
  beginplot <- ymd(beginplot)
  endplot <- ymd(endplot)
  interrupt <- ymd(interrupt)


  # Change the name of the geography to something that's easier to use
  names(df) <- gsub(geo, "geo", names(df))


  # freq <- min(as.numeric(diff.Date(df$timestamp)), na.rm = T)
  # interrupt <- ymd(interrupt)
  # # interrupt_line <- interrupt
  #
  # if(beginplot==T) beginplot <- ymd(interrupt)
  # if(endplot==T) endplot <- ymd(max(ymd(df$timestamp), na.rm = T))
  #
  # names(df) <- gsub(geo, "geo", names(df))
  #
  # maxval <- df %>% filter(timestamp >= interrupt) %>% filter(geo == max(geo, na.rm = T)) %>% pull(geo)
  # maxtime <- df %>% filter(timestamp >= interrupt) %>% filter(geo == max(geo, na.rm = T)) %>% pull(timestamp)
  #
  #
  # if(!extend){
  #   beginplot <- closest_date(data = df, date = beginplot, type = "beforeequal")
  #   endplot <- closest_date(data = df, date = endplot, type = "afterequal")
  # }
  # interrupt <- closest_date(data = df, date = interrupt, type = "afterequal")
  # interrupt_line <- closest_date(data = df, date = interrupt, type = "before")
  # # interrupt <- closest_date(data = df, date = interrupt, type = "before")
  #
  # beginplot <- ymd(beginplot)
  # endplot <- ymd(endplot)
  # interrupt <- ymd(interrupt)
  #
  # if(!extend){
  #   beginplot <- closest_date(data = df, date = beginplot, type = "beforeequal")
  #   endplot <- closest_date(data = df, date = endplot, type = "afterequal")
  # }
  #
  # beginplot <- ymd(beginplot)
  # endplot <- ymd(endplot)
  # interrupt <- ymd(interrupt)



  # The CI_plot focuses on the period after the interruption, so from that data...
  tmp <- with(df %>% filter(timestamp %within% interval(interrupt_line, endplot)),
                # we form a dataset with the pctdiff, CI, and color of the polygon
                data.frame(
                  "timestamp" = timestamp,
                  "pctdiff" = geo/fitted - 1,
                  "lo95" = geo/hi95 - 1,
                  "hi95" = geo/lo95 - 1,
                  "polycolor" = nucol
                ))


  # Notice that we include data from the interrupt-line (the data period before the interruption).
  # This is because we don't want to start the polygon when there is a difference between
  # fitted and actual. We want the polygon to fill the diverging lines.


  # Of course, the interrupt_line data point is actually part of the pre-period
  # so we place the confidence interval at that point very close to the actual point.
  mintime <- min(ymd(tmp$timestamp), na.rm = T)
  tmp$lo95 <- ifelse(ymd(tmp$timestamp) == mintime, tmp$pctdiff*0.99, tmp$lo95)
  tmp$hi95 <- ifelse(ymd(tmp$timestamp) == mintime, tmp$pctdiff*1.01, tmp$hi95)

  ## CREATE PLOT

  # We create a "circular" data frame for the polygon
  poly <- with(tmp,
              data.frame(x = c(timestamp, rev(timestamp)), y = c(lo95, rev(hi95)), polycolor=nucol))


  # We start the plot
  p <- ggplot(tmp)

  # We create the polygon giving the CI for the pctdiff
  p <- p + geom_polygon(data = poly, aes(x = x, y = y, fill=nucol), fill=nucol, alpha=polyalpha)

  # On top of that, we plot the percent difference
  p <- p + geom_line(aes(x=timestamp, y=pctdiff, group=1, color=hicol), color=hicol, linetype="solid", size=lwd)


  # If vline or hline, we put a vertical line at the interruption_line or a horizontal
  # line at 0
  if(vline){
    p <- p + geom_vline(xintercept=interrupt_line, linetype="dashed", color=vlinecol, lwd = vlinelwd)
  }
  if(hline){
    p <- p + geom_hline(yintercept=0, linetype="dashed", color=vlinecol, lwd = vlinelwd)
  }

  # We pass arguments onto the axes
  p <- p + scale_x_date(date_breaks = lbreak,
                   labels=xfmt,
                   limits = as.Date(c(beginplot, endplot)))
  p <- p + scale_y_continuous(
   limits = ylim,
   labels = function(x) paste0(x*100, "%")
  )
  p <- p + labs(
    title= title,
    x = xlab,
    y = ylab
  )

  # Use a default theme and get rid of the legend
  p <- p + theme_classic()
  p <- p + theme(legend.position="none")


  # Save if necessary
  if(save) ggsave(p, width=width, height=height, dpi=300, filename=outfn)


  # Rename the data frame back to the actual name of the geography
  names(df) <- gsub("geo", geo, names(df))


  # return plot
  return(p)

}



#' arima_plot: This uses the output from \code{run_arima} to create a figure showing the
#' difference between the actual and expected searches for a single geography.
#'
#' @param df The dataframe as outputted by \code{run_arima}.
#' @param geo The column name of the geography of searches that you want.
#' @param beginplot The date you want the plot to start
#' @param endplot The date you want the plot to end.
#' @param interrupt The date of the interruption (should be the same as \code{run_arima})
#' @param linelabel Label next to the vertical interruption line
#' @param title The title of the figure. The default is no title.
#' @param xlab The label for the x axis
#' @param ylab The label for the y axis
#' @param lbreak The distance between tick marks in the x axis, i.e., "one year" or "3 month"
#' @param lwd Line width
#' @param width Width of the plot in inches
#' @param height Height of the plot in inches
#' @param save Default is True. If False, the plot is not saved.
#' @param outfn Where to save the plot.
#' @keywords
#' @export
#' @examples
#' arima_plot(
#'            df,
#'            title = "Searches to Purchase Cigarettes - US",
#'            xlab = "Date",
#'            ylab = "Query Fraction",
#'            outfn = './output/fig.pdf',
#'            beginplot = "2019-09-01",
#'            endplot = "2020-01-15",
#'            lbreak = "1 year",
#'            linelabel = "Tobacco 21 Signed",
#'            interrupt = ymd("2019-12-19"),
#'            width = 6,
#'            height = 3,
#'            lwd = 0.3,
#'            save = T
#'            )
arima_plot <- function(
  df,
  geo = 'US',
  title = NULL,
  xlab = "Date",
  xfmt = date_format("%b %Y"),
  ylab = "Query Fraction\n(Per 10 Million Searches)",
  outfn = './output/fig.pdf',
  beginplot,
  endplot,
  lbreak = "1 year",
  linelabel = "Interruption",
  linelabelpos = 0.02,
  ylim = NULL,
  hicol = NA,
  locol = NA,
  nucol = NA,
  opcol = NA,
  colorscheme = "red",
  polyalpha = 0.9,
  interrupt,
  width = 6,
  height = 3,
  lwd = 0.3,
  save = T,
  extend = F,
  labels = T,
  bootnum = 1000,
  alpha =0.05,
  labsize = 0.5
  ){

  # Set a colorscheme
  colorschemer(colorscheme)

  # This allows us to use T as a default for begin and after.
  # If either is T, it just takes the beginning/end of the data
  if(beginplot == T) begin <- min(ymd(df$timestamp))
  if(endplot == T) end <- max(ymd(df$timestamp))

  # If extend is true, you make the plot fit around existing data points.
  if(!extend){
    # We need to select a begin and end point that is a datapoint. Therefore, we
    # use the closest_date function to select something before or equal to begin...
    # ... and after or equal to end
    beginplot <- closest_date(data = df, date = beginplot, type = "beforeequal")
    # This gives us an option where
    endplot <- closest_date(data = df, date = endplot, type = "afterequal")
  }


  # There are two possibilities. Either interupt occurs on a date that we have
  # data for or it occurs between two data points.

  # Case 1: If the interrupt date is actually in the data frame, then this is fairly simple.
  # This keeps interrupt as the same date, because it is equal. The interrupt line
  # looks best on the date immediately before the interruption -- this way it is
  # looks like the fitted line diverges from the actual line right at the interruption
  # line.

  # Case 2: If our interruption is between two dates in the dataset, it's a bit more
  # complicated. We assume that the period including your interruption should
  # be part of the "post-period". Google data is formatted such that the date
  # corresponds to the last date in the period. Therefore, the date after your
  # interruption corresponds to the week that your interruption happened.
  # We use this as the interruption. We still need the line to come at the date
  # before the interruption, so we use "before" to set our interruption.

  # All together, we correct the interrupt to be either the same as it was
  # (case 1) or the next date (case 2). From there, we use the date before then
  # for the interuption line.

  interrupt <- closest_date(data = df, date = interrupt, type = "afterequal")
  interrupt_line <- closest_date(data = df, date = interrupt, type = "before")


  # We need to know what kind of data this is. We infer with diff.date.
  # If the minimum difference is 1 day, its daily. If it's 7, it's weekly. Etc.
  freq <- min(as.numeric(diff.Date(df$timestamp)), na.rm = T)


  # We want to ensure that all of these are date objectss
  beginplot <- ymd(beginplot)
  endplot <- ymd(endplot)
  interrupt <- ymd(interrupt)
  df$timestamp <- ymd(df$timestamp)


  # Change the name of the geography to something that's easier to use
  names(df) <- gsub(geo, "geo", names(df))


  # We just want to set the maximum time and search value occuring after the interruption.
  # We use these to place a point right at the peak.
  maxval <- df %>% filter(timestamp >= interrupt) %>% filter(geo == max(geo, na.rm = T)) %>% pull(geo)
  maxtime <- df %>% filter(timestamp >= interrupt) %>% filter(geo == max(geo, na.rm = T)) %>% pull(timestamp)



  ## CREATE PLOT

  # Create a circular data frame for the polygon
  poly <- with(df %>% filter(timestamp %within% interval(interrupt_line, endplot)),
              data.frame(x = c(timestamp, rev(timestamp)), y = c(geo, rev(fitted)), polycolor=nucol))


  # We create the plot including...
  p <- ggplot(df)
  # the polygon
  p <- p + geom_polygon(data = poly, aes(x = x, y = y, fill=nucol), fill=nucol, alpha=polyalpha)
  # vertical line and line label
  p <- p + annotate("text", x = interrupt_line - (as.numeric((endplot - beginplot)) * linelabelpos), y = maxval*0.98, label = linelabel, hjust=1, vjust = 1)
  p <- p + geom_vline(xintercept=interrupt_line, linetype="dashed", color="grey74")
  # the fitted line, which is equal to the actual line for the entire pre period
  p <- p + geom_line(aes(x=timestamp, y=fitted, group=1, color=locol), linetype="solid", size=lwd)
  # the actual line, which covers the fitted line for the pre period
  p <- p + geom_line(aes(x=timestamp, y=geo, group=1, color=hicol), linetype="solid", size=lwd)

  # pass axis arguments
  p <- p + scale_x_date(date_breaks = lbreak,
                   labels=xfmt,
                   limits = as.Date(c(beginplot, endplot)))
  p <- p + labs(
    title= title,
    x = xlab,
    y = ylab
  )
  p <- p + scale_y_continuous(limits = ylim)

  # This creates a legend for the expected and actual values
  p <- p + scale_colour_manual(name = 'Legend', values=c(hicol, locol), labels = c('Actual','Expected'))
  p <- p + scale_fill_manual(values=nucol)
  p <- p + theme(legend.position=c(0.1,0.9))

  # Classic is the default theme
  p <- p + theme_classic()
  p <- p + theme(plot.title = element_text(hjust = 0.5))




  ## We are likely interested in the percentage difference between ARIMA forecasted
  ## and actual searches. We print that out automatically.


  # Set seed for bootstrapping
  set.seed(1234)

  # Make sure everything is in Date format still
  df$timestamp <- ymd(df$timestamp)
  interrupt <- ymd(interrupt)
  endplot <- ymd(endplot)

  # Get a vector of the expected and actual searches
  expectedsearches <- df %>% filter(timestamp %within% interval(interrupt, endplot)) %>% pull(fitted)
  actualsearches <-   df %>% filter(timestamp %within% interval(interrupt, endplot)) %>% pull(geo)

  # Use the boot package to create vectors of bootstrapped means from these
  ratiomeans <- boot(data = na.omit(actualsearches / expectedsearches - 1), statistic = samplemean, R = bootnum)
  # expectedmeans <- boot(data = expectedsearches, statistic = samplemean, R = bootnum)
  # actualmeans <- boot(data = actualsearches, statistic = samplemean, R = bootnum)

  # Put these bootstrapped vectors together and calculate percent diff
  # bootdf <- data.frame("expectedmeans" = expectedmeans$t, "actualmeans" = actualmeans$t)
  # bootdf <- bootdf %>% mutate(
  #   pctdiff = ((actualmeans / expectedmeans) - 1)
  # )

  # Extract the bootstrapped percent difference as a vector
  # booted_vec <- bootdf %>% pull(pctdiff)
  booted_vec <- ratiomeans$t

  # Report the mean and CI of this vector
  mn <- mean(actualsearches / expectedsearches - 1, na.rm = T)
  hi95 <- as.numeric(quantile(booted_vec, 1-(alpha/2), na.rm = T))
  lo95 <- as.numeric(quantile(booted_vec, (alpha/2), na.rm = T))
  lab <- sprintf("%1.0f%% Increase (95%%CI %1.0f to %1.0f)", mn*100, lo95*100, hi95*100)
  print(lab)


  # If labels are on, this will print that result on the top of your figure
  if(labels){
    g = grobTree(textGrob(lab, x=0.5, hjust=0.5, y=1, vjust=1, gp=gpar(fontsize=labsize*12)))
    p <- p + annotation_custom(g)
  }

  # Save if necessary
  if(save) ggsave(p, width=width, height=height, dpi=300, filename=outfn)

  # Change geo name back to geography
  names(df) <- gsub("geo", geo, names(df))
  print(sprintf("beginplot is %s", beginplot))

  # Return plot
  return(p)

}





#' get_rawcounts using estimates from Comscore
#'
#' @param df The dataframe as outputted by \code{run_arima}.
#' @param month 1 or 2, 1 is the earlier month and 2 is the later. Default 2
#' @param pct_desktop Assumption of the percentage of searches that are executed on Desktops. Default 0.35
#' @param geo Location
#' @param interrupt The date of the interruption (should be the same as \code{run_arima})
#' @param qf_denominator Denominator for fractions given in gtrendspy. Default 10M.
#' @param endperiod Compute raw counts from the interrupt to an enddate
#' @keywords
#' @export
#' @examples
#' get_rawcounts <- function(
#'   df,
#'   month = 2,
#'   pct_desktop = 0.35,
#'   geo = "US",
#'   interrupt = "2020-03-01",
#'   qf_denominator = 10000000,
#'   endperiod = T
#'   )


get_rawcounts <- function(
  df,
  month = 2,
  pct_desktop = 0.35,
  geo = "US",
  interrupt = "2020-03-01",
  qf_denominator = 10000000,
  endperiod = T,
  numobspermonth = NA
  ){


    # First, we need to parse comscore's website. We get the data as an rvest object
    html_data <- read_html("https://www.comscore.com/Insights/Rankings?cs_edgescape_cc=US#tab_search_query/")

    #... and pull out the url for the Google spreadsheet containing the data
    spts <- html_data %>% html_nodes("script")
    spts_text <- spts %>% html_text()
    script_with_spreadsheet <- grep("public_spreadsheet_url", spts_text, value = T)
    sheeturl <- strapplyc(script_with_spreadsheet, 'public_spreadsheet_url = "(.+pubhtml)";') %>% unlist()

    # We read the spreadsheet (authentication not required)
    sheets_deauth()
    d <- read_sheet(sheeturl, sheet = "search_query")
    names(d)[2] <- "engines"

    # ... and find the value for Google searches
    google_searches <- d %>% filter(grepl("google", tolower(engines)))


    # We choose the month and the value depending upon which month the user wants
    if(month==1){
      monthname <- names(d)[3]
      num_searches <- d[1, 3]  %>% as.numeric()
    } else {
      monthname <- names(d)[4]
      num_searches <- d[1, 4] %>% as.numeric()
    }

    print(sprintf("Using Comscore estimates for %s: %s Million Searches", monthname, num_searches))


    # We then take our data and figure out how many observations are in the month
    # we pulled
    tmpdf <- df
    names(tmpdf) <- gsub(geo, "geo", names(tmpdf))
    tmpdf$month <- month(tmpdf$timestamp)
    month_as_date <- as.Date(tolower(paste0(monthname,"-15")), format = "%b-%Y-%d")
    month_month <- month(month_as_date)
    month_year <- year(month_as_date)
    num_obs_in_month <- tmpdf %>%
            filter(
              month(timestamp) == month_month,
              year(timestamp) == month_year
            ) %>% nrow()


    # If the month we pulled is not in the dataset, we just use the frequency of the data
    if(num_obs_in_month == 0){
      freq <- min(as.numeric(diff.Date(df$timestamp)), na.rm = T)
      num_obs_in_month <- 30 / freq
    }

    if(!is.na(numobspermonth)) num_obs_in_month <- numobspermonth

    print(sprintf("Assuming %s observations per month", num_obs_in_month))


    # Now we have the total number of searches for the month, the number of observations
    # in our dataset per month, and our query fraction. The rest is arithmetic.


    tmpdf <- tmpdf %>% mutate(
      # comscore reports in units of 1M searches
      num_desktop_searches_per_month = num_searches * 1000000,

      # From above
      num_obs_in_month = num_obs_in_month,

      # Puts total searches in the same scale as the observations
      num_desktop_searches_per_obs = num_desktop_searches_per_month / num_obs_in_month,

      # From function arguments
      pct_searches_desktop = pct_desktop,

      # Bakc out the number of total searches per observation
      num_total_searches_per_obs = num_desktop_searches_per_obs / pct_searches_desktop
    )

    tmpdf <- tmpdf %>% mutate(
      # Actual number of searches
      rawcount_actual = geo * num_total_searches_per_obs / qf_denominator,
      # Fitted number of searches
      rawcount_fitted = fitted * num_total_searches_per_obs / qf_denominator,
    )

    # only take those columns we need
    tmpdf <- tmpdf %>% select(
      timestamp,
      geo,
      fitted,
      num_desktop_searches_per_month,
      num_obs_in_month,
      num_desktop_searches_per_obs,
      pct_searches_desktop,
      num_total_searches_per_obs,
      rawcount_actual,
      rawcount_fitted
    )


    # This gives us the option to report some summary data
    if(!is.na(interrupt)){

      # If endperiod is T, take the last date in the data
      if(is.logical(endperiod)) endperiod <- max(tmpdf$timestamp, na.rm = T)

      # Make sure everything is in date format
      endperiod <- ymd(endperiod)
      interrupt <- ymd(interrupt)

      # Sum up all actual searches since the interruption
      totalsearches <- sum(tmpdf %>% filter(timestamp %within% interval(interrupt, endperiod)) %>% pull(rawcount_actual), na.rm = T)
      # Sum up all fitted searches since the interruption
      expectedsearches <- sum(tmpdf %>% filter(timestamp %within% interval(interrupt, endperiod)) %>% pull(rawcount_fitted), na.rm = T)
      # Take the difference
      excesssearches <- totalsearches - expectedsearches

      # Report results
      print(sprintf("Actual Searches from %s to %s: %s", interrupt, endperiod, totalsearches))
      print(sprintf("Expected Searches from %s to %s: %s", interrupt, endperiod, expectedsearches))
      print(sprintf("Excess Searches from %s to %s: %s", interrupt, endperiod, excesssearches))
    }

    # Replace the name of geo with the actual geography
    names(tmpdf) <- gsub("geo", geo, names(tmpdf))


    # Return the data frame with raw counts
    return(tmpdf)

}
tlcaputi/gtrendR documentation built on Nov. 3, 2022, 10:46 p.m.