# R/arima-spike-multiterms.R


#' multi_term_arima
#'
#' Runs an interrupted time-series analysis for every term-level gtrends CSV
#' in a directory: a model is fit to the pre-interruption period, forecast
#' over the post-period, and observed searches are compared with the
#' counterfactual forecast.
#'
#' @param input_dir A folder containing all of your gtrends data as CSV files
#' @param geo Geography you want to use, e.g. "US" (must match a column name in the CSVs)
#' @param terms_to_use Terms you'd like to analyze. If NA, all terms
#' @param timeframe_to_use Only analyze files whose names contain this timeframe. If NA, all timeframes
#' @param beginperiod Start of the "pre" period. If TRUE, the beginning of the data; if NA, it is computed from \code{preperiod}
#' @param preperiod Number of days before \code{interrupt} to use as the pre-period when \code{beginperiod} is NA
#' @param endperiod End of the "post" period. If TRUE, the end of the data
#' @param interrupt The date where things change. The model is fit on all days before the interrupt
#' @param bootstrap If TRUE, compute bootstrapped confidence intervals
#' @param bootnum Number of bootstrap replicates
#' @param na Imputation method for missing values in the pre-period: "kalman" or "locf"
#' @param kalman If TRUE, impute with Kalman smoothing (equivalent to \code{na = "kalman"})
#' @param include_data If TRUE, include the full data in the output (currently unused in the function body)
#' @param linear If TRUE, fit a linear trend model instead of ARIMA
#' @param min0 If TRUE, clamp fitted values so they cannot be negative
#' @param logit If TRUE, logit-transform the series before modeling (results are back-transformed)
#' @param scale If TRUE, rescale the series before modeling
#' @param alpha Significance level for the confidence intervals
#' @param arima.approx Passed to \code{auto.arima} as \code{approximation}
#' @return A list with two elements: \code{summary}, one row per term with the
#'   mean percent difference (\code{mean}) and its CI (\code{lo95}, \code{hi95}),
#'   and \code{full}, the merged actual/fitted time series for all terms
#' @export
#' @examples
#' multiterms <- multi_term_arima(
#'
#'   ## A folder containing all of your gtrends data
#'   input_dir = "./input",
#'
#'   ## Which data to use
#'   geo = "US", # Geography you want to use
#'   terms_to_use = NA, # Terms you'd like to analyze. If NA then all terms
#'   timeframe_to_use = NA, # Only analyze data with filenames that contain a certain timeframe. If NA then all timeframes
#'
#'
#'   ## Parameters of time periods
#'   beginperiod = T, # Beginning of the before period; if T, the beginning of the data
#'   preperiod = 90, # If beginperiod is NA, the before period starts preperiod days before the interrupt
#'   endperiod = T, # End of the after period; if T, the end of the data
#'   interrupt = "2020-03-01", # Date of the interruption, splitting the before and after periods
#'
#'
#'   ## Analytical arguments
#'   bootstrap = T, # Bootstrap CIs
#'   bootnum = 1000, # Number of bootstraps
#'   kalman = T # If T, impute with Kalman
#' )


multi_term_arima <- function(
  input_dir = "C:/Users/tcapu/Google Drive/modules/gtrendR/READMEcode/input",
  geo = "US",
  terms_to_use = NA,
  timeframe_to_use = NA,
  beginperiod = T,
  preperiod = 90,
  endperiod = T,
  interrupt = "2020-03-01",
  bootstrap = T,
  bootnum = 1000,
  na = "kalman",
  kalman = F,
  include_data = T,
  linear = F,
  min0 = F,
  logit = T,
  scale = T,
  alpha = 0.05,
  arima.approx = T
  ){



  # Users can set the length of the pre-period in days rather than giving a
  # start date: pass beginperiod = NA and it is computed from the interrupt.
  if(!is.na(preperiod) && is.na(beginperiod)){
    beginperiod <- ymd(interrupt) - preperiod
  }


  # Make sure the interruption is a Date object
  interrupt <- ymd(interrupt)


  # Find all files in the input directory
  files <- dir(input_dir, pattern=".csv", full.names = T)

  # These can be used to include or exclude certain terms
  if(!all(is.na(terms_to_use))){
    files <- grep(paste0(terms_to_use, collapse = "|"), files, value = T)
  }

  if(!is.na(timeframe_to_use)){
    files <- grep(timeframe_to_use, files, value = T)
  }


  # We are going to collect data through a for loop. ct is just a counter.
  summ_dat <- list()
  full_dat <- list()
  ct <- 1
  for (f in files){


    # For every individual term file, we first collect what the term is.
    term <- basename(f)
    term <- gsub("day|week|month|year|[.]csv", "", term)
    term <- trimws(term)

    # Then we read in the data
    df <- read.csv(f, header = TRUE, stringsAsFactors = FALSE)

    # For data periods larger than 1 day, Google gives data with the date equal to
    # the first date in each period. This doesn't look good in plots. So we need to
    # make it so that the dates in the dataset are the last date in each period. To
    # complicate things, Google will report preliminary data for the most recent
    # period even if that period is not complete.

    # First we figure out how many dates are in each period
    df$timestamp <- ymd(df$timestamp)
    freq <- min(as.numeric(diff.Date(df$timestamp)), na.rm = T)

    # If the end of the last period hasn't even occurred yet, we remove it from the dataset
    maxdate <- max(df$timestamp, na.rm = T)
    if(Sys.Date() < maxdate + freq) df <- df %>% filter(timestamp != maxdate)

    # Finally, we move the dates to be the end of the period. Note that if it
    # is daily data, freq is 1 and so the dates do not actually move.
    df$timestamp <- ymd(df$timestamp) + freq - 1
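    # (e.g., with weekly data a period stamped 2020-03-01 becomes 2020-03-07)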


    # Ensure that timestamp is a date object and that the geography is a standard name
    df$timestamp <- ymd(df$timestamp)
    names(df) <- gsub(geo, "geo", names(df))

    # If beginperiod/endperiod is T, then we use the min/max date from the dataset
    if(is.logical(beginperiod)) beginperiod <- min(df$timestamp, na.rm = T)
    if(is.logical(endperiod)) endperiod <- max(df$timestamp, na.rm = T)


    # We restrict the dataset using the beginperiod/endperiod
    df <- df %>% filter(timestamp %within% interval(beginperiod, endperiod))


    # Rescale by 1.1 * the max so every value is strictly below 1, which keeps
    # the logit transform below finite for nonzero searches
    if(scale || logit) df$geo <- df$geo / (max(df$geo, na.rm = T) * 1.1)

    if(logit){
      df$geo <- gtools::logit(df$geo)
    }



    # We need to know what kind of data this is, inferred with diff.Date:
    # if the minimum difference is 1 day, it's daily; if 7, weekly; etc.
    freq <- min(as.numeric(diff.Date(df$timestamp)), na.rm = T)


    # Here we put the searches into time series objects.

    df$timestamp <- ymd(df$timestamp)

    # time_series is the entire timeline
    time_series <- ts(df$geo, freq = 365.25/freq, start = decimal_date(beginperiod))

    # ts_training is just the pre-period
    ts_training <- ts(df %>% filter(timestamp < interrupt) %>% pull(geo), freq = 365.25/freq, start = decimal_date(beginperiod))

    # ts_test is just the post-period
    ts_test <- ts(df %>% filter(timestamp >= interrupt) %>% pull(geo), freq = 365.25/freq, start = decimal_date(interrupt))
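    # (e.g., daily data: freq = 1 gives a ts frequency of 365.25 observations
    # per year; weekly data: freq = 7 gives ~52.18)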


    # Impute missing values in the training series: Kalman smoothing if
    # na = "kalman" (or kalman = TRUE), last observation carried forward if
    # na = "locf"
    if(na == "kalman" || isTRUE(kalman)){
      ts_training <- na_kalman(ts_training, model = "auto.arima")
    } else if(na == "locf"){
      ts_training <- na_locf(ts_training)
    }



    set.seed(1234)
    # If linear is FALSE, we fit an ARIMA model to the pre-period. If not...
    if(!linear){

      # auto.arima selects the model order; approximation = arima.approx
      # trades some accuracy for speed on long series
      mod <- auto.arima(ts_training, approximation = arima.approx)

      # We forecast over the post-period with forecast(). With bootstrap = TRUE,
      # prediction intervals come from simulated sample paths rather than
      # assuming normally distributed errors.
      if(bootstrap){
        # h (horizon) should be equal to the length of the after-period
        # bootnum is the number of paths it's allowed to take
        fitted_values <- forecast(mod, h = length(ts_test), bootstrap = TRUE, npaths = bootnum)
      } else{
        fitted_values <- forecast(mod, h = length(ts_test))
      }

    } else{


      # If linear is TRUE, we fit a linear trend model instead.

      # First, we create a dataframe of train data and fit the model.
      x_train <- 1:length(ts_training)
      y_train <- as.numeric(ts_training)
      train <- data.frame("x" = x_train, "y" = y_train)
      mod <- lm(y ~ x, data = train)

      # Next, we create a data frame for the test data.
      x_test <- (length(ts_training) + 1):(length(ts_training) + length(ts_test))
      ft <- data.frame("x" = x_test, "y" = NA)


      if(bootstrap){

        # If bootstrap, we use boot_predict (from finalfit) to forecast values
        # for the test data from the model fit on the train data
        conf95 <- boot_predict(mod, newdata = ft, R = bootnum, condense = F)
        conf95 <- data.frame(conf95)
        names(conf95)[3:5] <- c("fit", "lwr", "upr")
        fitted_values <- data.frame("mean" = conf95$fit, "lower" = conf95$lwr, "upper" = conf95$upr)

      } else{

        # If not bootstrap, we use predict.lm to forecast the values for the test data
        # based upon the model fit on the train data.
        conf95 <- predict(mod, newdata = ft, interval = "confidence", level = 0.95)
        conf95 <- data.frame(conf95)
        fitted_values <- data.frame("mean" = conf95$fit, "lower" = conf95$lwr, "upper" = conf95$upr)
      }



    }



    # Create a data frame with the same number of rows as ts_test for the predicted values
    preds <- data.frame(
      "actual" = as.numeric(ts_test),
      "fitted" = fitted_values$mean,
      "lo" = fitted_values$lower,
      "hi" = fitted_values$upper
    )

    # Fix the column names: forecast() returns 80% and 95% bands (lo.80,
    # lo.95, hi.80, hi.95) while the linear path returns plain lo/hi; either
    # way we end up with lo95/hi95 columns
    names(preds) <- gsub("[.]", "", names(preds))
    names(preds) <- gsub("^lo$", "lo95", names(preds))
    names(preds) <- gsub("^hi$", "hi95", names(preds))

    preds <- preds %>% select(actual, fitted, lo95, hi95)

    preds$actual  <-     as.numeric(preds$actual)
    preds$fitted  <-     as.numeric(preds$fitted)
    preds$lo95    <-     as.numeric(preds$lo95)
    preds$hi95    <-     as.numeric(preds$hi95)

    if(logit){
      preds$actual <- gtools::inv.logit(preds$actual)
      preds$fitted <- gtools::inv.logit(preds$fitted)
      preds$lo95   <- gtools::inv.logit(preds$lo95)
      preds$hi95   <- gtools::inv.logit(preds$hi95)
    }

    # Also create a data frame that's the same size as the original data

    # Create an empty data frame with rows = length(ts_training)
    tmp <- data.frame(matrix(NA, nrow = length(ts_training), ncol = ncol(preds)))
    names(tmp) <- names(preds)
    tmp$actual <- as.numeric(ts_training)
    # bind it
    full <- data.frame(rbind(tmp, preds))


    # This ensures that each value in both data frames is non-negative. When
    # modeling a low-volume term, lo95 is often negative, which doesn't make
    # sense (you can't have negative searches).
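    # (minpos is assumed to be a package-internal helper that replaces
    # negative values, e.g. minpos <- function(x) pmax(x, 0))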

    if(min0){
      preds <- preds %>% mutate_if(is.numeric, minpos)
      full <- full %>% mutate_if(is.numeric, minpos)
    }

    # Summarize the ratio of actual to expected (fitted) searches over the
    # post-period

    # Get vectors of the expected and actual searches
    expectedsearches  <- preds %>% pull(fitted)
    actualsearches    <- preds %>% pull(actual)

    # Pointwise percent difference of actual vs the counterfactual forecast
    pctdiffs <- actualsearches / expectedsearches - 1
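    # (e.g., actual = 120 vs expected = 100 gives pctdiff = 0.20, i.e. 20%
    # more searches than expected)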



    # na.omit drops missing ratios before resampling; leaving NAs in can
    # noticeably shift the main effect estimates

    # Use the boot package to bootstrap the mean of the percent differences
    ratiomeans <- boot(data = na.omit(pctdiffs), statistic = samplemean, R = bootnum)

    # Extract the bootstrapped means as a vector
    booted_vec <- ratiomeans$t
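    # samplemean is a package-internal statistic for boot(); a minimal
    # version (assumed; defined elsewhere in the package) would be:
    #   samplemean <- function(x, i) mean(x[i], na.rm = TRUE)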

    # Report the mean and CI of this vector
    mn <- mean(actualsearches / expectedsearches - 1, na.rm = T)
    hi95 <- as.numeric(quantile(booted_vec, 1-(alpha/2), na.rm = T))
    lo95 <- as.numeric(quantile(booted_vec, (alpha/2), na.rm = T))

    summ <- data.frame(
        "term" = term,
        "mean" = mn,
        "lo95" = lo95,
        "hi95" = hi95
    )







    # Prefix the column names with the term so per-term data can be merged
    names(full) <- paste0(term, "_", names(full))
    full$timestamp <- df$timestamp


    # Place these data.frames in the list
    summ_dat[[ct]] <- summ
    full_dat[[ct]] <- full
    ct <- ct + 1

  }

  # bind the summ_dat together
  summary <- do.call(rbind.data.frame, summ_dat)


  # Merge the full data together
  full <- Reduce(function(x,y) merge(x = x, y = y, by = "timestamp"), full_dat)

  # Return both in a list
  out <- list("summary" = summary, "full" = full)
  return(out)

}
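

## A minimal usage sketch (assumed layout: ./input holds one gtrends CSV per term):
# multiterms <- multi_term_arima(input_dir = "./input", geo = "US",
#                                interrupt = "2020-03-01")
# multiterms$summary    # one row per term: mean, lo95, hi95
# head(multiterms$full) # merged actual/fitted time series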






#' multiterm_barplot: Use the data from multi_term_arima to create a barplot
#'
#' @param multiterm_list Output of \code{multi_term_arima} (a list), or its summary data frame
#' @param label_df A two-column dataframe mapping each term (column 1) to a display label (column 2), used to relabel the barplot x axis. If NA, terms are used as-is
#' @param title Plot title. If NULL, no title
#' @param xlab x axis label
#' @param ylab y axis label
#' @param ylim Vector of two values for the y axis limits
#' @param space Space between bars
#' @param colorscheme Color scheme set in this package: "red", "blue", or "jamaim"
#' @param hicol Color of the bars; if NA, taken from the colorscheme
#' @param save Default is TRUE. If FALSE, don't save
#' @param outfn Output filename
#' @param width Width of file in inches
#' @param height Height of file in inches
#' @param barlabels If TRUE, label each bar with its percent difference
#' @export
#' @examples
#' panG <- multiterm_barplot(
#'   multiterm_list = multiterms,
#'
#'   ## Graphing Parameters
#'   title = NULL, # If NULL, no Title
#'   xlab = "Terms", # x axis label
#'   label_df = NA, # Use a two-column dataframe to label the barplot x axis
#'   ylab = "Greater than Expected (%)", # y axis label
#'   space = 0.8, # space between bars
#'
#'   ## Set a colorscheme
#'   colorscheme = "blue",  # Color schemes set in this package "red", 'blue" or "jamaim"
#'
#'   # ... customize any color using these
#'   hicol = NA, # Color of bars
#'
#'   ## Saving arguments
#'   save = T, # If T, save plot
#'   outfn = './output/panG.png', # Location to save plot
#'   width = 6, # Width in inches
#'   height = 3 # Height in inches
#' )

multiterm_barplot <- function(
  multiterm_list,
  label_df = NA,
  title = NULL,
  xlab = "Terms",
  ylab = "Greater than Expected (%)",
  ylim = NULL,
  space = 0.8,
  colorscheme = "blue",
  hicol = NA,
  save = T,
  outfn = './output/panG.png',
  width = 6,
  height = 3,
  barlabels = T
  ){

  # Choose a colorscheme
  colorschemer(colorscheme)
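  # (colorschemer() is a package helper that appears to assign the color
  # variables used below, e.g. locol and hicol, into the calling environment)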

  # You may pass the output from multi_term_arima directly. If it is a list,
  if(inherits(multiterm_list, "list")){
    # we need the first element (the summary data frame)
    df <- multiterm_list[[1]]
  } else{
    # otherwise it's already a data frame and we can use the whole thing
    df <- multiterm_list
  }


  # If we are given a label data frame, we can relabel the barplot x axis
  if(is.data.frame(label_df)){
    # match each term from the multi_term_arima output to its label
    names(label_df) <- c("original", "label")
    df$term <- label_df$label[match(df$term, label_df$original)]
  }
  # Either way, show underscores as spaces
  df$term <- gsub("_", " ", df$term)



  # Create the plot
  p <- ggplot(df)

  # Barplot, ordering the terms from highest mean to lowest mean
  p <- p + geom_bar(aes(x = reorder(term, -mean), y = mean), fill = locol,
                    col = hicol, size = 0.5,
                    stat = "identity", position=position_dodge(width=space))
  # Add errorbars from lo95 and hi95
  p <- p + geom_errorbar(aes(x = reorder(term, -mean), ymin = lo95, ymax = hi95), width=.3)

  # If user wants barlabels
  if(barlabels){
    p <- p  + geom_text(aes(
                x = reorder(term, -mean),
                # adds label above if mean is pos or below if mean is neg
                y = ifelse(mean >=0, hi95, lo95),
                label = sprintf("%1.0f%%", mean * 100),
                vjust = ifelse(mean >= 0, 0, 1)
              ))
  }

  # Pass axis arguments
  p <- p + labs(
    title= title,
    x = xlab,
    y = ylab
  )
  p <- p + scale_y_continuous(
    limits = ylim,
    labels = function(x) paste0(x*100, "%")
  ) # Multiply by 100 & add Pct
  p <- p + theme_classic()
  p <- p + theme(legend.position="none")


  # Save if necessary
  if(save) ggsave(p, width=width, height=height, dpi=300, filename=outfn)


  # Return plot
  return(p)
}
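

## A sketch of relabeling terms with label_df (hypothetical term names):
# labels <- data.frame(original = c("hand_washing"), label = c("Hand washing"))
# panG <- multiterm_barplot(multiterms, label_df = labels, save = F)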



#' multiterm_spaghetti: Use the data from multi_term_arima to create a spaghetti plot
#'
#' @param multiterm_list Output of \code{multi_term_arima} (a list), or its full data frame
#' @param interrupt Date of the interruption, drawn as a dashed vertical line
#' @param terms_to_use Terms to include. If NA, all terms
#' @param terms_to_exclude Terms to exclude. If NA, none
#' @param normalize If TRUE, z-scale each term's searches so all terms share one axis
#' @param beginplot Start date for the plot
#' @param endplot End date for the plot
#' @param hicol Color of the mean line; if NA, taken from the colorscheme
#' @param locol Color of the individual term lines; if NA, taken from the colorscheme
#' @param save Default is TRUE. If FALSE, don't save
#' @param outfn Output filename
#' @param width Width of file in inches
#' @param height Height of file in inches
#' @export
#' @examples
#' panE <- multiterm_spaghetti(
#'   multiterm_list = multiterms,
#'   interrupt = "2020-03-01"
#' )

multiterm_spaghetti <- function(
  multiterm_list,
  interrupt = "2020-03-01",
  terms_to_use = NA,
  terms_to_exclude = NA,
  normalize = T,

  ## Plot Arguments
  beginplot = "2020-03-01", # Start date for the plot. If T, beginning of data
  endplot = "2020-04-03", # End date for the plot. If T, end of data
  title = NULL, # If NULL, no Title
  xlab = "Date", # x axis label
  lbreak = "1 week", # Space between x-axis tick marks
  xfmt = date_format("%b-%d"), # Format of dates on x axis
  ylab = "Query Fraction\n(Per 10 Million Searches)", # y axis label
  lwd = 1, # Width of the line
  spaghettilwd = 0.2, #width of spaghetti
  vlinecol = "grey72", # color of vertical line
  vlinelwd = 1, # width of vertical line

  ylim = c(NA, NA), # y axis limts

  ## Spaghetti specific adjustments
  spaghettialpha = 0.25, # How transparent do you want the spaghetti lines

  ## Set a colorscheme
  colorscheme = "blue",  # Color schemes set in this package "red", 'blue" or "jamaim"

  # ... customize any color using these
  hicol = NA, # Color of US line
  locol = NA, # Color of other lines

  ## Saving arguments
  save = T, # If T, save plot
  outfn = './output/panE.png', # Location to save plot
  width = 6, # Width in inches
  height = 3 # Height in inches
  ){


  # Choose a colorscheme
  colorschemer(colorscheme)

  # You may pass the output from multi_term_arima directly. If it is a list,
  if(inherits(multiterm_list, "list")){
    # we need the second element (the full time series)
    df <- multiterm_list[[2]]
  } else{
    # otherwise it's already a data frame and we can use the whole thing
    df <- multiterm_list
  }

  # We only want the actual values for each term
  actual <- df %>% select(timestamp, ends_with("_actual"))

  # These options allow us to include or exclude certain terms
  if(!all(is.na(terms_to_use))){
    grepstring <- paste(terms_to_use, collapse="|")
    names_to_use <- grep(grepstring, names(actual), value = T)
    actual <- actual %>% select("timestamp", all_of(names_to_use))
  }

  if(!all(is.na(terms_to_exclude))){
    grepstring <- paste(terms_to_exclude, collapse="|")
    names_to_exclude <- grep(grepstring, names(actual), value = T)
    actual <- actual %>% select(-all_of(names_to_exclude))
  }


  # We transform the data from wide to long, which works better with ggplot.
  actual_long <- melt(actual, id.vars = "timestamp", variable.name = "term", value.name = "searches")
  # the data frame is now just timestamp, term, and searches
  actual_long$term <- gsub("_actual$", "", actual_long$term)
  # Ensure timestamp is a date object
  actual_long$timestamp <- ymd(actual_long$timestamp)


  # If normalize, z-scale each term so all terms share one axis; as.numeric
  # drops the matrix attributes that scale() returns
  if(normalize){
    actual_long <- actual_long %>%
      group_by(term) %>%
      mutate(searches = as.numeric(scale(searches))) %>%
      ungroup()
  }


  # Ensure these are date objects
  interrupt <- ymd(interrupt)
  beginplot <- ymd(beginplot)
  endplot <- ymd(endplot)

  # We want the mean value for all the terms included in this analysis. We do this with group_by
  grouped_data <- actual_long %>% group_by(timestamp) %>% summarise(mean = mean(searches, na.rm = T)) %>% ungroup()
  grouped_data$timestamp <- ymd(grouped_data$timestamp)
  grouped_data <- as.data.frame(grouped_data)


  # We start the plot
  p <- ggplot(actual_long)
  # For each term, we draw a spaghetti line
  p <- p + geom_line(aes(x = timestamp, y = searches, group = term), col = locol, alpha = spaghettialpha, lwd = spaghettilwd)
  # We draw a darker line for the mean value
  p <- p + geom_line(data = grouped_data, aes(x = timestamp, y = mean), col = hicol, lwd = lwd)

  # Pass axis arguments
  p <- p + scale_x_date(date_breaks = lbreak,
                   labels=xfmt,
                   limits = c(ymd(beginplot), ymd(endplot)))
  p <- p + scale_y_continuous(
    limits = ylim
  )
  p <- p + labs(
    title= title,
    x = xlab,
    y = ylab
  )
  p <- p + geom_vline(xintercept=ymd(interrupt), linetype="dashed", color=vlinecol, lwd = vlinelwd)
  p <- p + theme_classic()


  # Save if necessary
  if(save) ggsave(p, width=width, height=height, dpi=300, filename=outfn)

  # Return ggplot
  return(p)

}