R/ITSModelingCode.R

Defines functions extrapolate_model add_lagged_covariates make_many_predictions_plug make_many_predictions make_model_smoother smooth_residuals smooth_series generate_prediction_sequence generate_structure_sequence make_autoregressive_series generate_Ybars

Documented in add_lagged_covariates extrapolate_model make_many_predictions make_many_predictions_plug make_model_smoother smooth_residuals smooth_series

# Default smoothing value
SMOOTH_K = 11




#' Generate the model predictions for a series, ignoring lagged components.
#'
#' Generate Ybar given a dataframe 'dat'. Ybar is our model predictions without
#' any lagged terms, fit to the pre-policy data only. This is basically the
#' "structural" component of our model, which can include seasonality.
#'
#' This is so we can show trendlines and so forth on plots.
#'
#' Ybar is defined for both pre and post policy datapoints.
#' @param fit_model A function that takes data, fits a linear model, and returns
#'   the fit model. This function needs an option to include (or not) lagged
#'   covariates.
#' @param outcomename String name of the outcome variable in dat.
#' @param t0 last pre-policy timepoint
#' @param dat Dataframe of the data.
#' @return Predicted sequence for passed dataframe (as vector)
#' @noRd
generate_Ybars = function( fit_model, outcomename, timename, t0, dat ) {
  
  # refit model with no_lags
  #fm = formula( fit0 )
  #vrs = all.vars( fm )
  #lgs = grep( "lag.", vrs, value=TRUE )
  #fmup = paste0( "~ . - ", paste( lgs, collapse =" - " ), collapse="" )
  
  #fit0.lagless = stats::update( fit0, fmup, data=dat )
  M0.lagless = fit_model( dplyr::filter( dat, !!sym(timename) <= t0 ), outcomename, timename, lagless=TRUE )
  
  stats::predict( M0.lagless, newdata = dat )
}




#' Make a synthetic timeseries with autoregressive structure
#' 
#' Given model parameters, make a synthetic timeseries for all points after t0.
#' This series is a plausible continuation of a pre-policy series.  This method needs
#' to be given the structural component, calculated from the intercept, linear trend,
#' temperature, and seasonality.
#
#' Bundling all seasonality, etc., into Ybar, the assumed model is:
#'
#'    $$Y = Ybar + beta1 * Y.pre + residual$$
#'
#' where Ybar is our structural model (with lagged OUTCOME, not lagged residual, so the generate_Ybars
#' method will not give the correct values here).
#
#' @param Ybar Structural (expected) sequence.  This method adds the autoregressive residual component
#'       to this.  Ybar tends to come from generate_structure_sequence()
#' @param Y.init Value of series at time point just before first time point in dat.post
#' @param beta1 Coefficient for lagged outcome
#' @param sigma Standard deviation of the residual errors to add to each step
#' @param expected If TRUE, return the expected value of the series with no residual noise
#' @param include.start If TRUE, return Y.init as the initial element in the returned sequence.
#'
#' @return Vector of the synthetic outcomes.
#' @noRd
make_autoregressive_series = function( Ybar, Y.init, beta1, sigma,
                                       expected=FALSE, include.start = FALSE ) {
  if ( expected ) {
    Ys = c( Y.init, rep( 0, length(Ybar) ) )
  } else {
    Ys = c( Y.init, stats::rnorm( length( Ybar ), mean=0, sd=sigma ) )
  }
  for ( i in seq_along( Ybar ) ) {
    Ys[i+1] = Ybar[[i]] + beta1*Ys[[i]] + Ys[[i+1]]
  }
  
  if ( include.start ) {
    Ys
  } else {
    Ys[-1]
  }
}


#' Generate the predictions of fit0 excluding the lagged outcome variable.
#'
#' This is a utility function used by the make_autoregressive_series() method.
#'
#' We do this so we can subtract this part off before smoothing so our smoother
#' doesn't have to try and capture the seasonality stuff.
#' @param fit0 Model fit to data.
#' @param dat Data to predict outcomes for using fit0.
#' @return Predictions as a numeric vector.
#' @noRd
generate_structure_sequence = function( fit0, dat ) {
  dat$lag.outcome = 0
  stats::predict( fit0, newdata=dat )
}



#' Make a simulated series of time points that follow a fit model.
#'
#' Given a model captured in beta.vec (the vector of coefficients from
#' a regression held in 'fit0') and sigma (the standard deviation of
#' the residuals), make a simulated series of time points that follow
#' the model.
#'
#' @param beta.vec Vector of parameter values
#' @param sigma Value for standard deviation of independent portion of
#'   residual
#' @param dat The dataframe (pre and post policy)
#' @param fit0 The original model fit to the data
#' @param outcomename The outcome of interest
#' @param t0 The last pre-policy timepoint.
#'
#' @return Dataframe, one row per timepoint.  Columns are 'time' (time
#'   of observation), 'Ybar' the predicted outcomes from the model,
#'   and 'Ystar', a concatenation of the pre-policy series followed by
#'   the synthetic sequence.
#' @noRd
generate_prediction_sequence = function( beta.vec, sigma,
                                         dat, fit0, 
                                         outcomename = "Y", timename = "time", t0 = 0 ) {
  stopifnot( is.data.frame( dat ) )
  stopifnot( timename %in% colnames(dat) )
  stopifnot( outcomename %in% colnames(dat) )
  
  fit0$coefficients = beta.vec
  beta1 = stats::coef( fit0 )[[ "lag.outcome" ]]
  
  # Predict what we would see, setting lagged outcome to 0 (to set up simulation).
  dat$Ybar.core = generate_structure_sequence( fit0, dat=dat )
  
  # Calculate what the parameters would suggest we would see in expectation, conditional on the first
  # observed timepoint.
  dat$Ybar = make_autoregressive_series( dat$Ybar.core[-c(1)], dat[[outcomename]][1], beta1=beta1, sigma=sigma,
                                         expected = TRUE,
                                         include.start = TRUE)
  
  dat.post = dplyr::filter( dat, !!sym(timename) > t0 )
  dat.pre = dplyr::filter( dat, !!sym(timename) <= t0 )
  dat.thresh = dat.pre[ nrow( dat.pre ), ]
  
  dat.post$Ystar = make_autoregressive_series( dat.post$Ybar.core,
                                               Y.init=dat.thresh[[outcomename]], beta1=beta1, sigma=sigma )
  
  # For prepolicy data, keep our structural predictions, but
  # our observed data are our 'Ystar'
  prepolicy = dplyr::tibble( !!timename := dat.pre[[timename]],
                          Ybar = dat.pre$Ybar,
                          Ystar = dat.pre[[outcomename]] )
  
  dplyr::bind_rows( prepolicy,
                    dplyr::select( dat.post, !!timename, Ybar, Ystar ) )
}


#' Smooth a series using a static loess smoother
#'
#' Use loess smoother on complete series of residuals including original data
#' pre-policy and synthetic data post policy (i.e., smooth the entire plausible
#' series).
#'
#' This method takes several parameters it does not use, to maintain
#' compatability with smooth_residuals.
#'
#' @param res A dataframe with a 'timename' column and an 'outcomename' column (which
#'   is the column that will be smoothed).
#' @param smooth_k A rough proxy for the number of observations to primarily
#'   consider to kernal weight in the neighborhood of each timepoint (this is a
#'   bandwidth, and the loess smoother gets smooth_k / n as a span value).  We
#'   want to smooth with an absolute bandwidth, not one as function of how long
#'   the time series is.
#' @param outcomename String name of the outcome variable in dat.  Defaut is "Y".
#' @param timename Name of the time column.  Default is "time".
#' @param t0 last pre-policy timepoint
#' @param ... Extra arguments (not used in this function).
#' @param post.only If TRUE fit model and smooth post-policy only. WHY fit model
#'   on post-policy data only?  Because this will make sure the fixed pre-policy
#'   does not dominate too much?  We are focusing on post-policy so we want a
#'   good fitting model for that so we can get our residuals as "white noise" as
#'   possible before smoothing.
#'
#' @return An updated version of the 'res' dataframe with `Ysmooth`, the
#'   smoothed predictions of the original Ystar outcome.  Also includes 'Ystar'
#'   the original sequence to be smoothed.
#'
#' @example examples/example_smooth_residuals.R
#' 
#' @export
#' @importFrom rlang .data
#' @importFrom dplyr mutate
#' @importFrom stats formula
smooth_series = function( res, outcomename = "Y", timename = "time", t0 = 0,
                          smooth_k = SMOOTH_K,
                          post.only = TRUE,
                          ... ) {
  
  stopifnot( is.data.frame(res) )
  stopifnot( timename %in% colnames(res) )
  
  if ( post.only ) {
    res.dat = dplyr::filter( res, .data[[timename]] > t0 )
  } else {
    res.dat = res
  }
  
  form = paste0( outcomename, " ~ ", timename )
  mod = stats::loess( formula( form ), data=res.dat, span=smooth_k / nrow(res.dat) )
  
  res = mutate( res,
                Ysmooth = stats::predict( mod, newdata=res ) )
  
  # Unneeded, loess won't extrapolate
  #if ( post.only ) {
  #  res$Ysmooth[ res$time <= t0 ] = NA
  #}
  
  res$Ysmooth
}





#' Smooth residuals after model fit
#'
#' Smooth a series by fitting the model to the data, smoothing the residuals,
#' and then putting the model predictions back.
#'
#' Use loess smoother on complete series of residuals including original data
#' pre-policy and synthetic data post policy (i.e., smooth the entire plausible
#' series).
#'
#' @inheritParams smooth_series
#' @param fit_model A function that takes data, fits a linear model, and returns
#'   the fit model. This function needs an option to include (or not) lagged
#'   covariates.
#' @param full_output If TRUE give back pieces for diagnostics of smoothing
#'   process.
#' @param covariates A dataframe with all covariates needed in the model fitting
#'   defined by fit_model.
#'
#' @return A numeric vector of the smoothed residuals.  If full_output=TRUE
#'   return a dataframe with several other columns: `resid`, the residuals based
#'   on Ystar and the model, `residStar` the smoothed residuals, 'Ybar.sm' the
#'   structural predictions of the model used for smoothing.  Here the smoothed
#'   values will be 'Ysmooth'.
#' @example examples/example_smooth_residuals.R
#'   
#' @export
#' @seealso See \code{\link{smooth_series}} for a more vanilla version that
#'   smooths without the model fitting step.
#' @importFrom rlang .data
#' @importFrom dplyr mutate filter
smooth_residuals = function( res, t0 = 0, outcomename = "Y", timename = "time",
                             post.only = TRUE,
                             smooth_k = SMOOTH_K,
                             fit_model = fit_model_default,
                             covariates = res,
                             full_output = FALSE ) {
  
  stopifnot( is.data.frame( res ) )
  stopifnot( nrow( res ) == nrow( covariates ) )
  stopifnot( timename %in% colnames(res) )
  
  covariates$Y = res[[outcomename]]
  covariates$time = res[[timename]]
  
  if ( post.only ) {
    res.dat = dplyr::filter( covariates, .data[[timename]] > t0 )
  } else {
    res.dat = covariates
  }
  
  fit0star = fit_model( res.dat, outcomename = "Y", timename = timename, lagless=TRUE )
  
  covariates$Ybar.sm = stats::predict( fit0star, newdata=covariates )
  
  covariates = mutate( covariates, resid = .data$Y - .data$Ybar.sm )
  
  covariates$residStar = smooth_series( covariates, 
                                        outcomename = "resid", timename = "time", t0 = t0, 
                                        smooth_k=smooth_k, post.only=post.only )
  
  covariates = mutate( covariates,
                       Ysmooth = .data$Ybar.sm + .data$residStar )
  
  # Give back columns of interest (for debugging)
  if ( full_output ) {
    dplyr::select( covariates, all_of( c( timename, "resid", "residStar", "Ybar.sm", "Y", "Ysmooth" )  ) )
  } else {
    covariates$Ysmooth
  }
}



#' Make a smoother that fits a model and then smooths residuals
#'
#' This helper function gives back a function that takes the resulting
#' simulation data from a single iteration of the simulation, and fits
#' 'fit_model' to it, smoothes the residuals, and puts the predictions from
#' 'fit_model' back.
#'
#' This can be used to build smoothers that smooth using models other than the
#' model being used for extrapolation (e.g., a model without temperature).
#'
#' Resulting functions have the following parameters: `res` (the data), `t0`
#' (start time), `outcomename`, `post.only` flag (for smoothing only post data
#' or not), and `smooth_k`, a tuning parameter for degree of smoothing.
#'
#'
#' @inheritParams smooth_residuals
#'
#' @return A smoother function that can be passed to the smoothing routines.
#'   This function is of the form listed above.
#' @example examples/make_model_smoother.R
#'
#' @export
make_model_smoother = function( fit_model, covariates ) {
  
  f = function( res, t0, outcomename, timename, post.only = TRUE, smooth_k = SMOOTH_K ) {
    smooth_residuals( res=res, t0=t0, 
                      outcomename=outcomename, 
                      timename = timename,
                      post.only=post.only, 
                      smooth_k=smooth_k,
                      fit_model=fit_model, 
                      covariates=covariates )
  }
  
  return( f )
}







#' Generate a collection of raw counterfactual trajectories
#'
#' Given a fit linear model 'fit0', generate R prediction series starting at t0.
#' This takes model uncertainty into account by pulling from the
#' pseudo-posterior of the model parameters (from Gelman and Hill arm package).
#'
#' @param fit0 The fit linear model to simulate from.
#' @param R Number of series to generate.
#' @param t0 Last time point of pre-policy.  Will start predicting at t0+1.
#' @param dat A dataframe with the covariates needed by the model fit0 for both
#'   pre and post-policy time points
#' @param outcomename The name of the column in dat which is our outcome.
#' @references The `arm` package, see
#'   \url{https://cran.r-project.org/package=arm}
#'
#'   Also see Gelman, A., & Hill, J. (2007). Data analysis using regression and
#'   multilevelhierarchical models (Vol. 1). New York, NY, USA: Cambridge
#'   University Press.
#'
#' @return A data.frame with the collection of predicted series, one row per
#'   time point per replicate (so will have R*nrow(dat) rows).
#'
#' @example examples/make_many_predictions.R
#'
#' @export
make_many_predictions = function( fit0, dat, R, outcomename, timename, t0 ) {
  
  # Generate collection of plausible beta values and sigma values we
  # will use for our predictions
  a = arm::sim( fit0, n.sims=R )
  coefs = stats::coef( a )
  
  # How often are our posterior lag.outcome coefficients larger than 1, which
  # creates expotential blow-up in our series.
  if( mean( coefs[,"lag.outcome"] > 1.0 ) > 0.01 ) {
    warning( sprintf( "Substantial estimated coefficients (%.1f percent) for lagged outcome are greater than 1 which will give substantial instability.",
                      100*mean( mean( coefs[,"lag.outcome"] > 1.0 )  ) ) )
  }
  coefs = as_tibble( coefs )
  
  # get names of coefficients that were not dropped (e.g., due to colinearity)
  cc = names( stats::coef( fit0 )  )[ !is.na( stats::coef( fit0 ) ) ]
  
  colnames( coefs ) = cc
  
  # This dark arts code makes the coefficient matrix a list of named vectors.
  # So each element of the list is a vector of coefficients. Split divides coefs
  # into a list of 1-row dataframes, as.list converts the dataframe to a list
  # and unlist converts that back to a vector.  The map does it to each of our 1
  # row dataframes we get from the split() call.
  coefs = split( coefs, seq( nrow(coefs) ) ) %>%
    purrr::map( as.list ) %>% 
    purrr::map( unlist )
  
  # get our sigmas
  sigmas = a@sigma
  #coefs$sigma = sigmas
  #names( coefs ) = c( "beta0", "beta1", "beta2", "beta3", "beta4", "beta5", "beta6", "sigma" )
  
  
  # For each plausible coefficient vector, generate a predictive series.
  res = purrr::map2_dfr( coefs, sigmas, generate_prediction_sequence, .id="Run",
                         dat=dat, fit0 = fit0, outcomename=outcomename, timename = timename, t0=t0 )
  
  res
}




#' @describeIn make_many_predictions This version makes multiple predictions
#'   using estimated parameters without additional uncertainty. This takes point
#'   estimates from the fit model as fixed parameters. WARNING: This method will
#'   not capture true uncertainty as it is not taking parameter uncertainty into
#'   account.
#' @export
make_many_predictions_plug = function( fit0, dat, R, outcomename, timename, t0 ) {
  
  # Repeatidly generate a predictive series.
  res = plyr::rdply( R, generate_prediction_sequence( stats::coef( fit0 ), stats::sigma( fit0 ),
                                                      dat=dat, fit0 = fit0, 
                                                      outcomename=outcomename, timename = timename,
                                                      t0=t0 ),
                     .id = "Run" )
  
  res
}



#' Augment dataframe with lagged covariates
#'
#' Take outcome and a list of covariates and add new columns with lagged
#' versions.  Assumes rows of dataframe are in time ascending order.  Lagged
#' outcome canonically called 'lag.outcome'.  Covariates 'lag.XXX'.
#' 
#' @param dat   The dataframe
#' @param outcomename   The outcome of interest (string)
#' @param covariates The covariates to lag along with the outcome. This can be
#'   either of two things.  First, it can be a list of string names.  Covariates
#'   can also be a function with a "lags" attribute with the listed covariates
#'   (as returned by, e.g., make_fit_season_model)  (which is a list of string
#'   names). NULL if no covariates other than outcome should be lagged.
#'
#' @return Augmented dataframe with lagged covariates as new columns. Will
#'   clobber old columns if the names (of form "lag.XXXX") conflict.
#' @examples
#' data( "newjersey" )
#' newjersey = add_lagged_covariates(newjersey, "n.warrant", c("sin.m","cos.m" ) )
#' head( newjersey[ c( "n.warrant", "sin.m", "lag.outcome", "lag.sin.m" ) ] )
#' @export
add_lagged_covariates = function( dat,
                                  outcomename,
                                  covariates = NULL ) {
  if ( !( outcomename %in% names(dat) ) ) {
    stop( sprintf( "Outcome '%s' is not a column in passed data", outcomename ) )
  }
  
  # make lagged covariates for modeling
  dat = dplyr::mutate( dat, lag.outcome = dplyr::lag( dat[[outcomename]] ) )
  
  if ( !is.null( covariates ) ) {
    if ( !is.null( attr( covariates, "lags" ) ) ) {
      covariates = attr( covariates, "lags" )
    }
  }
  
  if( !is.null( covariates ) ) {
    lc = paste0( "lag.", covariates )
    for ( i in seq_along( covariates ) ) {
      dat[ lc[[i]] ] = dplyr::lag( dat[[ covariates[i] ]] )
    }
  }
  
  dat
}






#' Extrapolate pre-policy data to post-policy era
#'
#' This function takes a fitted model and uses it to make post-policy
#' predictions by simulating data.
#'
#' @param M0  The fit model
#' @param outcomename Outcome of interest (name of column).
#' @param timename Name of the time variable (name of column).
#' @param dat Dataframe with data being analyzed.
#' @param t0  Last pre-policy timepoint
#' @param R  Number of replications
#' @param summarize Boolean, TRUE means collapse all simulated
#'   trajectories into single aggregate. FALSE means return all paths.
#' @param smooth Boolean.  TRUE means fit a smoother to the
#'   trajectories and look at distribution of smoothed trajectories.
#'   FALSE means look at raw data treajectories.
#' @param smoother  Function to do smoothing, if smoothing set to
#'   TRUE.  Default is smooth_series()
#' @param fix_parameters Keep the parameters in the model M0 as fixed;
#'   do not add parameter uncertainty.
#' @param ... Extra arguments to be passed to smoother (e.g,
#'   bandwidth).
#' @param full_output TRUE means smoother returns residuals as well as
#'   smoothed series.
#' @seealso \code{\link{process_outcome_model}} for wrapper function
#'   for this method that is easier to use.
#' @return Dataframe with columns corresponding to the simulations.
#'   If summarize=TRUE, one row per time point in original data.  If
#'   FALSE, all the details of all the runs are returned.
#' @example examples/extrapolate_model.R
#' @export
extrapolate_model = function( M0, dat, outcomename = "Y", timename = "time", t0 = 0, R=400, summarize=FALSE, smooth=FALSE,
                              smoother = smooth_series, full_output = FALSE, fix_parameters = FALSE, ...) {


  
  if ( fix_parameters ) {
    predictions = make_many_predictions_plug( M0, dat=dat, outcomename=outcomename, timename=timename, R=R, t0 = t0 )
  } else {
    predictions = make_many_predictions( M0, dat=dat, outcomename=outcomename, timename=timename, R=R, t0 = t0 )
  }
 
  # If we are smoothing, smooth all the simulation trajectories
  if ( smooth ) {
    predictions = predictions %>%
      tidyr::nest( data = c(!!sym(timename), Ybar, Ystar) ) %>%
      dplyr::mutate( Ysmooth = purrr::map( data, smoother,
                                           outcomename="Ystar", 
                                           timename = timename, t0 = t0, ... ) ) %>%
      tidyr::unnest(cols = c(data, Ysmooth))
  } else {
    predictions$Ysmooth = NA
  }
  
  # If we are summarizing, collapse runs into envelope.
  if ( summarize ) {
    
    if ( !smooth ) {
      
      p2 = predictions %>% 
        dplyr::group_by( !!sym(timename) ) %>%
        dplyr::summarise( Ymin = stats::quantile( Ystar, 0.025, na.rm=TRUE ),
                          Ymax = stats::quantile( Ystar, 0.975, na.rm=TRUE  ),
                          range = (Ymax - Ymin),
                          SE = stats::sd(Ystar, na.rm=TRUE ),
                          Ystar = stats::median( Ystar, na.rm=TRUE  ) )
      p2$Y = dat[[outcomename]]
      p2$Ysmooth = NA
      p2$Ysmooth1 = NA
      
    } else {
      
      p2 = predictions %>% 
        dplyr::group_by( !!sym(timename) ) %>%
        dplyr::summarise( Ymin = stats::quantile( Ysmooth, 0.025, na.rm=TRUE ),
                          Ymax = stats::quantile( Ysmooth, 0.975, na.rm=TRUE ),
                          range = (Ymax - Ymin),
                          SE = stats::sd(Ysmooth, na.rm=TRUE ),
                          Ysmooth = stats::median( Ysmooth, na.rm=TRUE  ),
                          Ystar = stats::median( Ystar, na.rm=TRUE  ) )
      
      # Add in smoothed true data line, smoothing the same way as we did with our synthetic lines.
      full.dat = tibble( time=dat[[timename]], Y = dat[[outcomename]] )
      full.dat$Ysmooth1 = smoother( full.dat, outcomename="Y", timename = "time", t0=t0, ... )
      
      #full.dat = rename( full.dat, Ystar1 = Ystar, Ysmooth1 = Ysmooth )
      #if ( full_output ) {
      #  full.dat = rename( full.dat, resid1 = resid, residStar1 = residStar )
      #}
      p2 = merge( p2, full.dat, by.x=timename, by.y = "time", all=TRUE )
    }
    
    attr( p2, "model" ) = M0
    
    p2
  } else {
    predictions
  }
}
lmiratrix/simITS documentation built on Sept. 1, 2023, 9:02 p.m.