R/post_stratified_ITS.R
In simITS: Analysis via Simulation of Interrupted Time Series (ITS) Data

Documented in adjust_data aggregate_data calculate_group_weights generate_fake_grouped_data

##
## Post-stratified ITS estimator code
##



#' Calculate proportion of subgroups across time
#'
#' Calculate overall proportion of cases in each group that lie within a given
#' interval of time defined by t_min and t_max.
#'
#' @inheritParams aggregate_data
#' @param t_min The start month to aggregate cases over.
#' @param t_max The final month (default is last month).
#' @return Dataframe of each group along with overall average group weight in
#'   the specified timespan.
#' @example examples/aggregate_data_etc.R
#' @export
calculate_group_weights = function( groupname, dat, t_min, t_max = max( dat$month ), Nname = "N" ) {

  stopifnot( Nname %in% names( dat ) )
  #groupname.q = quo( groupname )
    # select target months to calibrate averages on
    dat = dplyr::filter( dat, month >= t_min, month <= t_max )

    # calculate the total sizes for each group
    sdat = dat %>% dplyr::group_by_at( groupname ) %>%
         dplyr::summarise( N = sum(!!rlang::sym(Nname)) )
    
    sdat = sdat %>% dplyr::ungroup() %>% dplyr::mutate( pi_star = N / sum(N) )

    sdat
}


#' Aggregate grouped data
#'
#' This will take a dataframe with each row being the outcomes, etc., for a
#' given group for a given month and aggregate those groups for each month.
#'
#' @param dat Dataframe with one row for each time point and group that we are
#'   going to post stratify on.  This dataframe should also have an column with
#'   passed name "Nname" indicating the number of cases that make up each given
#'   row. It should have a 'month' column for the time.
#' @param outcomename String name of the outcome variable in dat.
#' @param groupname Name of the column that has the grouping categorical
#'   variable
#' @param Nname Name of variable holding the counts (weight) in each group.
#' @param rich If TRUE, add a bunch of extra columns with proportions of the
#'   month that are each group and so forth.
#' @param is_count If TRUE the data are counts, and should be aggregated by sum
#'   rather than by mean.
#' @param covariates group-invariant covariates to preserve in the augmented
#'   rich dataframe.  These are not used in this method for any calculations.
#'   Pass as list of column names of dat
#' @return Dataframe of aggregated data, one row per month.  If rich=TRUE many
#'   extra columns with further information.
#' @example examples/aggregate_data_etc.R
#' @export
aggregate_data = function( dat, outcomename, groupname, Nname, 
                           is_count=FALSE,
                           rich = TRUE, covariates = NULL ) {

  if ( is.null( covariates ) ) {
    covariates = c()
  } else {
    if ( !all( covariates %in% names( dat ) ) ) {
      stop( "Covariates listed that are not in dataframe" )
    }
  }
  
    if ( is_count ) {
        dd <- dat %>% dplyr::group_by( month ) %>%
            dplyr::summarise( .Y = sum( (!!rlang::sym(outcomename)) ),
                     #  .Y.bar = sum( (!!rlang::sym(outcomename))  ) / sum(N),
                       N = sum( (!!rlang::sym(Nname)) ) )
        dd[ outcomename ] = dd$.Y
       # dd[ paste0( outcomename, ".bar" ) ] = dd$.Y.bar
        dd$.Y = dd$.Y.bar = NULL
    } else {
        dd <- dat %>% dplyr::group_by( month ) %>%
            dplyr::summarise( .Y = sum( (!!rlang::sym(Nname)) * (!!rlang::sym(outcomename)) ) / sum( (!!rlang::sym(Nname)) ),
                       N = sum( (!!rlang::sym(Nname)) ) )
        dd[ outcomename ] = dd$.Y
        dd$.Y = NULL
    }


    if ( rich ) {
        # calculate group sizes
        ddwts = dat %>% dplyr::select( "month", groupname, Nname ) %>%
          dplyr::rename( N = {{Nname}} ) %>%
          dplyr::group_by( month ) %>%
          dplyr::mutate( pi = N / sum( N ) ) %>% 
          dplyr::select( -N ) %>%
          tidyr::pivot_wider( month, 
                              names_from = groupname, values_from = "pi",
                              names_prefix = "pi_" )
        
        # throw in group baselines and covariates as well in wide form
        ddg = dat[ c( "month", groupname, outcomename, covariates ) ]
        ddg = tidyr::spread_( ddg, groupname, outcomename, sep="_" )
        names(ddg) = gsub( groupname, outcomename, names(ddg) )
        stopifnot(nrow(ddg) == nrow( dd ) )  # possibly covariates varied in spread?

        ddg$month = ddwts$month = NULL
        dd = dplyr::bind_cols( dd, ddg, ddwts )
    }

    dd
}


#' Adjust an outcome time series based on the group weights.
#'
#' Reweight the components of a series to match target weights for several
#' categories. This is a good preprocessing step to adjust for time-varying
#' covariates such as changing mix of case types.
#'
#' @param outcomename Name of column that has the outcome to calculated adjusted
#'   values for.
#' @param groupname Name of categorical covariate that determines the groups.
#' @param Nname Name of column in dat that contains total cases (this is the
#'   name of the variable used to generate the weights in pi_star).
#' @param include_aggregate Include aggregated (unadjusted) totals in the output
#'   as well.
#' @param dat  Dataframe of data.  Requires an N column of total cases
#'   represented in each row.
#' @param pi_star The target weights.  Each month will have its groups
#'   re-weighted to match these target weights.
#' @param is_count Indicator of whether outcome is count data or a continuous
#'   measure (this impacts how aggregation is done).
#' @param covariates Covariates to be passed to aggregation (list of string
#'   variable names).
#' @return Dataframe of adjusted data.
#' @example examples/aggregate_data_etc.R
#' @export
adjust_data = function( dat, outcomename, groupname, Nname, pi_star, is_count=FALSE,
                        include_aggregate = FALSE,
                        covariates = NULL ) {

    # add the target subgroup weights to the dataframe
    adat = merge( dat, pi_star[ c( groupname, "pi_star" ) ], by=groupname, all.x = TRUE )

    if ( is_count ) {
        adat[outcomename] = adat[[outcomename]] / adat[[Nname]]
    }

    # calculate adjusted outcomes
    adj.dat = adat %>% dplyr::group_by( month ) %>%
        dplyr::summarise( #.Y = sum( N * ( !!rlang::sym( outcomename ) ) / sum(N) ),
                   .Y.adj = sum( pi_star * !!rlang::sym( outcomename ) ),
                   N = sum(!!rlang::sym(Nname) ) )

    if ( is_count ) {
        adj.dat = dplyr::mutate( adj.dat, #.Y = .Y * N,
                          .Y.adj = .Y.adj * N )
    }

    oname = paste0( outcomename, ".adj" )
    adj.dat[ oname ] = adj.dat$.Y.adj
  #  adj.dat[ outcomename ] = adj.dat$.Y
    adj.dat$.Y.adj = adj.dat$.Y = NULL

    if ( include_aggregate ) {
        sdat = aggregate_data( dat, 
                               outcomename=outcomename, groupname=groupname, Nname=Nname, 
                               is_count=is_count, covariates = covariates )
        adj.dat = merge( adj.dat, sdat, by=c("N","month"), all=TRUE )
    }

    dplyr::arrange( adj.dat, month )
}





####### For simulation studies and illustration #######

#' A fake DGP with time varying categorical covariate for illustrating the code.
#'
#' This code makes synthetic grouped data that can be used to illustrate
#' benefits of post stratification.
#'
#' @param t_min Index of first month
#' @param t_max Index of last month
#' @param t0 last pre-policy timepoint
#' @param method Type of post-stratification structure to generate (three designs of 'complex', 'linear' and 'jersey' were originally concieved of when designing simulation studies with different types of structure).
#' @return Dataframe of fake data, with one row per group per time period.
#' @examples 
#' fdat = generate_fake_grouped_data(t_min=-5,t_max=10, t0 = 0)
#' table( fdat$month )
#' table( fdat$type )
#' @export
generate_fake_grouped_data = function( t_min, t0, t_max, method=c("complex","linear","jersey") ) {
    stopifnot( t_min < t0 )
    stopifnot( t_max > t0 )
    t = t_min:t_max

    method = match.arg(method)

    # number of cases of each type (not impacted by policy)
    # Drug is steadily declining.  violent is slowly increasing.
    N.drug = round( (200-800)*(t - t_min)/(t_max-t_min) + 800 )
    N.violent = round( (300-100)*(t - t_min)/(t_max-t_min) + 100 )
    
    if ( method == "complex" ) {
      # Add a seasonality component
      N.violent = N.violent + 55 * sin( 2 * pi * t / 12)
    }
    
    if ( method == "jersey" ) {
      N.drug = stats::rpois( length( t ), lambda=700 )
      N.violent = stats::rpois( length( t ), lambda=400 )
      N.property = stats::rpois( length( t ), lambda=500 )
      N.drug = pmax( 0.55, pmin( 1, 1 - (t - t0) / 25 ) ) * N.drug
    }
    
    if ( method=="linear" || method == "complex") {

        # impact on proportion of cases with outcome
        prop.base = arm::logit( seq( 0.8, 0.4, length.out=length(t) ) )

        prop.violent = arm::invlogit( prop.base - 1.5 + stats::rnorm( length(t), mean=0, sd=0.05 )
                                      + (t>t0) * pmin( 0.3*(t-t0), 1.5 ) )

        prop.drug =    arm::invlogit( prop.base + stats::rnorm( length(t), mean=0, sd=0.05 )
                                      - (t>t0) * (0.05*(t-t0)) )
    } else {
      # impact on proportion of cases with outcome
      prop.base = arm::logit( seq( 0.5, 0.55, length.out=length(t) ) )
      
      prop.violent = arm::invlogit( prop.base + 1.5 + stats::rnorm( length(t), mean=0, sd=0.02 )
                                    - (t>t0) * (0.01*(t-t0)) )
      
      prop.property = arm::invlogit( prop.base + 1 + stats::rnorm( length(t), mean=0, sd=0.02 )
                                    - (t>t0) * (0.003*(t-t0)) )
      
      prop.drug =    arm::invlogit( prop.base + stats::rnorm( length(t), mean=0, sd=0.02 ) 
                                    - (t>t0) * (0.005*(t-t0)) )
      
    }

    
    ## Scenario 1b: multifacet, complex.
    # if ( FALSE ) {
    #     # number of cases of each type (not impacted by policy)
    #     N.drug = round( 300 - 5 * t + 2 * sin( 2 * pi * t / 12) )
    #     N.violent = 30 + round( 100 - 0.1 * t + 10 * sin( 2 * pi * t / 12) )
    # 
    #     # impact on proportion of cases with outcome
    #     prop.drug = 0.6 - 0.01 * t   # baseline index (will recalculate below)
    #     prop.violent = arm::invlogit( prop.drug/2 + stats::rnorm( length(t), mean=0, sd=0.15 )
    #                                   + (t>t0) * pmin( 0.3*(t-t0), 1.5 ) )
    #     prop.drug = arm::invlogit( -1 + prop.drug - (t>t0)* (0.15*(t-t0)) + stats::rnorm( length(t), mean=0, sd=0.15 ) )
    # }
    # 
    # ## Scenario 2: change in number of drug cases, but no impact on case handling within category
    # ## Nonsensical, I think.
    # if ( FALSE ) {
    #     N.drug = round( 100 - 0.5 * t - (t >= t0) * ( 10 + (t-t0) * 2 ) )
    #     N.violent = round( 100 - 0.1 * t + 10 * sin( 2 * pi * t / 12) )
    # 
    #     prop.drug = 0.6 - 0.01 * t
    #     prop.violent = arm::invlogit( prop.drug + 0.2 + stats::rnorm( length(t), mean=0, sd=0.15 ) )
    #     prop.drug = arm::invlogit( -2 + prop.drug + stats::rnorm( length(t), mean=0, sd=0.15 ) )
    # }


    # bundle our subgroups
    make.frame = function( N, prop, type="unknown" ) {
        Y = round( N * prop )
        data.frame( month = t, type=type, N=N, Y=Y, prop = Y / N, stringsAsFactors = FALSE )
    }

    df = dplyr::bind_rows( make.frame( N.drug, prop.drug, "drug" ),
                    make.frame( N.violent, prop.violent, "violent" ) )
    if ( method =="jersey" ) {
      df = dplyr::bind_rows( df, 
                      make.frame( N.property, prop.property, "property" ) )
    }
    df = mutate( df,
                 M = 1 + (month %% 12),
                 M.ind = as.factor(M),
                 A = sin( 2 * pi * month / 12 ),
                 B = cos( 2 * pi * month / 12 ),
                 Tx = as.numeric(month >= t0) )

    df = dplyr::arrange( df, month )
    df
}





#### Exploring and testing our fake data structure ####

if ( FALSE ) {
    # fake, illustration data -- specifying the range of months
    t_min = -12*6.5
    t0 = 0
    t_max = 18

    dat = generate_fake_grouped_data( t_min, t0, t_max, method = "jersey" )
    head( dat )

    ss = aggregate_data( dat, "prop", "type", rich=TRUE )
    head( ss )
    plot( ss$pi_drug )
    
    sdat = aggregate_data( dat, "prop", "type", is_count=FALSE, rich = FALSE )
    sdat2 = aggregate_data( dat, "Y", "type", is_count=TRUE, rich= FALSE )
    sdat = merge( sdat, sdat2, by=c("month","N") )
    head( sdat )
    sdat$type = "all"

    d2 = dplyr::bind_rows( dat, sdat )
    d2 = tidyr::gather( d2, Y, N, prop, key="variable", value="outcome" )
    ggplot2::ggplot( d2, ggplot2::aes( month, outcome, col=type ) ) +
        ggplot2::facet_wrap( ~ variable , scales = "free_y" ) +
        ggplot2::geom_line() +
        ggplot2::geom_vline( xintercept=t0, col="red" )



    dat %>% dplyr::group_by( type ) %>% dplyr::summarise( N.bar = mean(N),
                                            Y.bar = mean(Y),
                                            prop.bar = mean(prop) )

}


#### Examining aggregation functions ####
if ( FALSE ) {
    head( dat )

    # Calculate how to weight the groups
    pis = calculate_group_weights( "type", dat, t0, max(dat$month) )
    pis



    # looking at rates
    head( dat )
    sdat = aggregate_data( dat, "prop", "type", is_count=FALSE )

    adjdat = adjust_data( dat, "prop", "type", pis )
    head( adjdat )
    adjdat = merge( adjdat, sdat, by=c("N","month"), all=TRUE )
    head( adjdat )

    d1 = gather( adjdat, starts_with( "pi" ), key="group", value="pi" )
    head( d1 )
    ggplot2::ggplot( d1, ggplot2::aes( month, pi, col=group ) ) +
        ggplot2::geom_line() +
        ggplot2::labs( title="Sizes of the groups")

    d2 = tidyr::gather( adjdat, starts_with( "prop" ), key="outcome", value="Y" )
    head( d2 )

    ggplot2::ggplot( d2, ggplot2::aes( d2$month, d2$Y, col=outcome ) ) +
        ggplot2::geom_line()

    # checking calculations
    head( adjdat )


    # Looking at counts
    sdat = aggregate_data( dat, "Y", "type", is_count=TRUE )
    head( sdat )

    adjdat = adjust_data( dat, "Y", "type", pis, is_count = TRUE )
    head( adjdat )
    d2 = tidyr::gather( adjdat, Y.adj, Y, starts_with( "type." ), key="outcome", value="Y" )
    head( d2 )

    ggplot2::ggplot( d2, ggplot2::aes( d2$month, d2$Y, col=outcome ) ) +
        ggplot2::geom_line()

}



#### Illustration of the easy modeling approach  ####



if ( FALSE ) {

    # fake, illustration data -- specifying the range of months
    t_min = -12*6.5
    t0 = 0
    t_max = 18

    dat = generate_fake_grouped_data( t_min, t0, t_max )
    head( dat )

    pis = calculate_group_weights( "type", dat, t0, max(dat$month) )
    pis


    ##
    ## The proportion as outcome
    ##
    adjdat = adjust_data( dat, "prop", "type", pis, include_aggregate=TRUE )
    head( adjdat )

    adjdat = add_lagged_covariates(adjdat, "prop.adj", c("A","B") )
    head( adjdat )

    # Modeling adjusted and not
    envelope.adj = process_outcome_model( "prop.adj", adjdat, t0=t0, R = 100, summarize = TRUE, smooth=FALSE )

    envelope = process_outcome_model( "prop", adjdat, t0=t0, R = 100, summarize = TRUE, smooth=FALSE )

    envelope.drug = process_outcome_model( "prop.drug", adjdat, t0=t0, R = 100, summarize = TRUE, smooth=FALSE )
    envelope.violent = process_outcome_model( "prop.violent", adjdat, t0=t0, R = 100, summarize = TRUE, smooth=FALSE )

    env = dplyr::bind_rows( raw=envelope, adjusted=envelope.adj, drug=envelope.drug, violent=envelope.violent, .id="model")
    head( env )
    plt <- ggplot2::ggplot( env, ggplot2::aes( month, col=model ) ) +
        ggplot2::geom_line( ggplot2::aes(y= env$Ystar), lty=2 ) +
        ggplot2::geom_line( ggplot2::aes(y=env$Y)) + ggplot2::geom_point( ggplot2::aes( y=env$Y ), size=0.5 ) +
        #geom_line( aes(y=Ysmooth1), lty=2 ) +
        ggplot2::geom_vline( xintercept=t0 )

    #plt

    plt +         facet_wrap( ~model )



    ##
    ## And with Y (counts)
    ##
    adjdat = adjust_data( dat, "Y", "type", pis, include_aggregate=TRUE, is_count = TRUE )
    head( adjdat )

    qplot( Y, Y.adj, data=adjdat )

    adjdat = add_lagged_covariates(adjdat, "Y.adj", c("A","B") )
    head( adjdat )

    # Modeling adjusted and not
    envelope.adj = process_outcome_model( "Y.adj", adjdat, t0=t0, R = 100, summarize = TRUE, smooth=FALSE )

    envelope = process_outcome_model( "Y", adjdat, t0=t0, R = 100, summarize = TRUE, smooth=FALSE )

    envelope.drug = process_outcome_model( "Y.drug", adjdat, t0=t0, R = 100, summarize = TRUE, smooth=FALSE )
    envelope.violent = process_outcome_model( "Y.violent", adjdat, t0=t0, R = 100, summarize = TRUE, smooth=FALSE )

    env = dplyr::bind_rows( raw=envelope, adjusted=envelope.adj, drug=envelope.drug, violent=envelope.violent, .id="model")
    head( env )
    plt <- ggplot2::ggplot( env, ggplot2::aes( month, col=model ) ) +
        ggplot2::geom_line( aes(y= env$Ystar), lty=2 ) +
        ggplot2::geom_line( aes(y= env$Y)) + ggplot2::geom_point( aes( y=env$Y ), size=0.5 ) +
        # ggplot2::geom_line( aes(y=Ysmooth1), lty=2 ) +
        ggplot2::geom_vline( xintercept=t0 )

    #plt

    plt +         ggplot2::facet_wrap( ~model )


}
Any scripts or data that you put into this service are public.
simITS documentation built on July 2, 2020, 4:10 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
simITS
Analysis via Simulation of Interrupted Time Series (ITS) Data

R/post_stratified_ITS.R
In simITS: Analysis via Simulation of Interrupted Time Series (ITS) Data

Defines functions generate_fake_grouped_data adjust_data aggregate_data calculate_group_weights

Documented in adjust_data aggregate_data calculate_group_weights generate_fake_grouped_data

Try the simITS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

simITS Analysis via Simulation of Interrupted Time Series (ITS) Data

R/post_stratified_ITS.R In simITS: Analysis via Simulation of Interrupted Time Series (ITS) Data

Defines functions generate_fake_grouped_data adjust_data aggregate_data calculate_group_weights

Documented in adjust_data aggregate_data calculate_group_weights generate_fake_grouped_data

Try the simITS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

simITS
Analysis via Simulation of Interrupted Time Series (ITS) Data

R/post_stratified_ITS.R
In simITS: Analysis via Simulation of Interrupted Time Series (ITS) Data