SpecCurve: Constructing, Running, and Visualizing Specification Curves

Documented in curveRunner

# curveRunner - processes s-curve objects ---------------------------------




#' curveRunner - processes s-curve objects
#'
#' @param s.curve.model Specification curve model (generated by s.curve function)
#' @param inc.tidy.models logical, should results include tidy model output for each specification (from package broom)? If specified, will override default passed from s.curve.model object (e.g., for simpler output in permutation testing)
#' @param inc.full.models logical, should results include default model output for each specification (e.g., from function lm)? If specified, will override default passed from s.curve.model object (e.g., for simpler output in permutation testing)
#' @param perm.index index value of permutation testing.  If present, index column will be added to results.
#'
#' @return data frame, with rows containing model estimation results for each individual model specification in overall s-curve model
#' @export
#' @importFrom dplyr bind_rows
#' @importFrom broom tidy
#' @importFrom broom glance
#' @importFrom lmtest coeftest
#' @importFrom sandwich vcovHC
#'
#' @examples
curveRunner <-
  function(
    s.curve.model,
    inc.tidy.models = NULL,
    inc.full.models = NULL,
    perm.index = NULL
  ) {

    ## pull out a few things for convenience:
    dat <- s.curve.model$dat
    spec.list <- s.curve.model$spec.list
    subsettings <- s.curve.model$subsettings
    weightings <- s.curve.model$weightings

    if(is.null(inc.tidy.models)){
      inc.tidy.models <-
        s.curve.model$keep.tidy.models
    }

    if(is.null(inc.full.models)){
      inc.full.models <-
        s.curve.model$keep.full.models
    }


    # (STEP 3 from readme.md) RUN S-CURVE AND PROCESS -------------------------

    # .(3a) iterated model fitting ----

    ## Iterate through constructing and running the models described
    ## in spec.list:
    results.list <-
      lapply(seq_len(nrow(spec.list)),
             function(ind){

               ## .... initialization ----

               ## Initialize output:
               results.row <-
                 data.frame(
                   ## Index relating results back to the specification
                   ## list previously generated:
                   specification.no = spec.list$specification.index[ind],
                   ## Quick record of the model type used (lm, glm,
                   ## family if relevant, robustness approach)
                   model.type =
                     paste0(s.curve.model$mod.type, "-",
                            s.curve.model$mod.family, "-",
                            s.curve.model$robust.se),
                   stringsAsFactors = FALSE
                 )

               ## Write the specific model formula used into results:
               results.row$formulas <-
                 as.character(spec.list[["formula"]][ind])

               ## Initialize model parameters to apply to run:
               params <-
                 list(
                   ## The data:
                   data = dat,
                   ## The formulas:
                   formula = spec.list[["formula"]][[ind]]
                 )

               ## Initialize final model report
               ## This contains the elements of each specific run
               ## (for duplicate checking):
               final.model <- c()

               ## Get the weight and subset names for this run:
               weights.name <- spec.list[ind, "weights"]
               subset.name <- spec.list[ind, "subset"]

               ## ... record subsetting  ----
               if (!is.na(subset.name)){
                 ## Establish the subset:
                 params$data <- dat[subsettings[[subset.name]],]
                 ## note it in the output:
                 results.row$subset.used <- subset.name

                 ## Update the final model report:
                 final.model <- c(final.model, subset.name)
               }


               ## .... apply/record weighting ----

               ## If weightings used:
               if (!is.na(weights.name)){

                 ## Recalculate inverse probability weighting:
                 if(weights.name == "WEIGHT: IPW (calc)"){

                   ## Get the variables to recalculate the inverse
                   ## probability weighting on:


                   ipw.reweight.vars <-
                     c(spec.list[["treatment"]][[ind]], ## The
                       ## specific
                       ## treatment
                       ## variable

                       s.curve.model$weights.ipw.vars   ## Any
                       ## additional
                       ## variables
                     )

                   ## Recalculate weights:

                   margin.config <-
                     ## Variable to indicate how to calculate margins for IPWs:
                     ifelse(length(ipw.reweight.vars) == 1,
                            0,
                            2:length(ipw.reweight.vars))

                   ## Calculate a new weight table:
                   new.weight.table <-
                     1/prop.table(
                       do.call(table, params$data[ipw.reweight.vars]),
                       margin = margin.config)



                   ## Iterate through the actual values to collect weights:
                   params$weights <-
                     vapply(seq_len(nrow(params$data)),
                            function(i){
                              do.call(`[`,
                                      c(
                                        list(new.weight.table),
                                        lapply(params$data[ipw.reweight.vars],
                                               function(x){as.character(x)[i]})
                                      ))}, 1)

                 } else {

                   ## Use specific weight, subsetted:
                   params$weights <- weightings[subsettings[[subset.name]],
                                                weights.name]

                 }

                 ## note it in the output:
                 results.row$weights.used <- weights.name

                 ## Update the final model report:
                 final.model <- c(final.model, weights.name)
               }

               ## Generalized linear model:
               if(s.curve.model$mod.type == "glm"){
                 params$family <- s.curve.model$mod.family
               }

               ## Generalized multilevel model:
               if(s.curve.model$mod.type %in% c("glmer", "glmmTMB")){
                 params$family <- s.curve.model$mod.family
               }

               ## Set up function to match one specified:
               FUN <- match.fun(s.curve.model$mod.type)


               ## ....model runs ----

               ## Run model:
               model.run <-
                 do.call(FUN, params)

               ## Extract model features:
               model.glance <-
                 glance(model.run)

               ## Cluster models if requested:
               if(s.curve.model$cluster) {

                 model.run.cluster <-
                   cl(dat = f.data, fm = model.run,
                      cluster = f.data[[cluster.var]])

                 ## Get tidy model output:
                 model.tidy <-
                   tidy(model.run.cluster)


                 ## Robust standard errors, if requested:
               }

               if(!is.null(s.curve.model$robust.se)) {

                 ## Do standard error adjustment as called for by robust.se:
                 model.run.robust <-
                   coeftest(model.run,
                            vcov = vcovHC(model.run,
                                          type=s.curve.model$robust.se)
                   )

                 ## Get tidy model output:
                 model.tidy <-
                   tidy(model.run.robust)


                 ## If neither of these
               } else {

                 ## Gets tidied output (data frame) for concatenation:
                 model.tidy <-
                   tidy(model.run)

               }

               ## .(3b) organize results of each run ----

               ## Get info from tidy output to insert into main results:

               ## overall treatment effect:
               results.row[c("estimate", "std.error",
                             "statistic", "p.value")] <-
                 model.tidy[model.tidy$term %in% s.curve.model$treatment,
                            c("estimate", "std.error",
                              "statistic", "p.value")]


               ## Assign a logical significance variable:
               results.row$significant <-
                 if (is.null(s.curve.model$tail)){
                   results.row$p.value <= s.curve.model$alpha
                 } else if (s.curve.model$tail == "upper") {
                   results.row$p.value <= 2*s.curve.model$alpha &&
                     results.row$estimate > 0
                 } else {
                   results.row$p.value <= 2*s.curve.model$alpha &&
                     results.row$estimate < 0
                 }

               ## Bind in model characteristics from glance()
               results.row <- cbind.data.frame(results.row, model.glance)

               ## Record the number of cases used in each model
               ## (in case something was dropped due to missingness):
               results.row["n"] <- nobs(model.run)

               if(s.curve.model$mod.type %in% c("lmer", "glmer", "glmmTMB")){
                 #get a random effects only model:
                 model.tidy.randef <-
                   model.tidy[model.tidy$effect == "ran_pars",]

                 model.tidy.fixef <-
                   model.tidy[model.tidy$effect == "fixed",]

                 ## Add variables indicating estimates for each variable:
                 results.row[paste0(model.tidy.fixef$term, ".est")] <-
                   as.list(model.tidy.fixef$estimate)

                 ## Variables indicating SE's for each included variable:
                 results.row[paste0(model.tidy.fixef$term, ".se")] <-
                   as.list(model.tidy.fixef$std.error)

                 ## Variables indicating p.vals for each included variable:
                 results.row[paste0(model.tidy.fixef$term, ".pval")] <-
                   as.list(model.tidy.fixef$p.value)

                 ## Variables for random effects terms:
                 results.row[paste0(model.tidy.randef$group, "_", model.tidy.randef$term, ".est")] <-
                   as.list(model.tidy.randef$estimate)

                 ## Create a record of included variables:
                 model.tidy.include <-
                   ## Drops any variables that are not estimated (e.g.,
                   ## rank-deficient), and intercept term:
                   model.tidy[!is.na(model.tidy$estimate),][-1,]

                 final.variables <-
                   ifelse(
                     # If it's a fixed effect:
                     is.na(model.tidy.include$group),
                     # Just add the fixed effect term:
                     model.tidy.include$term,
                     # But if random construct that and add it:
                     paste0(model.tidy.include$group, "_", model.tidy.include$term)
                   )


               } else {

                 ## Version for LM and GLM models:

                 ## Add variables indicating estimates for each variable:
                 results.row[paste0(model.tidy$term, ".est")] <-
                   as.list(model.tidy$estimate)

                 ## Variables indicating SE's for each included variable:
                 results.row[paste0(model.tidy$term, ".se")] <-
                   as.list(model.tidy$std.error)

                 ## Variables indicating p.vals for each included variable:
                 results.row[paste0(model.tidy$term, ".pval")] <-
                   as.list(model.tidy$p.value)

               ## Create a record of included variables:
               final.variables <-
                 ## Drops any variables that are not estimated (e.g.,
                 ## rank-deficient):
                 model.tidy$term[!is.na(model.tidy$estimate)][-1]
               }

               ## Update the final model report:
               final.model <- c(final.model, final.variables)

               ## Save the variables in the output for review (as a list):
               results.row$final.variables <-
                 list(final.variables)

               ## Save the overall specification in the output for
               ## review (as list):
               results.row$final.specification <-
                 list(final.model)

               ## Add in the full tidy model output if requested (as list):
               if(inc.tidy.models) {
                 results.row$tidy.models <- list(model.tidy)
                 results.row$glance.models <- list(model.glance)
               }

               ## Add in the full model output if requested (as list,
               ## gets big fast):
               if(inc.full.models) {
                 results.row$full.models <- list(model.run)
               }

               ## Output the results from this particular run:
               results.row

             })

    # .(3c) compile into dataframe of results ----
    ## Bind it up together as a single data.frame
    ##(requires dplyr for bind_rows, as rbind is less flexible)
    results.df <-
      do.call(results.list, what = bind_rows)


    ## If permutation run,
    ## mark which permutation this data is:
    if(!is.null(perm.index)){
      results.df$permutation.index <- perm.index
    }

    ## Output the full data frame of the fit model
    results.df

  }