#' Specification curve maker function
#'
#' @param dat a dataframe containing the variables
#' @param outcomes vector of outcome variables
#' @param treatment vector of treatment variables
#' @param cov.list named list, sets of covariates / moderators - write
#' moderators as "var1:var2"; eg: cov.list = list(gender = c("gender.roster",
#' "gender.selfreport")
#' @param no.cov.exclude Will add a model where each item of the list above is
#' missing, unless specified here
#' @param extra.models Models written literally; will be appended to the set
#' (still crossed against subsets and weights)
#' @param extra.treatment Treatment variable in extra models (length 1 or same
#' as other, needed if doing inverse probability weighting
#' @param mod.type takes lm, glm
#' @param mod.family if family needed
#' @param alpha nominal alpha value (if one tailed, will test against p <= .1
#' on that side)
#' @param tail One tailed test? Takes "upper" and "lower"
#' @param subsets runs subsets of data based on the specifications listed here
#' (vector of conditions)
#' @param subsets.exclude if subsets added, includes an un-subsetted version
#' @param weights vector of weight variables names to add to runs, re-named if
#' desired or, a weighting method (currenly accepts "ipw.calc" for inverse
#' probability weighting)
#' @param weights.exclude Includes an unweighted version where weights added
#' @param weights.ipw.vars in addition to treatment variable, any other (factor
#' or categorical) variables to consider when (re)weighting subsets
#' @param permutations optional, number of permutations for p-curve
#' @param perm.pvalues logical, calculates permutation test-based pvalues (if
#' permutation test active)
#' @param perm.seed optional, RNG seed to use for permutation test (integer,
#' will create one if not present)
#' @param cluster optional, Cluster robust standard errors
#' @param cluster.var optional, Variable on which to cluster, string
#' @param robust.se optional, heteroskedasticity-consistent standard error
#' adjustment provided by package "sandwich". See details in ?vcovHC.
#' @param cat.percent logical, displays summary output of % significant at the
#' end of the data for convenience in interactive mode. Set to false if using
#' as a part of an Rmarkdown file.
#' @param keep.tidy.models logical, set to false for large model samples
#' @param keep.full.models logical, set to false for large model samples
#' @param model.only logical, outputs the model for later use, rather than
#' running (default is FALSE)
#'
#' @return depending on paramters, either a fitted s-curve object or a template for fitting an s-curve object
#' @export
#'
#' @examples
s.curve <- function(dat, outcomes, treatment,
cov.list, no.cov.exclude = NULL,
extra.models = NULL,
extra.treatment = NULL,
mod.type = lm, mod.family = NULL,
alpha = .05, tail = NULL,
subsets = NULL, subsets.exclude = TRUE,
weights = NULL, weights.exclude = TRUE,
weights.ipw.vars = NULL,
permutations = NULL,
perm.pvalues = FALSE, perm.seed = NULL,
cluster = FALSE, cluster.var = NULL,
robust.se = NULL,
cat.percent = TRUE,
keep.tidy.models = TRUE,
keep.full.models = FALSE,
model.only = FALSE) {
## Type of model:
model.type <- deparse(substitute(mod.type))
## INITIALIZATION ----
s.curve.mod <-
list(
dat = dat,
treatment = treatment,
outcomes = outcomes,
alpha = alpha,
tail = tail,
mod.type = model.type,
mod.family = mod.family,
cluster = cluster,
cluster.var = cluster.var,
robust.se = robust.se,
permutations = permutations,
perm.pvalues = perm.pvalues,
perm.seed = perm.seed,
weights.ipw.vars = weights.ipw.vars,
keep.tidy.models = keep.tidy.models,
keep.full.models = keep.full.models
)
## (STEP 2 from readme.md) FORMULA BUILDING ----
## Create a vector of all included variables,
## outcomes, treatment variables, and covariates:
vars.list <-
unique(c(outcomes, treatment, unlist(cov.list)))
## .(2a & 2b) cross treatment, outcomes, & covariates ----
## Add NA's to beginning of each category to model exclusion of that
## set from specification, excepting any categories marked for
## constant inclusion:
cov.list.na <-
sapply(
names(cov.list),
function(x){
if(x %in% no.cov.exclude){
cov.list[[x]]
} else {
c(NA, cov.list[[x]])
}
},
USE.NAMES = TRUE,
simplify = FALSE
)
## Add in condition variables for right-hand side
var.list.rhs <-
c(list(treatment = treatment), cov.list.na)
## Add in outcome variables
var.list.all <-
c(list(outcomes = outcomes), var.list.rhs)
## Expand to a data frame where each case
## elements to go in a formula on right-hand side
mod.grid <-
expand.grid(var.list.rhs,
stringsAsFactors = FALSE)
## elements to go in all of them:
mod.grid.all <-
expand.grid(var.list.all,
stringsAsFactors = FALSE)
## .(2c) construct formulas ----
## Make right-hand side formula
formulas.rhs <-
apply(mod.grid, 1, function(x){
## dropping NA's (when category excluded):
formula.rhs <- paste(na.omit(x), collapse= " + ")
## adding "1" to empty formula (to avoid errors):
formula.rhs[formula.rhs == ""] <- 1
formula.rhs
})
## Probably don't want empty models, so throw warning:
if( any(formulas.rhs == "1") ) {
warning(
"NOTE: set includes one or more null models."
)}
## Adds left-hand side (outcomes), make full formulas:
s.curve.mod$formulas <-
lapply(
as.vector(outer(outcomes, formulas.rhs,
paste, sep=" ~ ")),
as.formula)
## .(2d) extra models ----
## Have a vector of treatment variables for use later:
treatment.vars <-
mod.grid.all$treatment
## Add in any extra models, if applicable:
if(!is.null(extra.models)){
s.curve.mod$formulas <-
c(s.curve.mod$formulas,
lapply(extra.models, as.formula)
)
if (is.null(extra.treatment)){
if (length(treatment) == 1){
## fill it in with the presumed treatment variable
## (to be updates later for more robust checking):
treatment.vars <-
c(treatment.vars, rep(treatment, length(extra.models)))
} else {
stop("need to specify treatment variables for extra models if more than 1")
}
} else {
treatment.vars <- c(treatment.vars, extra.treament)
}
}
## Converts formula names to character vector for easier review later:
s.curve.mod$formula.names <-
sapply(s.curve.mod$formulas, deparse, width.cutoff = 500)
## Length of formula list (# initial specifications):
s.curve.mod$n.formulas <-
length(s.curve.mod$formulas)
## Initialize names of everything:
full.names <-
s.curve.mod$formula.names
## .(2e) subsets & weightings ----
## ..........subsets ----
## Make the data subsets:
if (!is.null(subsets)){
subsettings <-
do.call(
cbind.data.frame,
lapply(subsets, function(singlesubset){
with(dat, eval(parse(text = singlesubset)))
}))
## Add prefix to subset for identification:
subset.names <-
paste0("SUBSET: ", subsets)
names(subsettings) <-
subset.names
## Add a no-subset "subset" if that's relevant
if (subsets.exclude) {
subsettings <-
cbind.data.frame(
setNames(data.frame(TRUE), "SUBSET: ALL DATA"),
subsettings
)
subset.names <-
names(subsettings)
}
## Update the full names
full.names <-
as.vector(outer(full.names, subset.names,
paste, sep=", "))
## Write to model for curveRunner use:
s.curve.mod$n.subsettings <-
length(subset.names)
s.curve.mod$subsettings <-
subsettings
s.curve.mod$subset.names <-
subset.names
} else {
## Just have a non-subsetting subset:
subsettings <- setNames(NA, NA)
}
## ..........weights ----
## Make the data weights:
if (!is.null(weights)){
## Version if single weights vector is provided:
if (is.numeric(weights)){
## Throw error if weights is incorrect length:
if (length(weights) != nrow(dat)) stop("Weighting variable must be length of data")
## name weights specification component:
weights.names <-
"WEIGHT: WEIGHTED"
## Make a data frame of weights
weightings <-
data.frame(`WEIGHT: WEIGHTED` = weights)
} ## End numeric approach
## Version if weights are provided in list vector form:
if (is.list(weights)){
## Throw error if weights is incorrect length or not numeric:
if (
any(vapply(weights, length, 1) != nrow(dat)) ||
any(vapply(weights, class, "class") != "numeric")
) stop("List of weights must all be numeric vectors equal to length of data.")
## Make a vector of names for the weights
if(!is.null(names(weights))){
## use provided names if available:
weights.names <- paste0("WEIGHT: ", names(weights))
} else {
## otherwise make a simple sequence:
weights.names <- paste0("WEIGHT: WEIGHT", seq_along(weights))
}
## Make a data frame of weights
weightings <-
as.data.frame(weights)
## apply the names:
names(weightings) <-
weights.names
} ## End list vector approach
## Version if names are provided (in character form):
if (is.character(weights) && weights != "ipw.calc"){
## Make a vector of names for the weights
if(!is.null(names(weights))){
## use provided names if available:
weights.names <- paste0("WEIGHT: ", names(weights))
} else {
## or just use the variable names from the dataframe:
weights.names <- paste0("WEIGHT: ", weights)
}
## Assign the weightings to a new dataframe:
weightings <-
do.call(
cbind.data.frame,
lapply(weights, function(wset){
dat[wset]
}))
## apply the names:
names(weightings) <- weights.names
} ## End character variable approach
## Version if "ipw.calc" is called:
if (weights == "ipw.calc") {
weights.names <- paste0("WEIGHT: IPW (calc)")
weightings <-
setNames(
data.frame(rep(1, nrow(dat))),
weights.names
)
} ## End dynamic calculation approach (actual reweights handled in
## curveRunner below)
## Add extra column if weights.exclude == TRUE
if (weights.exclude) {
weightings <-
cbind.data.frame(
setNames(data.frame(1), "WEIGHT: NONE"),
weightings
)
weights.names <- names(weightings)
}
## Construct new specification names based on the weights:
full.names <-
as.vector(
outer(full.names, weights.names,
paste, sep=", ")
)
## Write to model for curveRunner use:
s.curve.mod$weightings <- weightings
s.curve.mod$weights.names <- weights.names
s.curve.mod$n.weightings <- length(weights.names)
} else {
## If no weights, just make a NA variable to indicate:
weightings <- setNames(NA, NA)
}
## CREATE SPECIFICATION INDEX LIST ----
## total number of specifications after variables and weightings:
s.curve.mod$n.specifications <- length(full.names)
## Create a final specification list:
s.curve.mod$spec.list <-
setNames(
expand.grid(
s.curve.mod$formulas,
names(weightings),
names(subsettings),
stringsAsFactors = FALSE
),
c("formula", "weights", "subset")
)
## Add in a treatment variable for the index (used for inverse probability weighting)
s.curve.mod$spec.list$treatment <- treatment.vars
## Give each specification a numeric index:
s.curve.mod$spec.list$specification.index <-
seq_len(nrow(s.curve.mod$spec.list))
## Just output the s-curve overall specification, if requested:
if(model.only) return(s.curve.mod)
## (STEP 3) RUN MAIN SPECIFICATION CURVE --------------------------------
s.curve.mod$results <-
curveRunner(s.curve.mod)
## (STEP 4) POST-RUN PROCESSING ------------------------------------------
## .(4a) duplicate handling -------
## Check for duplicated models, note positions:
duplicates <-
which(
duplicated(
lapply(s.curve.mod$results$final.specification,
## Moves around terms so that duplicates aren't missed due to
## R-style formatting:
function(x){
str_split(x, ":") %>%
lapply(sort) %>%
vapply(function(x){paste0(x, collapse = ":")}, FUN.VALUE = "char") %>%
sort
})
))
## Set a duplicate flag:
s.curve.mod$has.duplicates <- FALSE
## Make note of duplicates, if existing, and update:
if(length(duplicates) > 0){
s.curve.mod$has.duplicates <- TRUE
## Moves results of any duplicated models to separate location:
s.curve.mod$duplicated.models <-
s.curve.mod$results[duplicates,]
## Moves duplicate specifications from specification list:
s.curve.mod$duplicate.specs <-
s.curve.mod$spec.list[duplicates,]
## Drops duplicates from results:
s.curve.mod$results <-
s.curve.mod$results[-duplicates,]
## Drops duplicates from specification list:
s.curve.mod$spec.list <-
s.curve.mod$spec.list[-duplicates,]
## Records the total number of speficiations:
s.curve.mod$n.specifications.original <-
s.curve.mod$n.specifications
## Updates the actual number of remaining specifications:
s.curve.mod$n.specifications <-
nrow(s.curve.mod$results)
## Print message: report on identified duplicates:
message("Duplicate models dropped, ",
s.curve.mod$n.specifications.original,
" now ",
s.curve.mod$n.specifications,
" (less ",
s.curve.mod$n.specifications.original -
s.curve.mod$n.specifications,
")")
}
## (pemutation test-not used in Bryan, Yeager, O'Brien) ----
if(is.numeric(permutations)){
# Set the RNG seed:
if(!is.null(s.curve.mod$perm.seed)){
# use specified seed if available:
set.seed(s.curve.mod$perm.seed)
} else {
# Otherwise generate one and record it:
s.curve.mod$perm.seed <- sample(1:1000, 1)
set.seed(s.curve.mod$perm.seed)
}
## Save a nested list of individual data frames
## like the one we just made above:
perm.dat <- dat
replicate(
## number of permutations
permutations,
{
## shuffle all treatment vars
## Same ordering for all variables in a particular
## iteration
new.order <- sample(nrow(perm.dat))
perm.dat[treatment] <-
lapply(
treatment,
function(tvar){
perm.dat[[tvar]][new.order]
})
## run model for that permutation:
curveRunner(s.curve.mod,
inc.full.models = FALSE,
inc.tidy.models = FALSE)
},
simplify=FALSE)
} #
## .(4b & 4c) Model results and output ----
## Calls s-curve update to provide final output:
s.curve.update(
s.curve.mod = s.curve.mod,
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.