#' @title Descriptive within/between decomposition
#'
#' @description [...]
#'
#' @param y Dependent variable
#' @param group Grouping variable to decompose variance into within- and between-group components
#' @param time Time variable to analyze change over time
#' @param weights Probability weights
#' @param dat Dataframe
#' @param smoothDat Logical. Should data be smoothed?
#' @param ref Number or FALSE. Should values be reported in reference to a specific time?
#' @param long Logical. Should output be in long format?
#'
#' @return List of length 2. Element 1 returns the decomposition by group and time. Elements 2 returns the decomposition by time.
#'
#' @examples data(incdat)
#' wibe1 <- wibe(y="inc", group="SES", time="year", dat=dat1)
#'
#' @export wibe
#'
#' @author Benjamin Rosche <benjamin.rosche@@gmail.com>
#'
#' @details
#' ...
wibe <- function(y=NULL, group=NULL, time=NULL, weights=NULL, ref=F, long=F, dat) {
# dat <- data %>% dplyr::filter(n==1, bystart<=4); time <- "i.year5"; group <- "SES"; y <- "earnweekf"; weights=NULL; ref <- F; smoothDat <- F
# ============================================================================================== #
# Dissect input ----
# ============================================================================================== #
if(isTRUE(ref)) stop("ref must be numeric")
# Y
y <- dissectVar(y, cicheck="c")[[1]]
# Group
if(is.null(rlang::enexpr(group))) {
dat <- dat %>% dplyr::mutate(group=1)
group <- substitute(group)
}
group <- dissectVar(group, cicheck="i")[[1]]
# Time
if(is.null(rlang::enexpr(time))) {
dat <- dat %>% dplyr::mutate(time=1)
time <- substitute(time)
}
time <- dissectVar(time, cicheck="i")[[1]]
# Check whether variables are in the dataset
if(!as.character(y) %in% names(dat)) stop(paste0(y, " not in dataset"))
if(!as.character(group) %in% names(dat)) stop(paste0(group, " not in dataset"))
if(!as.character(time) %in% names(dat)) stop(paste0(time, " not in dataset"))
if(!is.null(weights)) if(!as.character(weights) %in% names(dat)) stop(paste0(weights, " not in dataset"))
# ============================================================================================== #
# Rename ----
# ============================================================================================== #
# Rename variables and filter NAs to avoid creating another grouping level
dat <-
dat %>%
dplyr::select({{ group }}, {{ time }}, {{ y }}, {{ weights }}) %>%
dplyr::rename(group:={{ group }}, time:={{ time }}, y:={{ y }}, w:={{ weights }}) %>%
tidyr::drop_na()
# Weights
if(is.null(weights)) dat <- dat %>% dplyr::mutate(w=1)
# Number of levels of group and time var
group_levels <- dat %>% .$group %>% unique() %>% sort()
time_levels <- dat %>% .$time %>% unique() %>% sort()
# ============================================================================================== #
# Create dat.between ----
# ============================================================================================== #
dat.between <-
tibble() %>%
tidyr::expand(time=time_levels, group=group_levels) %>%
left_join(
dat %>%
group_by(time, group) %>%
dplyr::summarise(
n=n(),
sw=sum(w),
mu=1/sw * sum(w * y, na.rm = T),
sigma2=n/(sw*(n-1)) * sum(w*(y-mu)^2),
sigma=sqrt(sigma2)
) %>%
ungroup() %>%
dplyr::mutate(
n=sw/sum(sw)*sum(n), # weighted n
CV2=sigma2/(mu^2) # squared CV
) %>%
dplyr::select(time, group, n, mu, sigma, sigma2, CV2),
by=c("time", "group"))
# Weights follow Stata's sum command:
# https://www.stata.com/support/faqs/statistics/weights-and-summary-statistics/
# ============================================================================================== #
# Create dat.total ----
# ============================================================================================== #
dat.total <-
dat.between %>%
group_by(time) %>%
dplyr::summarise(N=sum(n, na.rm = T),
gmu=sum(n*mu, na.rm = T)/N,
VarW=sum(n*sigma2, na.rm = T)/sum(n, na.rm = T),
VarB=sum(n*(mu-gmu)^2, na.rm = T)/sum(n, na.rm = T),
CV2W=sum((n/N)*(mu/gmu)^2*CV2, na.rm = T),
CV2B=(sum((n/N)*(((mu/gmu)^2)-1), na.rm = T))) %>%
ungroup() %>%
mutate(VarWBRatio=VarW/VarB, CV2WBRatio=CV2W/CV2B) %>%
# mutate(VarT=VarW+VarB, CV2T=CV2W+CV2B) %>%
inner_join(
dat %>%
group_by(time) %>%
dplyr::summarise(
VarT = var(y, na.rm = T),
CV2T = var(y, na.rm = T)/(mean(y, na.rm = T)^2)
) %>%
ungroup(), by="time")
# ============================================================================================== #
# Prepare output ----
# ============================================================================================== #
if(!isFALSE(ref)) {
# Relative to year == ref
dat.between <-
dat.between %>%
group_by(group) %>%
mutate(
mu1= mu[time==ref],
mu = (mu - mu1)/mu1*100+100,
sigma1 = sigma[time==ref],
sigma = (sigma - sigma1)/sigma1*100+100,
sigma21 = sigma2[time==ref],
sigma2 = (sigma2 - sigma21)/sigma21*100+100,
CV21 = CV2[time==ref],
CV2 = (CV2 - CV21)/CV21*100+100
) %>%
ungroup() %>%
dplyr::select(-mu1, -sigma1, -sigma21, -CV21)
dat.total <-
dat.total %>%
mutate(
VarB=(VarB-VarB[time==ref])/VarB[time==ref]*100+100,
VarW=(VarW-VarW[time==ref])/VarW[time==ref]*100+100,
VarT=(VarT-VarT[time==ref])/VarT[time==ref]*100+100,
VarWBRatio=(VarWBRatio-VarWBRatio[time==ref])/VarWBRatio[time==ref]*100+100,
CV2B=(CV2B-CV2B[time==ref])/CV2B[time==ref]*100+100,
CV2W=(CV2W-CV2W[time==ref])/CV2W[time==ref]*100+100,
CV2T=(CV2T-CV2T[time==ref])/CV2T[time==ref]*100+100,
CV2WBRatio=(CV2WBRatio-CV2WBRatio[time==ref])/CV2WBRatio[time==ref]*100+100
)
}
if(long==T) dat.between <- dat.between %>% tidyr::pivot_longer(cols=c(-time, -group), names_to = "variable", values_to = "value")
if(long==T) dat.total <- dat.total %>% tidyr::pivot_longer(cols=c(-time), names_to = "variable", values_to = "value")
# Rename variables back to their original names
dat.between <-
dat.between %>%
dplyr::rename(
!!enquo(time) := time,
!!enquo(group) := group
)
dat.total <-
dat.total %>%
dplyr::rename(
!!enquo(time) := time
)
return(list("By group and time"=dat.between, "By time"=dat.total))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.