mutual: Entropy-Based Segregation Indices

Documented in mutual_local mutual_total mutual_total_nested mutual_within

#' @import data.table
mutual_total_compute <- function(data, group, unit, base) {
    n_total <- sum(data[, freq])
    data[, n_unit := sum(freq), by = unit]
    data[, n_group := sum(freq), by = group]
    data[, `:=`(p = freq / n_total, p_exp = n_group * n_unit / (n_total^2))]

    # calculate M
    M <- data[p > 0, sum(p * log(p / p_exp, base = base))]

    # calculate H
    p <- data[, list(p = first(n_group / n_total)), by = group][["p"]]
    entropy_group <- sum(p * logf(1 / p, base))
    H <- M / entropy_group

    data.table(
        stat = c("M", "H"), est = c(M, H),
        stringsAsFactors = FALSE
    )
}

#' @import data.table
mutual_total_within_compute <- function(data, group, unit, within, base,
                                        components = FALSE) {
    if (components == FALSE) {
        # this uses the equation M(A; B | C) = M(A; B, C) - M(A; C)
        total <- mutual_total_compute(data, group, c(within, unit), base)
        within_data <- data[, list(freq = sum(freq)), by = c(group, within)]
        between <- mutual_total_compute(within_data, group, within, base)
        res <- data.table(
            stat = c("M", "H"),
            est = c(
                total[[1, "est"]] - between[[1, "est"]],
                total[[2, "est"]] - between[[2, "est"]]
            ),
            stringsAsFactors = FALSE
        )
        return(res)
    }

    # calculate totals
    n_total <- sum(data[, "freq"])
    n_within <- data[, list(n_within_group = sum(freq)), by = c(within, group)]
    n_within[, `:=`(n_within = sum(n_within_group)), by = within]
    # calculate proportions and entropy for each within unit
    n_within[, `:=`(p_within = n_within / n_total, p = n_within_group / n_within)]
    entropy <- n_within[, list(entropyw = sum(p * logf(1 / p, base)), p_within = first(p_within)),
        by = within
    ]
    setkeyv(entropy, within)

    # calculate totals
    data[, `:=`(n_total, sum(freq)), by = within]
    data[, `:=`(n_unit, sum(freq)), by = c(within, unit)]
    data[, `:=`(p_unit = n_unit / n_total, p_group_g_unit = freq / n_unit)]

    # calculate entropy within groups
    by_unit <- data[, list(
        p_unit = first(p_unit),
        entropy_cond = sum(p_group_g_unit * logf(1 / p_group_g_unit, base))
    ),
    by = c(within, unit)
    ]
    setkeyv(by_unit, within)

    # compute total entropy
    p <- data[, list(p = sum(freq)), by = group][["p"]] / n_total
    entropy_overall <- sum(p * logf(1 / p, base))

    # merge within entropy, and compare to group entropy
    by_unit <- merge(by_unit, entropy)
    by_within <- by_unit[, list(
        M = sum(p_unit * (entropyw - entropy_cond)),
        p = first(p_within),
        H = sum(p_unit * (entropyw - entropy_cond)) / first(entropyw),
        ent_ratio = first(entropyw) / entropy_overall
    ), by = within]
    by_within$H <- ifelse(is.finite(by_within$H), by_within$H, 0)

    melt(by_within,
        id.vars = within, measure.vars = c("M", "p", "H", "ent_ratio"),
        variable.name = "stat", value.name = "est",
        variable.factor = FALSE
    )
}

#' Calculates the Mutual Information Index M and Theil's Entropy Index H
#'
#' Returns the total segregation between \code{group} and \code{unit}.
#' If \code{within} is given, calculates segregation within each
#' \code{within} category separately, and takes the weighted average.
#' Also see \code{\link{mutual_within}} for detailed within calculations.
#'
#' @param data A data frame.
#' @param group A categorical variable or a vector of variables
#'   contained in \code{data}. Defines the first dimension
#'   over which segregation is computed.
#' @param unit A categorical variable or a vector of variables
#'   contained in \code{data}. Defines the second dimension
#'   over which segregation is computed.
#' @param within A categorical variable or a vector of variables
#'   contained in \code{data}. The variable(s) should be a superset of either
#'   the \code{unit} or the \code{group} for the calculation to be meaningful.
#'   If provided, segregation is
#'   computed within the groups defined by the variable, and then averaged.
#'   (Default \code{NULL})
#' @param weight Numeric. (Default \code{NULL})
#' @param se If \code{TRUE}, the segregation estimates are bootstrapped to provide
#'   standard errors and to apply bias correction. The bias that is reported
#'   has already been applied to the estimates (i.e. the reported estimates are "debiased")
#'   (Default \code{FALSE})
#' @param CI If \code{se = TRUE}, compute the confidence (CI*100)% confidence interval
#'   in addition to the bootstrap standard error.
#'   This is based on percentiles of the bootstrap distribution, and a valid interpretation
#'   relies on a larger number of bootstrap iterations. (Default \code{0.95})
#' @param n_bootstrap Number of bootstrap iterations. (Default \code{100})
#' @param base Base of the logarithm that is used in the calculation.
#'   Defaults to the natural logarithm.
#' @return Returns a data.table with two rows. The column \code{est} contains
#'   the Mutual Information Index, M, and Theil's Entropy Index, H. The H is the
#'   the M divided by the \code{group} entropy. If \code{within} was given,
#'   M and H are weighted averages of the within-category segregation scores.
#'   If \code{se} is set to \code{TRUE}, an additional column \code{se} contains
#'   the associated bootstrapped standard errors, an additional column \code{CI} contains
#'   the estimate confidence interval as a list column, an additional column \code{bias} contains
#'   the estimated bias, and the column \code{est} contains the bias-corrected estimates.
#' @references
#' Henri Theil. 1971. Principles of Econometrics. New York: Wiley.
#'
#' Ricardo Mora and Javier Ruiz-Castillo. 2011.
#'      "Entropy-based Segregation Indices". Sociological Methodology 41(1): 159–194.
#' @examples
#' # calculate school racial segregation
#' mutual_total(schools00, "school", "race", weight = "n") # M => .425
#'
#' # note that the definition of groups and units is arbitrary
#' mutual_total(schools00, "race", "school", weight = "n") # M => .425
#'
#' # if groups or units are defined by a combination of variables,
#' # vectors of variable names can be provided -
#' # here there is no difference, because schools
#' # are nested within districts
#' mutual_total(schools00, "race", c("district", "school"),
#'     weight = "n"
#' ) # M => .424
#'
#' # estimate standard errors and 95% CI for M and H
#' \dontrun{
#' mutual_total(schools00, "race", "school",
#'     weight = "n",
#'     se = TRUE, n_bootstrap = 1000
#' )
#'
#' # estimate segregation within school districts
#' mutual_total(schools00, "race", "school",
#'     within = "district", weight = "n"
#' ) # M => .087
#'
#' # estimate between-district racial segregation
#' mutual_total(schools00, "race", "district", weight = "n") # M => .338
#' # note that the sum of within-district and between-district
#' # segregation equals total school-race segregation;
#' # here, most segregation is between school districts
#' }
#' @import data.table
#' @export
mutual_total <- function(data, group, unit, within = NULL, weight = NULL,
                         se = FALSE, CI = 0.95, n_bootstrap = 100, base = exp(1)) {
    stopifnot(CI > 0 & CI < 1)
    d <- prepare_data(data, group, unit, weight, within)

    if (is.null(within)) {
        ret <- mutual_total_compute(d, group, unit, base)
    } else {
        ret <- mutual_total_within_compute(d, group, unit, within, base)
    }

    if (se == TRUE) {
        vars <- attr(d, "vars")
        n_total <- sum(d[["freq"]])

        if (all.equal(n_total, round(n_total)) == TRUE) {
            message(paste0(n_bootstrap, " bootstrap iterations on ", n_total, " observations"))
        } else {
            stop(paste0(
                "bootstrap with a total sample size that is not an integer is not allowed, ",
                "maybe scale your weights?"
            ))
        }
        # draw from a multinomial with weights specified by the cell counts
        draws <- stats::rmultinom(n_bootstrap, n_total, d[["freq"]] / n_total)

        boot_ret <- lapply(seq_len(n_bootstrap), function(i) {
            if (i %% 5 == 0) update_log(bs_n = i, bs_max = n_bootstrap)
            d[, freq := as.double(draws[, i])]

            if (is.null(within)) {
                mutual_total_compute(d[freq > 0], group, unit, base)
            } else {
                mutual_total_within_compute(d[freq > 0], group, unit, within, base)
            }
        })
        close_log()
        boot_ret <- rbindlist(boot_ret)
        ret <- bootstrap_summary(ret, boot_ret, "stat", CI)
        setattr(ret, "bootstrap", boot_ret)
    }
    ret
}

#' Calculates detailed within-category segregation scores for M and H
#'
#' Calculates the segregation between \code{group} and \code{unit}
#' within each category defined by \code{within}.
#'
#' @param data A data frame.
#' @param group A categorical variable or a vector of variables
#'   contained in \code{data}. Defines the first dimension
#'   over which segregation is computed.
#' @param unit A categorical variable or a vector of variables
#'   contained in \code{data}. Defines the second dimension
#'   over which segregation is computed.
#' @param within A categorical variable or a vector of variables
#'   contained in \code{data} that defines the within-segregation categories.
#' @param weight Numeric. (Default \code{NULL})
#' @param se If \code{TRUE}, the segregation estimates are bootstrapped to provide
#'   standard errors and to apply bias correction. The bias that is reported
#'   has already been applied to the estimates (i.e. the reported estimates are "debiased")
#'   (Default \code{FALSE})
#' @param CI If \code{se = TRUE}, compute the confidence (CI*100)% confidence interval
#'   in addition to the bootstrap standard error.
#'   This is based on percentiles of the bootstrap distribution, and a valid interpretation
#'   relies on a larger number of bootstrap iterations. (Default \code{0.95})
#' @param n_bootstrap Number of bootstrap iterations. (Default \code{100})
#' @param base Base of the logarithm that is used in the calculation.
#'   Defaults to the natural logarithm.
#' @param wide Returns a wide dataframe instead of a long dataframe.
#'   (Default \code{FALSE})
#' @return Returns a data.table with four rows for each category defined by \code{within}.
#'   The column \code{est} contains four statistics that
#'   are provided for each unit:
#'   \code{M} is the within-category M, and \code{p} is the proportion of the category.
#'   Multiplying \code{M} and \code{p} gives the contribution of each within-category
#'   towards the total M.
#'   \code{H} is the within-category H, and \code{ent_ratio} provides the entropy ratio,
#'   defined as \code{EW/E}, where \code{EW} is the within-category entropy,
#'   and \code{E} is the overall entropy.
#'   Multiplying \code{H}, \code{p}, and \code{ent_ratio} gives the contribution of each within-category
#'   towards the total H.
#'   If \code{se} is set to \code{TRUE}, an additional column \code{se} contains
#'   the associated bootstrapped standard errors, an additional column \code{CI} contains
#'   the estimate confidence interval as a list column, an additional column \code{bias} contains
#'   the estimated bias, and the column \code{est} contains the bias-corrected estimates.
#'   If \code{wide} is set to \code{TRUE}, returns instead a wide dataframe, with one
#'   row for each \code{within} category, and the associated statistics in separate columns.
#' @references
#' Henri Theil. 1971. Principles of Econometrics. New York: Wiley.
#'
#' Ricardo Mora and Javier Ruiz-Castillo. 2011.
#'      "Entropy-based Segregation Indices". Sociological Methodology 41(1): 159–194.
#' @examples
#' \dontrun{
#' (within <- mutual_within(schools00, "race", "school",
#'     within = "state",
#'     weight = "n", wide = TRUE
#' ))
#' # the M for state "A" is .409
#' # manual calculation
#' schools_A <- schools00[schools00$state == "A", ]
#' mutual_total(schools_A, "race", "school", weight = "n") # M => .409
#'
#' # to recover the within M and H from the output, multiply
#' # p * M and p * ent_ratio * H, respectively
#' sum(within$p * within$M) # => .326
#' sum(within$p * within$ent_ratio * within$H) # => .321
#' # compare with:
#' mutual_total(schools00, "race", "school", within = "state", weight = "n")
#' }
#' @import data.table
#' @export
mutual_within <- function(data, group, unit, within,
                          weight = NULL, se = FALSE, CI = 0.95, n_bootstrap = 100, base = exp(1),
                          wide = FALSE) {
    stopifnot(CI > 0 & CI < 1)
    d <- prepare_data(data, group, unit, weight, within)

    ret <- mutual_total_within_compute(d, group, unit, within, base, components = TRUE)

    if (se == TRUE) {
        vars <- attr(d, "vars")
        n_total <- sum(d[, "freq"])

        if (all.equal(n_total, round(n_total)) == TRUE) {
            message(paste0(n_bootstrap, " bootstrap iterations on ", n_total, " observations"))
        } else {
            stop(paste0(
                "bootstrap with a total sample size that is not an integer is not allowed, ",
                "maybe scale your weights?"
            ))
        }

        # draw from a multinomial with weights specified by the cell counts
        draws <- stats::rmultinom(n_bootstrap, n_total, d[["freq"]] / n_total)

        boot_ret <- lapply(seq_len(n_bootstrap), function(i) {
            if (i %% 5 == 0) update_log(bs_n = i, bs_max = n_bootstrap)
            d[, freq := as.double(draws[, i])]
            mutual_total_within_compute(d[freq > 0], group, unit, within, base, components = TRUE)
        })
        close_log()
        boot_ret <- rbindlist(boot_ret)
        ret <- bootstrap_summary(ret, boot_ret, c(within, "stat"), CI)
        setattr(ret, "bootstrap", boot_ret)
    }

    if (wide == TRUE) {
        f <- stats::as.formula(paste(
            paste(within, collapse = "+"),
            "~ factor(stat, levels=c('M', 'p', 'H', 'ent_ratio'))"
        ))
        if (se == TRUE) {
            ret <- dcast(ret, f, value.var = c("est", "se", "CI", "bias"))
            names(ret) <- c(
                within,
                "M", "p", "H", "ent_ratio",
                "M_se", "p_se", "H_se", "ent_ratio_se",
                "M_CI", "p_CI", "H_CI", "ent_ratio_CI",
                "M_bias", "p_bias", "H_bias", "ent_ratio_bias"
            )
            setcolorder(ret, c(
                within,
                "M", "M_se", "M_CI", "M_bias",
                "p", "p_se", "p_CI", "p_bias",
                "H", "H_se", "H_CI", "H_bias",
                "ent_ratio", "ent_ratio_se", "ent_ratio_CI", "ent_ratio_bias"
            ))
            setattr(ret, "bootstrap", boot_ret)
        } else {
            ret <- dcast(ret, f, value.var = c("est"))
        }
    }

    ret
}

#' @import data.table
mutual_local_compute <- function(data, group, unit, base = exp(1)) {
    add_local(data, group, unit, base)

    local <- data[, list(ls = first(ls_unit), p = first(p_unit)), by = unit]
    # melt into long form
    melt(local,
        id.vars = unit, measure.vars = c("ls", "p"),
        variable.name = "stat", value.name = "est",
        variable.factor = FALSE
    )
}

#' Calculates local segregation scores based on M
#'
#' Returns local segregation indices for each category defined
#' by \code{unit}.
#'
#' @param data A data frame.
#' @param group A categorical variable or a vector of variables
#'   contained in \code{data}. Defines the dimension
#'   over which segregation is computed.
#' @param unit A categorical variable or a vector of variables
#'   contained in \code{data}. Defines the group for which local
#'   segregation indices are calculated.
#' @param weight Numeric. (Default \code{NULL})
#' @param se If \code{TRUE}, the segregation estimates are bootstrapped to provide
#'   standard errors and to apply bias correction. The bias that is reported
#'   has already been applied to the estimates (i.e. the reported estimates are "debiased")
#'   (Default \code{FALSE})
#' @param CI If \code{se = TRUE}, compute the confidence (CI*100)% confidence interval
#'   in addition to the bootstrap standard error.
#'   This is based on percentiles of the bootstrap distribution, and a valid interpretation
#'   relies on a larger number of bootstrap iterations. (Default \code{0.95})
#' @param n_bootstrap Number of bootstrap iterations. (Default \code{100})
#' @param base Base of the logarithm that is used in the calculation.
#'   Defaults to the natural logarithm.
#' @param wide Returns a wide dataframe instead of a long dataframe.
#'   (Default \code{FALSE})
#' @return Returns a data.table with two rows for each category defined by \code{unit},
#'   for a total of \code{2*(number of units)} rows.
#'   The column \code{est} contains two statistics that
#'   are provided for each unit: \code{ls}, the local segregation score, and
#'   \code{p}, the proportion of the unit from the total number of cases.
#'   If \code{se} is set to \code{TRUE}, an additional column \code{se} contains
#'   the associated bootstrapped standard errors, an additional column \code{CI} contains
#'   the estimate confidence interval as a list column, an additional column \code{bias} contains
#'   the estimated bias, and the column \code{est} contains the bias-corrected estimates.
#'   If \code{wide} is set to \code{TRUE}, returns instead a wide dataframe, with one
#'   row for each \code{unit}, and the associated statistics in separate columns.
#' @references
#' Henri Theil. 1971. Principles of Econometrics. New York: Wiley.
#'
#' Ricardo Mora and Javier Ruiz-Castillo. 2011.
#'   "Entropy-based Segregation Indices". Sociological Methodology 41(1): 159–194.
#' @examples
#' # which schools are most segregated?
#' (localseg <- mutual_local(schools00, "race", "school",
#'     weight = "n", wide = TRUE
#' ))
#'
#' sum(localseg$p) # => 1
#'
#' # the sum of the weighted local segregation scores equals
#' # total segregation
#' sum(localseg$ls * localseg$p) # => .425
#' mutual_total(schools00, "school", "race", weight = "n") # M => .425
#' @import data.table
#' @export
mutual_local <- function(data, group, unit, weight = NULL,
                         se = FALSE, CI = 0.95, n_bootstrap = 100, base = exp(1),
                         wide = FALSE) {
    stopifnot(CI > 0 & CI < 1)
    d <- prepare_data(data, group, unit, weight)

    ret <- mutual_local_compute(d, group, unit, base)

    if (se == TRUE) {
        vars <- attr(d, "vars")
        n_total <- sum(d[, "freq"])

        if (all.equal(n_total, round(n_total)) == TRUE) {
            message(paste0(n_bootstrap, " bootstrap iterations on ", n_total, " observations"))
        } else {
            stop(paste0(
                "bootstrap with a total sample size that is not an integer is not allowed, ",
                "maybe scale your weights?"
            ))
        }

        # draw from a multinomial with weights specified by the cell counts
        draws <- stats::rmultinom(n_bootstrap, n_total, d[["freq"]] / n_total)

        boot_ret <- lapply(seq_len(n_bootstrap), function(i) {
            if (i %% 5 == 0) update_log(bs_n = i, bs_max = n_bootstrap)
            d[, freq := as.double(draws[, i])]
            mutual_local_compute(d[freq > 0], group, unit, base)
        })
        close_log()
        boot_ret <- rbindlist(boot_ret)
        ret <- bootstrap_summary(ret, boot_ret, c(unit, "stat"), CI)
        setattr(ret, "bootstrap", boot_ret)
    }

    if (wide == TRUE) {
        f <- stats::as.formula(paste(
            paste(unit, collapse = "+"),
            "~ factor(stat, levels=c('ls', 'p'))"
        ))
        if (se == TRUE) {
            ret <- dcast(ret, f, value.var = c("est", "se", "CI", "bias"))
            names(ret) <- c(
                unit, "ls", "p",
                "ls_se", "p_se",
                "ls_CI", "p_CI",
                "ls_bias", "p_bias"
            )
            setcolorder(ret, c(
                unit,
                "ls", "ls_se", "ls_CI", "ls_bias",
                "p", "p_se", "p_CI", "p_bias"
            ))
            setattr(ret, "bootstrap", boot_ret)
        } else {
            ret <- dcast(ret, f, value.var = c("est"))
        }
    }

    ret
}


#' Calculates a nested decomposition of segregation for M and H
#'
#' Returns the between-within decomposition defined by
#' the sequence of variables in \code{unit}.
#'
#' @param data A data frame.
#' @param group A categorical variable or a vector of variables
#'   contained in \code{data}. Defines the first dimension
#'   over which segregation is computed.
#' @param unit A vector of variables
#'   contained in \code{data}. Defines the levels at which
#'   the decomposition should be computed.
#' @param weight Numeric. (Default \code{NULL})
#' @param base Base of the logarithm that is used in the calculation.
#'   Defaults to the natural logarithm.
#' @return Returns a data.table similar to \code{\link{mutual_total}},
#'   but with column \code{between} and \code{within} that define
#'   the levels of nesting.
#' @examples
#' mutual_total_nested(schools00, "race", c("state", "district", "school"),
#'     weight = "n"
#' )
#' # This is a simpler way to run the following manually:
#' # mutual_total(schools00, "race", "state", weight = "n")
#' # mutual_total(schools00, "race", "district", within = "state", weight = "n")
#' # mutual_total(schools00, "race", "school", within = c("state", "district"), weight = "n")
#' @import data.table
#' @export
mutual_total_nested <- function(data, group, unit, weight = NULL, base = exp(1)) {
    stopifnot(length(unit) >= 2)
    d <- prepare_data(data, group, unit, weight)

    decomp <- list()
    for (u in seq_along(unit)) {
        if (u == 1) {
            collapsed <- d[, .(freq = sum(freq)), by = c(group, unit[1])]
            seg <- mutual_total_compute(collapsed, group, unit[1], base)
            seg[, between := unit[1]]
            seg[, within := ""]
        } else {
            within <- unit[1:(u - 1)]
            collapsed <- d[, .(freq = sum(freq)), by = c(group, within, unit[u])]
            seg <- mutual_total_within_compute(collapsed, group, unit[u], within, base)
            seg[, between := unit[u]]
            seg[, within := paste0(within, collapse = ", ")]
        }
        decomp[[u]] <- seg
    }
    res <- data.table::rbindlist(decomp)
    res[, .(between, within, stat, est)]
}