R/datamodel-constructors.R

Defines functions validate_dm_nbinom new_dm_nbinom dm_nbinom validate_dm_exact new_dm_exact dm_exact validate_dm_poibin new_dm_poibin dm_poibin

Documented in dm_exact dm_nbinom dm_poibin

## Poisson-binomial mixture ---------------------------------------------------

## HAS_TESTS
#' Create a data model based on a Poisson-binomial mixture
#'
#' Create a data model where the distribution of the
#' reported value is a Poisson-binomial mixture.
#' If \code{Y ~ binom(n, p)} and \code{Z ~ pois(n(1-p))} then
#' \code{X = Y + Z} is a Poisson-binomial mixture.
#'
#' @param data A data frame, described in \code{\link{data-arg}}.
#' @param prob A number between 0 and 1.
#' @param nm_series The name of the demographic series
#'   that \code{data} describes.
#' @param nm_data The name of the dataset. If no value
#'   supplied, then \code{nm_data} is assumed to
#'   equal the name of the object supplied as the \code{data}
#'   argument.
#'
#' @return An object of class \code{"dm_poibin"}.
#'
#' @examples
#' reg_immig <- account::gl_reg_immig
#' reg_immig_dm <- dm_poibin(data = reg_immig,
#'                           prob = 0.95,
#'                           nm_series = "immigration")
#' reg_immig_dm
#' @export
dm_poibin <- function(data, prob, nm_series, nm_data = NULL) {
    nm_data_arg <- deparse(substitute(data))
    check_nm_series(nm_series = nm_series,
                    names_data = names(data))
    is_popn <- identical(nm_series, "population")
    check_data_dm(data = data,
                  is_popn = is_popn)
    check_prob(prob)
    data <- tidy_data_dm(data)
    if (is.null(nm_data))
        nm_data <- nm_data_arg
    else
        check_nm_data(nm_data)
    new_dm_poibin(data = data,
                  prob = prob,
                  nm_series = nm_series,
                  nm_data = nm_data)
}

## HAS_TESTS
new_dm_poibin <- function(data, prob, nm_series, nm_data) {
    ans <- list(data = data,
                prob = prob,
                nm_series = nm_series,
                nm_data = nm_data)
    class(ans) <- c("dm_poibin", "datamodel")
    ans
}

## HAS_TESTS
validate_dm_poibin <- function(x) {
    data <- x$data
    prob <- x$prob
    nm_series <- x$nm_series
    nm_data <- x$nm_data
    check_nm_series(nm_series = nm_series,
                    names_data = names(data))
    is_popn <- identical(nm_series, "population")
    check_data_dm(data = data,
                  is_popn = is_popn)
    check_prob(prob)
    check_nm_data(nm_data)
    check_is_not_births_deaths(x)
    x
}


## Exact ----------------------------------------------------------------------

## HAS_TESTS
#' Create a data model for a series treated as error-free
#'
#' Create a data model in which the observed data are
#' treated as error-free and are copied straight into
#' the account and not subsequently updated.
#'
#' @section Warning:
#'
#' \code{dm_exact} is the only valid data model for births
#' and deaths. \code{dm_exact} cannot be used for
#' any series other than births and deaths.
#'
#' @inheritParams dm_poibin
#'
#' @return An object of class \code{"dm_exact"}.
#'
#' @examples
#' reg_deaths <- dm_exact(data = account::gl_reg_deaths,
#'                        nm_series = "deaths")
#' reg_deaths
#' @export
dm_exact <- function(data,
                     nm_series = c("births", "deaths"),
                     nm_data = NULL)  {
    nm_data_arg <- deparse(substitute(data))
    check_data_dm(data = data,
                  is_popn = FALSE)
    nm_series <- match.arg(nm_series)
    is_births <- nm_series == "births"
    check_data_exact(data = data,
                     is_births = is_births)
    data <- tidy_data_dm(data)
    if (is.null(nm_data))
        nm_data <- nm_data_arg
    else
        check_nm_data(nm_data)
    new_dm_exact(data = data,
                 nm_series = nm_series,
                 nm_data = nm_data)
}

## HAS_TESTS
new_dm_exact <- function(data, nm_series, nm_data) {
    ans <- list(data = data,
                nm_series = nm_series,
                nm_data = nm_data)
    class(ans) <- c("dm_exact", "datamodel")
    ans
}

## HAS_TESTS
validate_dm_exact <- function(x) {
    data <- x$data
    nm_series <- x$nm_series
    nm_data <- x$nm_data
    check_data_dm(data = data,
                  is_popn = FALSE)
    check_nm_series(nm_series = nm_series,
                    names_data = names(data))
    check_is_births_deaths(x)
    is_births <- nm_series == "births"
    check_data_exact(data = data,
                     is_births = is_births)
    check_nm_data(nm_data)
    x
}


## Negative binomial ----------------------------------------------------------

## NO_TESTS
#' Create a data model based on a negative binomial distribution
#'
#' TODO - EDIT THIS
#'
#' Create a data model where the
#' reported value has a negative binomial distribution.
#' The negative binomial distribution has a mean
#' mean-dispersion parameterisation.
#'
#' \code{ratio} and \code{disp} can both be data frames
#' or single numbers
#'
#' \code{ratio} can be zero, but \code{disp} cannot. Neither
#' can be negative.
#'
#' The \code{"ratio"} column in data frame \code{ratio}
#' gives expected coverage ratios,
#' that is, the number of people or events that the dataset
#' is expected to report for each actual person or event.
#' If \code{ratio$ratio[i]} is the coverage ratio,
#' and \code{true$count[i]} is the true number of people or events,
#' then the expected value for \code{data$count[i]} is
#' \code{ratio$ratio[i] * true$count[i]}.
#'
#' All elements ratio$ratio must be non-negative,
#' and can only be \code{NA} if the corresponding
#' value of \code{data$data} is.
#'
#' The \code{disp} argument measures the amount of dispersion
#' beyond what would be expected for a Poisson distribution.
#' It equals the reciprocal of the \code{size} argument
#' in \code{\link[stats]{NegBinomial}} Setting \code{disp}
#' to \code{0} is equivalent to having Poisson variance,
#' and setting \code{disp} to a higher number induces
#' greater variable. In general, the less reliable the data source,
#' the higher \code{disp} should be.
#'
#' \code{disp} can be a single number, in which case all
#' values of \code{data} have the same dispersion, or it
#' can be a data frame with a column called \code{"disp"}.
#'
#' If \code{disp} is a single number, it must be non-negative,
#' and cannot be \code{NA}. If \code{disp} is a data frame,
#' all elements disp$disp must be non-negative,
#' and can only be \code{NA} if the corresponding
#' value of \code{data$data} is.d
#'
#'
#' If \code{ratio} or \code{disp} are data frames,
#' then they do not need to have all the variables that
#' are in \code{data}. Values for \code{ratio}
#' or \code{disp} are assumed to be constant across
#' the missing variables. For instance, if \code{disp}
#' does not have a \code{time} variable, then
#' values for \code{dism} are assumed to be constant
#' across time.
#'
#' If \code{ratio} and \code{disp} are data frames,
#' then every row in \code{data} must map on to
#' them. However, not every row in
#' \code{ratio} and \code{disp}
#' needs to map on to a row in \code{data}: any
#' rows that do not map on to \code{data}
#' are silently dropped.
#'
#' @inheritParams dm_poibin
#' 
#' @param ratio A data frame, identical to \code{data}, except that
#'   the \code{"count"} variable is replaced by a \code{"ratio"} variable
#'   giving expected coverage ratios.
#' @param disp A single number or a data frame. If a data frame,
#'   it is identical to \code{data}, except that
#'   the \code{"count"} variable is replaced by a \code{"disp"} variable
#'   giving values for dispersion.
#'
#' @return An object of class \code{"dm_nbinom"}.
#'
#' @examples
#' ## Use a constant ratio across all categories
#' ## but use higher dispersion for males than for females,
#' ## and higher dispersion for ages 20-29 than for
#' ## other age groups.
#' reg_popn <- account::gl_reg_popn
#' ratio <- 1
#' disp <- within(reg_popn, {
#'   rm(count)
#'   disp <- ifelse(gender == "Female", 1.1, 1.2)
#'   disp <- ifelse(age %in% 20:29, disp * 1.3, disp)
#' })
#' reg_popn_dm <- dm_nbinom(data = reg_popn,
#'                          ratio = ratio,
#'                          disp = disp,
#'                          nm_series = "population")
#' reg_popn_dm
#' @export
dm_nbinom <- function(data, ratio, disp, nm_series, nm_data = NULL) {
    nm_data_arg <- deparse(substitute(data))
    check_nm_series(nm_series = nm_series,
                    names_data = names(data))
    is_popn <- identical(nm_series, "population")
    check_data_dm(data = data,
                  is_popn = is_popn)
    data <- tidy_data_dm(data)
    check_ratio_dm(ratio = ratio,
                   data = data)
    check_disp_dm(disp = disp,
                  data = data)
    if (is.numeric(disp)) {
        classif_vars <- data[-match("count", names(data))]
        disp <- rep(disp, times = nrow(classif_vars))
        disp <- data.frame(classif_vars, disp)
    }
    if (is.null(nm_data))
        nm_data <- nm_data_arg
    else
        check_nm_data(nm_data)
    new_dm_nbinom(data = data,
                  ratio = ratio,
                  disp = disp,
                  nm_series = nm_series,
                  nm_data = nm_data)
}


## NO_TESTS
new_dm_nbinom <- function(data, ratio, disp, nm_series, nm_data) {
    stopifnot(is.data.frame(disp))
    ans <- list(data = data,
                ratio = ratio,
                disp = disp,
                nm_series = nm_series,
                nm_data = nm_data)
    class(ans) <- c("dm_nbinom", "datamodel")
    ans
}

## NO_TESTS
validate_dm_nbinom <- function(x) {
    data <- x$data
    ratio <- x$ratio
    disp <- x$disp
    nm_series <- x$nm_series
    nm_data <- x$nm_data
    check_nm_series(nm_series = nm_series,
                    names_data = names(data))
    is_popn <- identical(nm_series, "population")
    check_data_dm(data = data,
                  is_popn = is_popn)
    check_ratio_dm(ratio = ratio,
                   data = data)
    stopifnot(is.data.frame(disp))
    check_disp_dm(disp = disp,
                  data = data)
    check_nm_data(nm_data)
    check_is_not_births_deaths(x)
    x
}
ONSdigital/Bayesian-demographic-accounts documentation built on Jan. 10, 2022, 12:34 a.m.