## old_code/prepare_data.R

#' Prepare input data on events and rates
#'
#' Prepare data on births, deaths, and migration,
#' to be supplied to functions \code{\link{estimate_account}},
#' \code{\link{increment_account}}, or \code{\link{forecast_account}}.
#'
#' All arguments to \code{prepare_data_events}
#' and \code{prepare_data_rates}
#' are data frames. Arguments \code{births}, \code{deaths},
#' \code{immigration} and \code{emigration} are required.
#' The remaining arguments are optional.
#'
#' If one of \code{immigration2} and \code{emigration2} is supplied,
#' then the other must be too. \code{immigration2} and
#' \code{emigration2} are used if there are two
#' types of international migration. For instance,
#' \code{immigration} and \code{emigration} might be used to
#' describe migration between England and Wales and
#' Scotland and Northern Ireland, while
#' \code{immigration2} and \code{emigration2} are used to
#' describe migration between England and Wales and
#' the rest of the world. 
#'
#' \code{internal_in} and \code{internal_out}
#' are also optional, though, again, if one is included
#' the other must be. \code{internal_in} and \code{internal_out}
#' are used to describe migration between internal regions,
#' such as migration between Local Authorities within England and Wales.
#' \code{internal_in} and \code{internal_out} represent internal migration
#' using a 'migrant pool' format: \code{internal_in} describes
#' internal migration into each region, regardless of source region,
#' and \code{internal_out} describes internal migration out of
#' each region, regardless of destination region.
#' 
#' All data frames must include columns with names 
#' \code{source}, \code{cohort},
#' \code{age}, \code{sex}, and
#' \code{time}, in any order. If (and only if) \code{internal_in}
#' and \code{internal_out} arguments are supplied,
#' all data frames must also include columns named
#' \code{region}.
#'
#' \code{source} is the origin of the input data,
#' e.g. \code{"Birth registrations"}. \code{time} is the period
#' during which the events occurred. Different data sources can
#' cover different combinations of periods. For instance,
#' one data source for immigration might only have data for the
#' first 5 years of the estimation period, while another
#' source has data for the entire estimation period.
#'
#' \code{cohort}, \code{age}, and \code{time}
#' must be vectors of whole numbers
#' while \code{source},
#' \code{sex} and (if present) \code{region} must be
#' character vectors.
#'
#' In the \code{births} dataset, the \code{age}
#' and \code{cohort} variables
#' refer to the mother, while the \code{sex}
#' variable refers to the sex of the child.
#'
#' Within each data frame, there must not be any
#' duplicated combinations of source, cohort, age,
#' time, sex, and (if present) region.
#'
#' When using function \code{prepare_data_events}, each data frame
#' should have a single measurement variable
#' called \code{count}; when using \code{prepare_data_rates},
#' each data frame should have a measurement variable
#' called \code{rate}. The elements of \code{count} columns
#' must be whole numbers.
#'
#' \code{NA}s are allowed in the \code{count} or \code{rate}
#' columns, but nowhere else.
#' 
#' Future versions of the software will include the option
#' of calculating rates within the estimation process.
#'
#' @param births A data frame of birth counts or rates. Required.
#' @param deaths A data frame of death counts or rates. Required.
#' @param internal_in A data frame of counts or rates for combined inward
#' migration from all other regions. Optional.
#' @param internal_out A data frame of counts or rates for combined outward
#' migration to all other regions. Optional.
#' @param immigration A data frame of counts or rates for
#' immigration. Required.
#' @param emigration A data frame of counts or rates for
#' emigration. Required.
#' @param immigration2 A second data frame of counts or rates for
#' immigration. Optional.
#' @param emigration2 A second data frame of counts or rates for
#' emigration. Optional.
#'
#' @return A data frame combining counts or rates from all the inputs.
#'
#' @seealso \code{\link{prepare_data_population}}
#'
#' @export
#' @name prepare_data_events
NULL

#' @rdname prepare_data_events
#' @export
prepare_data_events <- function(births,
                                deaths,
                                internal_in = NULL,
                                internal_out = NULL,
                                immigration,
                                emigration,
                                immigration2 = NULL,
                                emigration2 = NULL) {
    ## Combine data frames of event counts into a single dataset.
    ## 'births', 'deaths', 'immigration' and 'emigration' have no
    ## defaults; the remaining arguments default to NULL.
    ## Returns the value of 'combine_datasets' applied to the
    ## prepared inputs.

    ## gather the inputs into a named list
    ## (presumably validated by 'collect_and_check_args' -- it is
    ## defined elsewhere, so confirm what it checks and returns)
    events <- collect_and_check_args(births = births,
                                     deaths = deaths,
                                     internal_in = internal_in,
                                     internal_out = internal_out,
                                     immigration = immigration,
                                     emigration = emigration,
                                     immigration2 = immigration2,
                                     emigration2 = emigration2)
    names_events <- names(events)
    ## check and reformat datasets independently;
    ## the index must run over the list itself, since 'i' does
    ## not exist until the loop has started
    ## NOTE(review): 'prepare_data_rates' and
    ## 'prepare_data_population' pass 'measure_is_int' explicitly;
    ## here the default of 'prepare_dataset' is relied on -- confirm
    for (i in seq_along(events)) {
        events[[i]] <- prepare_dataset(events[[i]],
                                       name = names_events[[i]],
                                       measure_vars = "count")
    }
    ## combine and return
    combine_datasets(datasets = events,
                     measure_var = "count")
}


#' @rdname prepare_data_events
#' @export
prepare_data_rates <- function(births,
                               deaths,
                               internal_in = NULL,
                               internal_out = NULL,
                               immigration,
                               emigration,
                               immigration2 = NULL,
                               emigration2 = NULL) {
    ## Combine data frames of rates into a single dataset.
    ## 'births', 'deaths', 'immigration' and 'emigration' have no
    ## defaults; the remaining arguments default to NULL.
    ## Returns the value of 'combine_datasets' applied to the
    ## prepared inputs.

    datasets <- list(births = births,
                     deaths = deaths,
                     internal_in = internal_in,
                     internal_out = internal_out,
                     immigration = immigration,
                     emigration = emigration,
                     immigration2 = immigration2,
                     emigration2 = emigration2)
    ## drop optional datasets that were not supplied: a NULL is
    ## not a data frame, and assigning a NULL result back with
    ## '[[<-' would delete the element and leave the loop index
    ## out of step with 'names_datasets'
    ## NOTE(review): 'prepare_data_events' routes its arguments
    ## through 'collect_and_check_args'; consider doing the same
    ## here so that both functions apply the same checks
    datasets <- Filter(Negate(is.null), datasets)
    ## check and reformat rates independently
    names_datasets <- names(datasets)
    for (i in seq_along(datasets)) {
        datasets[[i]] <- prepare_dataset(x = datasets[[i]],
                                         name = names_datasets[[i]],
                                         measure_vars = "rate",
                                         measure_is_int = FALSE)
    }
    ## combine and return
    combine_datasets(datasets = datasets,
                     measure_var = "rate")
}





#' Prepare data on population
#'
#' Prepare data on population to be supplied to
#' functions \code{\link{estimate_account}} or
#' \code{\link{increment_account}}.
#'
#' \code{population} is a data frame with columns named 
#' \code{source}, \code{age}, \code{sex}, \code{time}, \code{count}, and,
#' optionally, \code{region}.
#'
#' All functions in package \code{account}
#' use the convention that, if population is measured
#' at points \code{t, t+1, t+2, \dots}, then
#' the period between \code{t-1} and \code{t}
#' is referred to as period \code{"t"}.
#' For example, if population is measured on
#' 30 June 2020, 30 June 2021, and 30 June 2022,
#' then the time points are labelled \code{2020},
#' \code{2021}, and \code{2022},
#' and the periods are labelled \code{2021} and \code{2022}.
#'
#' Note that \code{population}, which is measured at
#' points in time, should have one more time value
#' than the arguments to \code{prepare_data_events},
#' and \code{prepare_data_rates}, which are measured
#' over periods. For instance, if \code{population}
#' has time values \code{2011, 2012, \dots, 2021},
#' then the arguments to \code{prepare_data_events}
#' and \code{prepare_data_rates} should have time
#' values \code{2012, 2013, \dots, 2021}.
#' 
#' Classifying variables \code{age}
#' and \code{time} in \code{population}
#' must be vectors of whole numbers,
#' while \code{sex} and (if present) \code{region} must be
#' character vectors.
#'
#' \code{count} must consist of non-negative integers.
#'
#' \code{NA}s are allowed in the \code{count}
#' column, but nowhere else.
#'
#' @param population A data frame.
#'
#' @return A data frame.
#'
#' @seealso \code{\link{prepare_data_events}}
#'
#' @export
prepare_data_population <- function(population) {
    ## hand 'population' to 'prepare_dataset', labelled as
    ## "population", with "count" as the measure variable and
    ## 'measure_is_int' set to TRUE
    ans <- prepare_dataset(dataset = population,
                           name_dataset = "population",
                           measure_vars = "count",
                           measure_is_int = TRUE)
    ans
}
## ONSdigital/Bayesian-demographic-accounts documentation built on Jan. 10, 2022, 12:34 a.m.