old_code/prepare.R

## NOTE THAT RATES MUST BE DEFINED PER YEAR IF USING 1-YEAR UNITS,
## PER 5-YEARS IF USING 5-YEAR UNITS, PER MONTH IF USING 1-MONTH UNITS, ETC

## collect arguments into one list, and check
## that 
combine_args <- function(births,
                         deaths,
                         internal_in = NULL,
                         internal_out = NULL,
                         immigration,
                         emigration,
                         immigration2 = NULL,
                         emigration2 = NULL) {
    has_internal_in <- !is.null(internal_in)
    has_internal_out <- !is.null(internal_out)
    if (has_internal_in && !has_internal_out)
        stop(gettextf("'%s' is non-NULL, but '%s' is NULL"),
             "internal_in", "internal_out")
    if (!has_internal_in && has_internal_out)
        stop(gettextf("'%s' is NULL, but '%s' is non-NULL"),
             "internal_in", "internal_out")
    has_im2 <- !is.null(immigration2)
    has_em2 <- !is.null(emigration2)
    if (has_im2 && !has_em2)
        stop(gettextf("'%s' is non-NULL, but '%s' is NULL"),
             "immigration2", "emigration2")
    if (!has_im2 && has_em2)
        stop(gettextf("'%s' is NULL, but '%s' is non-NULL"),
             "immigration2", "emigration2")
    ans <- list(births = births,
                deaths = deaths,
                immigration = immigration,
                emigration = emigration)
    if (has_internal_in)
        ans <- c(ans,
                 list(internal_in = internal_in,
                      internal_out = internal_out))
    if (has_im2)
        ans <- c(ans,
                 list(immigration2 = immigration2,
                      emigration2 = emigration2))
    names <- names(ans)
    has_reg <- vapply(ans, function(x) "region" %in% names, logical())
    if (any(has_reg[-1L] != has_reg[[1L]]))
        stop(gettextf("some datasets have '%s' variable and others do not",
                      "region"))
    ans
}

combine_datasets <- function(datasets, measure_var) {
    has_internal <- "internal_in" %in% names(datasets)
    has_imem2 <- "immigration2" %in% names(datasets)
    ans <- cbind(datasets[["births"]]
                 deaths = datasets[["deaths"]][[measure_var]])
    if (has_internal)
        ans <- cbind(ans,
                     internal_in = datasets[["internal_in"]][[measure_var]],
                     internal_out = datasets[["internal_out"]][[measure_var]])
    ans <- cbind(ans,
                 immigration = datasets[["immigration"]][[measure_var]],
                 emigration = datasets[["emigration"]][[measure_var]])
    if (has_imem2)
        ans <- cbind(ans,
                     immigration2 = datasets[["immigration2"]][[measure_var]],
                     emigration2 = datasets[["emigration2"]][[measure_var]])
    ans
}

## check and tidy an individual dataset

prepare_dataset <- function(dataset,
                            name_dataset,
                            measure_vars,
                            measure_is_int) {
    ## check that 'dataset' is a data frame
    if (!is.data.frame(dataset))
        stop(gettextf("'%s' is not a data frame",
                      name_dataset))
    ## check that 'dataset' has expected columns
    classif_vars <- make_classif_vars(dataset)
    colnames_expected <- c(classif_vars, measure_vars)
    colnames_obtained <- names(dataset)
    for (colname in colnames_expected) {
        if (!(colname %in% colnames_obtained))
            stop(gettextf("'%s' does not have a column called '%s'",
                          name_dataset, colname))
    }
    for (colname in colnames_obtained) {
        if (!(colname %in% colnames_expected))
            stop(gettextf("'%s' has a column called '%s'",
                          "dataset", colname))
    }
    ## check that has neither or both of 'internal_in', 'internal_out'
    has_internal_in <- internal_in %in% colnames_obtained
    has_internal_out <- internal_out %in% colnames_obtained
    if (has_internal_in && !has_internal_out)
        stop(gettextf("'%s' has '%s' column but not '%s' column"),
             "internal_in", "internal_out")
    if (!has_internal_in && has_internal_out)
        stop(gettextf("'%s' has '%s' column but not '%s' column"),
             "internal_out", "internal_in")
    
    for (colname in colnames_obtained) {
        val <- datasets[[colname]]
        if (any(is.na(val)))
            stop(gettextf("column '%s' has NAs",
                          colname))
    }
    ## check that has "region" column iff has 'internal_in'
    has_region <- "region" %in% colnames_obtained
    if (has_region && !has_internal_in)
        stop(gettextf("'%s' has '%s' column' 
    ## check that has neither or both of 'immigration2', 'emigration2'
    has_im2 <- "immigration2" %in% colnames_obtained
    has_em2 <- "emigration2" %in% colnames_obtained
    if (has_im2 && !has_em2)
        stop(gettextf("'%s' has '%s' column but not '%s' column"),
             "immigration2", "emigration2")
    if (!has_im2 && has_em2)
        stop(gettextf("'%s' has '%s' column but not '%s' column"),
             "emigration2", "immigration2")
    ## check integer classification variables
    classif_vars_int <- c("age", "cohort", "time")
    for (colname in classif_vars_int) {
        val <- dataset[[colname]]
        if (!is.numeric(val))
            stop(gettextf("column '%s' is non-numeric",
                          colname))
        if (any(val != round(val)))
            stop(gettextf("column '%s' has non-integer values",
                          colname))
        if (any(val < 0L))
            stop(gettextf("column '%s' has negative values",
                          colname))
        dataset[[colname]] <- as.integer(dataset[[colname]])
    }
    ## check measure variables
    for (colname in measure_vars) {
        val <- dataset[[colname]]
        if (!is.numeric(val))
            stop(gettextf("column '%s' is non-numeric",
                          colname))
        if (any(val < 0L))
            stop(gettextf("column '%s' has negative values",
                          colname))
        if (measure_is_int) {
            if (any(val != round(val)))
                stop(gettextf("column '%s' has non-integer values",
                              colname))
            dataset[[colname]] <- as.integer(dataset[[colname]])
        }
        else
            dataset[[colname]] <- as.numeric(dataset[[colname]])
    }
    ## check for duplicated combinations of classification variables
    if (any(duplicated(dataset[classif_vars])))
        stop(gettextf("'%s' has duplicates rows for classification variables",
                      "dataset"))
    ## check that every possible combination of classification variables is present
    nrow_obtained <- nrow(dataset)
    nrow_expected <- nrow_complete_classif(dataset)
    if (nrow_obtained < nrow_expected)
        stop(gettextf("classification variables for '%s' missing some combinations of values",
                      name_dataset))
    ## return result
    dataset
}
ONSdigital/Bayesian-demographic-accounts documentation built on Jan. 10, 2022, 12:34 a.m.