R/check-functions.R

Defines functions check_threshold check_region_iff_internal check_ratio_dm check_rates check_prob check_nm_series check_nm_data check_names_valid check_names_series_valid check_names_df_valid check_n_thread check_n_thin check_n_particle check_measure_var_valid_num check_measure_var_valid_list check_measure_var_valid check_is_not_births_deaths check_is_births_deaths check_identities check_dominant check_disp_dm check_data_models check_data_exact check_data_dm check_consistent_rates_data_models check_consistent_account_rates check_classif_vars_valid check_classif_vars_consistent_names check_classif_vars_consistent_levels check_classif_vars_complete check_arg_dm_num check_arg_dm_df check_arg_dm check_account

## HAS_TESTS
## Check that an account is valid.
## 'account' - a named list of data frames
## return value - TRUE or an error
check_account <- function(account) {
    ## 'account' is list
    if (!is.list(account))
        stop("'account' is not a list")
    ## elements of 'account' all data frames
    if (!all(vapply(account, is.data.frame, TRUE)))
        stop("'account' has elements that are not data frames")
    ## names for 'account' are valid demographic series
    nms_account <- names(account)
    check_names_series_valid(names = nms_account,
                             nm_obj = "account",
                             incl_popn = TRUE,
                             incl_stock = FALSE,
                             subset_ok = FALSE)
    ## has 'region' variable iff has 'internal_in'
    check_region_iff_internal(nms_vars = names(account[[1L]]),
                              nms_series = nms_account)
    ## internal consistency of each data frame
    for (i in seq_along(account)) {
        df <- account[[i]]
        nm_obj <- nms_account[[i]]
        nms_vars <- names(df)
        nm_measure_var <- "count"
        measure_var <- df[[nm_measure_var]]
        nms_classif_vars <- setdiff(nms_vars, nm_measure_var)
        classif_vars <- df[nms_classif_vars]
        incl_cohort <- nm_obj != "population"
        incl_sexgender <- TRUE
        incl_region <- "internal_in" %in% nms_account
        no_min_age <- nm_obj == "births"
        na_ok <- FALSE
        frac_ok <- FALSE
        tryCatch({account
            check_names_df_valid(nms_vars = nms_vars,
                                 nm_obj = nm_obj,
                                 incl_cohort = incl_cohort,
                                 incl_sexgender = incl_sexgender,
                                 incl_region = incl_region,
                                 nm_measure_var = nm_measure_var)
            check_classif_vars_valid(classif_vars = classif_vars,
                                     nm_obj = nm_obj)
            check_classif_vars_complete(classif_vars = classif_vars,
                                        nm_obj = nm_obj,
                                        no_min_age = no_min_age)
            check_measure_var_valid(var = measure_var,
                                    nm_var = nm_measure_var,
                                    nm_obj = nm_obj,
                                    na_ok = na_ok,
                                    frac_ok = frac_ok)
        },
        error = function(e) {
            msg <- paste("problem with 'account' :", e$message)
            stop(msg, call. = FALSE)
        })
    }
    ## consistency across data frames
    nms_classif_all <- lapply(account, function(x) setdiff(names(x), nm_measure_var))
    classif_vars_all <- lapply(seq_along(account), function(i) account[[i]][nms_classif_all[[i]]])
    i_deaths <- match("deaths", nms_account)
    nms_classif_template <- nms_classif_all[[i_deaths]]
    classif_vars_template <- account[[i_deaths]][nms_classif_template]
    no_cohort <- "population"
    no_sexgender <- character()
    absent_ok <- FALSE
    ignore_age_cohort <- "births"
    omit_first_time <- "population"
    check_classif_vars_consistent_names(nms_classif_all = nms_classif_all,
                                        nms_obj_all = nms_account,
                                        nms_classif_template = nms_classif_template,
                                        no_cohort = no_cohort,
                                        no_sexgender = no_sexgender)
    check_classif_vars_consistent_levels(classif_vars_all = classif_vars_all,
                                         nms_obj_all = nms_account,
                                         classif_vars_template = classif_vars_template,
                                         absent_ok = absent_ok,
                                         ignore_age_cohort = ignore_age_cohort,
                                         omit_first_time = omit_first_time)
    check_identities(account)
    ## return TRUE if all tests pass
    TRUE
}


## HAS_TESTS
## Check a data frame or numeric argument (other than 'data')
## used for constructing a data model in a 'dm' function
## is valid
## 'arg' - the argument being checked, a data frame or numeric scalar
## 'nm_arg' - the name of the argument being checked
## 'neg_ok' - whether 'arg' can have negative values
## 'zero_ok' - whether 'arg' can have zeros
## return value - TRUE or an error
check_arg_dm <- function(arg, nm_arg, data, neg_ok, zero_ok) {
    if (is.data.frame(arg))
        check_arg_dm_df(arg = arg,
                        nm_arg = nm_arg,
                        data = data,
                        neg_ok = neg_ok,
                        zero_ok = zero_ok)
    else if (is.numeric(arg))
        check_arg_dm_num(arg = arg,
                         nm_arg = nm_arg,
                         neg_ok = neg_ok,
                         zero_ok = zero_ok)
    else
        stop("'", nm_arg, "' has class '", class(arg), "'")
}


## HAS_TESTS
## Check a data frame argument to a data model constructor.
## Assume that 'is.data.frame(arg)' is TRUE.
## 'arg' - the argument being checked, a data frame
## 'nm_arg' - the name of the argument being checked
## 'neg_ok' - whether the 'nm_arg' column of 'arg' can have negative values
## 'zero_ok' - whether the 'nm_arg' column of 'arg' can have zeros
## return value - TRUE or an error
check_arg_dm_df <- function(arg, nm_arg, data, neg_ok, zero_ok) {
    nms_arg <- names(arg)
    nms_data <- names(data)
    nms_classif_arg <- setdiff(nms_arg, nm_arg)
    nms_classif_data <- setdiff(nms_data, "count")
    ## 'nm_arg' in 'nms_arg'
    if (!(nm_arg %in% nms_arg))
        stop("'", nm_arg, "' does not have a variable called '", nm_arg, "'")
    ## 'nm_arg' not in 'nms_data'
    if (nm_arg %in% nms_data)
        stop("argument '", nm_arg, "' has the same name as a variable in 'data'")
    ## all elements of 'nms_arg', except for 'nm_arg', in 'nms_data'
    is_in_data <- nms_classif_arg %in% nms_data
    i_not_in_data <- match(FALSE, is_in_data, nomatch = 0L)
    if (i_not_in_data > 0L)
        stop("'", nm_arg, "' has a variable [", nms_classif_arg[[i_not_in_data]],
             "] not found in 'data'")
    ## all rows of 'data' map to a row of 'arg'
    arg_check_map <- arg
    arg_check_map[[nm_arg]] <- TRUE
    merged_check_map <- merge(data, arg_check_map,
                              by = nms_classif_arg,
                              all.x = TRUE)
    is_not_in_arg <- is.na(merged_check_map[[nm_arg]])
    i_not_in_arg <- match(TRUE, is_not_in_arg, nomatch = 0L)
    if (i_not_in_arg > 0L) {
        row_not_in_arg <- merged_check_map[i_not_in_arg, nms_classif_data]
        str_not_in_arg <- paste(row_not_in_arg, collapse = ", ")
        stop("row of 'data' with classifying variables ", str_not_in_arg,
             " does not have corresponding row in '", nm_arg, "'")
    }
    ## measure variable for 'arg' only has NAs in places where 'data' does
    merged_check_na <- merge(data, arg,
                             by = nms_classif_arg,
                             all.x = TRUE)
    is_na_clash <- !is.na(merged_check_na$count) & is.na(merged_check_na[[nm_arg]])
    i_na_clash <- match(TRUE, is_na_clash, nomatch = 0L)
    if (i_na_clash > 0L) {
        row_na_clash <- merged_check_map[i_na_clash, nms_classif_data]
        str_na_clash <- paste(row_na_clash, collapse = ", ")
        stop("row of 'data' with classifying variables ", str_na_clash,
             " has non-NA value for 'count' but corresponding row in '", nm_arg,
             "' has NA value for '", nm_arg, "'")
    }
    ## if 'neg_ok' is FALSE, no elements of measure variable in 'arg' are negative
    if (!neg_ok && any(arg[[nm_arg]] < 0, na.rm = TRUE))
        stop("'", nm_arg, "' variable in '", nm_arg, "' has negative values")
    ## if 'zero_ok' is FALSE, no elements of measure variable in 'arg' are zero
    if (!zero_ok && any(arg[[nm_arg]] == 0, na.rm = TRUE))
        stop("'", nm_arg, "' variable in '", nm_arg, "' has zeros")
    ## if all tests pass, return TRUE
    TRUE
}


## HAS_TESTS
## Check a numeric scalar argument to a data model constructor.
## Assume that 'is.numeric(arg)' is TRUE.
## 'arg' - the argument being checked, a numeric scalar
## 'nm_arg' - the name of the argument being checked
## 'neg_ok' - whether 'arg' can be negative
## 'zero_ok' - whether 'arg' can be zero
## return value - TRUE or an error
check_arg_dm_num <- function(arg, nm_arg, neg_ok, zero_ok) {
    if (!identical(length(arg), 1L))
        stop("'", nm_arg, "' does not have length 1")
    if (is.na(arg))
        stop("'", nm_arg, "' is NA")
    if (!neg_ok && (arg < 0))
        stop("'", nm_arg, "' is negative")
    if (!zero_ok && (arg == 0))
        stop("'", nm_arg, "' equals 0")
    TRUE
}

    
## HAS_TESTS
## Check that 'classif_vars' contains all intermediate categories
## of numeric variables, and all possible combinations
## of all classifying variables. Assumes that 'classif_vars'
## has already been checked for duplicates and invalid
## combiniations of age, time, and cohort. If 'no_min_age'
## is TRUE, then age can start from any non-negative number;
## otherwise age must start from 0.
## 'classif_vars' - a data frame containing the classification
##    variables
## 'nm_obj' - the name of the object that the classification
##   variables were taken from
## 'no_min_age' - whether the minimum value for 'age' can be
##    higher than 0
## return value - TRUE or an error message
check_classif_vars_complete <- function(classif_vars, nm_obj, no_min_age) {
    nms <- names(classif_vars)
    nms_numeric <- intersect(nms, c("cohort", "time", "age"))
    ## except when 'classif_vars' is birth-related, check that the minimum age is 0
    if (!no_min_age) {
        age <- classif_vars$age
        min_age <- min(age)
        if (min_age > 0L)
            stop("minimum value for 'age' [", min_age, "] in '", nm_obj,
                 "' is not zero")
    }
    ## check that numeric classification variables contain all intermediate values
    for (nm in nms_numeric) {
        vals <- classif_vars[[nm]]
        range_vals <- seq.int(from = min(vals), to = max(vals))
        is_included <- range_vals %in% vals
        i_not_included <- match(FALSE, is_included, nomatch = 0L)
        if (i_not_included > 0L)
            stop("'", nm, "' variable in '", nm_obj, "' missing intermediate value [",
                 range_vals[[i_not_included]], "]")
    }
    ## all combinations of classification variables are included
    levels_classif <- lapply(classif_vars, unique)
    has_cohort <- "cohort" %in% nms
    if (has_cohort) {
        classif_vars_implied <- expand.grid(levels_classif)
        is_valid_lexis <- with(classif_vars_implied,
        (time - cohort - age) %in% c(0L, 1L))
        nrow_expected <- sum(is_valid_lexis)
    }
    else {
        n_levels_classif <- vapply(levels_classif, length, 1L)
        nrow_expected <- prod(n_levels_classif)
    }
    nrow_obtained <- nrow(classif_vars)
    if (nrow_obtained < nrow_expected)
        stop("'", nm_obj, "' does not include all possible combinations",
             " of classification variables")
    TRUE
}
    


## HAS_TESTS
## Check that classification variables have expected levels.
## The classifications are not assumed to all have the same
## variables - the completeness of variables needs to be
## checked separately, eg via 'check_classif_vars_complete'.
## 'check_classif_vars_consistent_levels' also does not check
## that all combinations of the levels are present - this
## needs to be checked separately, eg via 'check_classif_vars_complete'.
## 'classif_vars_all' - a list of data frames with classification variables
## 'nms_obj_all' - the names of the data frames
## 'classif_vars_template' - a data frame with the target classification
## 'absent_ok' - whether a data frame is allowed to omit some levels
##    contained in 'classif_vars_template'
## 'ignore_age_cohort' - names of data frames where age and cohort
##    are omitted from the comparison
## return value - TRUE or an error message
check_classif_vars_consistent_levels <- function(classif_vars_all,
                                                 nms_obj_all,
                                                 classif_vars_template,
                                                 absent_ok,
                                                 ignore_age_cohort = character(),
                                                 omit_first_time = character()) {
    levels_expected <- lapply(classif_vars_template, unique)
    nms_levels_expected <- names(classif_vars_template)
    for (i in seq_along(classif_vars_all)) {
        classif_vars <- classif_vars_all[[i]]
        nm_obj <- nms_obj_all[[i]]
        if (nm_obj %in% ignore_age_cohort) {
            non_age_cohort <- setdiff(names(classif_vars), c("age", "cohort"))
            classif_vars <- classif_vars[non_age_cohort]
        }
        levels_obtained <- lapply(classif_vars, unique)
        nms_levels_obtained <- names(classif_vars)
        for (nm in nms_levels_expected) {
            if (nm %in% nms_levels_obtained) {
                levels_expected_var <- levels_expected[[nm]]
                levels_obtained_var <- levels_obtained[[nm]]
                if ((nm == "time") && (nm_obj %in% omit_first_time))
                    levels_obtained_var <- levels_obtained_var[-1L]
                is_valid <- levels_obtained_var %in% levels_expected_var
                i_invalid <- match(FALSE, is_valid, nomatch = 0L)
                if (i_invalid > 0L)
                    stop("variable '", nm, "' in '", nm_obj, "' has unexpected value [",
                         levels_obtained_var[[i_invalid]], "]")
                if (!absent_ok) {
                    is_present <- levels_expected_var %in% levels_obtained_var
                    i_absent <- match(FALSE, is_present, nomatch = 0L)
                    if (i_absent > 0L)
                    stop("variable '", nm, "' in '", nm_obj, "' does not have value '", 
                         levels_expected_var[[i_absent]], "'")
                }
            }
        }
    }
    TRUE
}



## HAS_TESTS
## Check that a set of data frames has 'consistent' classification variables.
## Consistent does not mean identical - some data frames may be required
## to omit a cohort or sex/gender variable found in other data frames.
## 'nms_classif_all' - a list of names of classification variables
## 'nms_obj_all' - the names of the data frames
## 'nms_classif_template' - a set of target names for classification variables
## 'no_cohort' - names of data frames that should not have a cohort variable
## 'no_sexgender' - names of data frames that should not have a sex or
##    gender variable
## return value - TRUE or an error message
check_classif_vars_consistent_names <- function(nms_classif_all,
                                                nms_obj_all,
                                                nms_classif_template,
                                                no_cohort = character(),
                                                no_sexgender = character()) {
    for (i in seq_along(nms_classif_all)) {
        nms_obtained <- nms_classif_all[[i]]
        nm_obj <- nms_obj_all[[i]]
        nms_expected <- nms_classif_template
        if (nm_obj %in% no_cohort)
            nms_expected <- setdiff(nms_expected, "cohort")
        if (nm_obj %in% no_sexgender)
            nms_expected <- setdiff(nms_expected, c("sex", "gender"))
        if (!setequal(nms_obtained, nms_expected))
            stop("'", nm_obj, "' has classification variables ",
                 paste(nms_obtained, collapse = ", "), " : expected variables ",
                 paste(nms_expected, collapse = ", "))
    }
    TRUE
}


## HAS_TESTS
## Check that classification variables are valid
## (though not necessarily complete.) Assumes that
## 'classif_vars' object has correct columns.
## 'classif_vars' - a data frame containing the classification
##    variables
## 'nm_obj' - the name of the object that the classification
##   variables were taken from
## return value - TRUE or an error message
check_classif_vars_valid <- function(classif_vars, nm_obj) {
    nms <- names(classif_vars)
    nms_numeric <- intersect(nms, c("cohort", "time", "age"))
    ## check that classification variables have no NAs
    for (nm in nms) {
        if (anyNA(classif_vars[[nm]]))
            stop("'", nm, "' variable in '", nm_obj, "' has NAs")
    }
    ## check that classification variables have no duplicated rows
    is_duplicated <- duplicated(classif_vars)
    i_duplicated <- match(TRUE, is_duplicated, nomatch = 0L)
    if (i_duplicated > 0L) {
        str_dup <- paste(classif_vars[i_duplicated, ], collapse = ", ")
        stop("'", nm_obj, "' has more than one row with values : ", str_dup)
    }
    ## check numeric columns are all non-negative whole numbers
    for (nm in nms_numeric) {
        vals <- classif_vars[[nm]]
        if (!is.numeric(vals))
            stop("'", nm, "' variable in '", nm_obj, "' is non-numeric")
        if (any(vals != round(vals)))
            stop("'", nm, "' variable in '", nm_obj, "' has non-integer values")
        if (any(vals < 0L))
            stop("'", nm, "' variable in '", nm_obj, "' has negative values")
    }
    ## check combinations of age, time, and cohort are valid
    has_cohort <- "cohort" %in% nms
    if (has_cohort)
        valid_age_time_cohort(age = classif_vars$age,
                              time = classif_vars$time,
                              cohort = classif_vars$cohort)
    ## return TRUE if all tests passed
    TRUE
}


## HAS_TESTS
## Check that the names of demographic series, and the
## classification variables, of 'account' and 'rates'
## are consistent with each other. Note that 'consistent'
## does not mean identical.
## 'account' - a list of data frames
## 'rates' - a list of data frames
## return value - TRUE or an error
check_consistent_account_rates <- function(account, rates) {
    nms_series_account <- names(account)
    nms_series_rates <- names(rates)
    ## same series, apart from population
    nms_nopop <- setdiff(nms_series_account, "population")
    if (!setequal(nms_nopop, nms_series_rates))
        stop("series in 'account' [", paste(nms_series_account, collapse = ", "),
             "] inconsistent with series in 'rates' [",
             paste(nms_series_rates, collapse = ", "), "]")
    ## same classification variables, apart from cohort
    ## and (for births) sex/gender
    for (nm_series in nms_series_rates) {
        nms_vars_account <- names(account[[nm_series]])
        nms_vars_rates <- names(rates[[nm_series]])
        nms_classif_vars_account <- setdiff(nms_vars_account, "count")
        nms_classif_vars_rates <- setdiff(nms_vars_rates, "rate")
        nms_classif_vars_account_trim <- setdiff(nms_classif_vars_account, "cohort")
        if (nm_series == "births")
            nms_classif_vars_account_trim <- setdiff(nms_classif_vars_account_trim,
                                                     c("sex", "gender"))
        if (!setequal(nms_classif_vars_account_trim, nms_classif_vars_rates)) {
            str_account <- paste(nms_classif_vars_account, collapse = ", ")
            str_rates <- paste(nms_classif_vars_rates, collapse = ", ")
            stop("names of classification variables [", str_account, "] for series '",
                 nm_series, "' in 'account' not consistent with names of",
                 " classification variables [", str_rates, "] for series '",
                 nm_series, "' in 'rates'")
        }
    }
    ## levels of classification variables for account
    ## identical to levels of of classification
    ## variables for rates, apart from time
    for (nm_series in nms_series_rates) {
        data_account <- account[[nm_series]]
        data_rates <- rates[[nm_series]]
        nms_rates <- names(data_rates)
        nms_check <- setdiff(nms_rates, c("time", "rate"))
        for (nm in nms_check) {
            var_account <- data_account[[nm]]
            var_rates <- data_rates[[nm]]
            levels_account <- unique(var_account)
            levels_rates <- unique(var_rates)
            is_in_rates <- levels_account %in% levels_rates
            i_not_in_rates <- match(FALSE, is_in_rates, nomatch = 0L)
            if (i_not_in_rates > 0L)
                stop("'", nm, "' variable for series '", nm_series,
                     "' in 'account' has value [", levels_account[[i_not_in_rates]],
                     "] not found in '", nm, "' variable for series '",
                     nm_series, "' in 'rates'")
            is_in_account <- levels_rates %in% levels_account
            i_not_in_account <- match(FALSE, is_in_account, nomatch = 0L)
            if (i_not_in_account > 0L)
                stop("'", nm, "' variable for series '", nm_series,
                     "' in 'rates' has value [", levels_rates[[i_not_in_account]],
                     "] not found in '", nm, "' variable for series '",
                     nm_series, "' in 'account'")
        }
    }
    TRUE    
}


## HAS_TESTS
## Check that the classification variables of 'rates' and
## 'data_models' are consistent.
## 'rates' - a list of data frames
## 'data_models - a list of data models
## return value - TRUE or an error
check_consistent_rates_data_models <- function(rates, data_models) {
    nms_series_rates <- names(rates)
    nms_series_dm <- vapply(data_models, function(x) x$nm_series, "nm")
    nms_data_dm <- vapply(data_models, function(x) x$nm_data, "nm")
    ## 'data_models' series, apart from "population",
    ## are subset of 'rates' series
    is_in_rates <- nms_series_dm %in% nms_series_rates
    is_popn <- nms_series_dm == "population"
    i_not_in <- match(FALSE, is_in_rates | is_popn, nomatch = 0L)
    if (i_not_in > 0L) {
        nm_not_in <- nms_series_dm[[i_not_in]]
        stop("'data_models' has model for series [", nm_not_in,
             "] not found in 'rates'")
    }
    ## same classification variables, apart from cohort,
    ## and (for births) sex/gender
    for (i_dm in seq_along(data_models)) {
        nm_series <- nms_series_dm[[i_dm]]
        if (nm_series != "population") {
            i_rates <- match(nm_series, nms_series_rates)
            nms_classif_vars_dm <- setdiff(names(data_models[[i_dm]]$data), "count")
            nms_classif_vars_rates <- setdiff(names(rates[[i_rates]]), "rate")
            nms_classif_vars_dm_trim <- setdiff(nms_classif_vars_dm, "cohort")
            if (nm_series == "births")
                nms_classif_vars_dm_trim <- setdiff(nms_classif_vars_dm_trim, c("sex", "gender"))
            if (!setequal(nms_classif_vars_dm_trim, nms_classif_vars_rates)) {
                str_dm <- paste(nms_classif_vars_dm, collapse = ", ")
                str_rates <- paste(nms_classif_vars_rates, collapse = ", ")
                stop("names of classification variables [", str_dm, "] for data model ",
                     "for dataset '", nms_data_dm[[i_dm]], "' and series '",
                     nm_series, "' in 'data_models' not consistent with names of ",
                     "classification variables [", str_rates,
                     "] for series '", nm_series, "' in 'rates'")
            }
        }
    }
    ## levels of classification variables for data models
    ## are subset of levels of classification
    ## variables for rates 
    for (i_dm in seq_along(data_models)) {
        nm_series <- nms_series_dm[[i_dm]]
        if (nm_series != "population") {
            i_rates <- match(nm_series, nms_series_rates)
            data_dm <- data_models[[i_dm]]$data
            data_rates <- rates[[i_rates]]
            nms_vars_rates <- names(data_rates)
            nms_classif_vars_rates <- setdiff(nms_vars_rates, "rate")
            for (nm in nms_classif_vars_rates) {
                var_dm <- data_dm[[nm]]
                var_rates <- data_rates[[nm]]
                levels_dm <- unique(var_dm)
                levels_rates <- unique(var_rates)
                is_included <- levels_dm %in% levels_rates
                i_not_included <- match(FALSE, is_included, nomatch = 0L)
                if (i_not_included > 0L)
                    stop("'", nm, "' variable from data model for dataset '",
                         nms_data_dm[[i_dm]], "' and series '", nm_series,
                         "' in 'data_models' has value [", levels_dm[[i_not_included]],
                         "] not found in '", nm, "' variable from series '",
                         nm_series, "' in 'rates'")
            }
        }
    }
    TRUE    
}


## HAS_TESTS
## Check that 'data' meets the requirements for an
## object of class "datamodel"
## 'data' - a data frame
## 'is_popn' - whether the data refers to population
## return value - TRUE or an error
check_data_dm <- function(data, is_popn) {
    if (!is.data.frame(data))
        stop("'data' has class \"", class(data), "\"")
    nms <- names(data)
    check_names_df_valid(nms_vars = nms,
                         nm_obj = "data",
                         incl_cohort = !is_popn,
                         incl_sexgender = TRUE,
                         incl_region = NULL,
                         nm_measure_var = "count")
    nms_classif_vars <- setdiff(nms, "count")
    classif_vars <- data[nms_classif_vars]
    check_classif_vars_valid(classif_vars = classif_vars,
                             nm_obj = "data")
    check_measure_var_valid(var = data[["count"]],
                            nm_var = "count",
                            nm_obj = "data",
                            na_ok = TRUE,
                            frac_ok = FALSE)
    TRUE
}


## HAS_TESTS
## Check that a to be used in an 'exact' data model
## has all levels of classification variables
## (which is not required in other data model classes.)
## If 'data' is births, then lower and upper
## age groups can be missing.
## 'data' - a data frame
## 'is_births' - whether the data refers to births
check_data_exact <- function(data, is_births) {
    nms_classif_vars <- setdiff(names(data), "count")
    classif_vars <- data[nms_classif_vars]
    check_classif_vars_complete(classif_vars = classif_vars,
                                nm_obj = "data",
                                no_min_age = is_births)
    TRUE
}


## HAS_TESTS
## Check that 'data_models' argument is valid.
## Do not need to check internal consistency
## of individual data models, since this is done
## by checking functions for data models.
## 'data_models' - a list of object of class "data_model"
## return value - TRUE or an error
check_data_models <- function(data_models) {
    ## 'data_models' is list
    if (!is.list(data_models))
        stop("'data_models' is not a list")
    ## return if length 0
    n <- length(data_models)
    if (identical(n, 0L))
        return(TRUE)
    ## all elements are data models
    is_data_model <- vapply(data_models,
                            FUN = inherits,
                            FUN.VALUE = TRUE,
                            what = "datamodel")
    i_not_data_model <- match(FALSE, is_data_model, nomatch = 0L)
    if (i_not_data_model > 0L)
        stop("'data_models' has element with class \"",
             class(data_models[[i_not_data_model]]),
             "\"")
    ## datasets unique
    nms_data <- vapply(data_models, function(x) x$nm_data, "nm")
    is_duplicated_data <- duplicated(nms_data)
    i_duplicated_data <- match(TRUE, is_duplicated_data, nomatch = 0L)
    if (i_duplicated_data > 0L)
        stop("two models in 'data_models' refer to the same dataset : '",
             nms_data[[i_duplicated_data]], "'")
    ## exactly one 'births' series and 'deaths' series
    nms_series <- vapply(data_models, function(x) x$nm_series, "nm")
    for (nm_series in c("births", "deaths")) {
        is_series <- nm_series == nms_series
        if (sum(is_series) == 0L)
            stop("'data_models' does not have a model for ", nm_series)
        if (sum(is_series) >= 2L)
            stop("'data_models' has two or more models for ", nm_series)
    }
    ## consistency across data models
    nm_measure_var <- "count"
    nms_classif_all <- lapply(data_models,
                              function(x) setdiff(names(x$data), nm_measure_var))
    classif_vars_all <- lapply(seq_along(data_models),
                               function(i) data_models[[i]]$data[nms_classif_all[[i]]])
    i_deaths <- match("deaths", nms_series)
    nms_classif_template <- nms_classif_all[[i_deaths]]
    classif_vars_template <- data_models[[i_deaths]]$data[nms_classif_template]
    nms_popn <- nms_data[nms_series == "population"]
    no_cohort <- nms_popn
    no_sexgender <- character()
    absent_ok <- TRUE
    ignore_age_cohort <- "births"
    omit_first_time <- nms_popn
    check_classif_vars_consistent_names(nms_classif_all = nms_classif_all,
                                        nms_obj_all = nms_data,
                                        nms_classif_template = nms_classif_template,
                                        no_cohort = no_cohort,
                                        no_sexgender = no_sexgender)
    check_classif_vars_consistent_levels(classif_vars_all = classif_vars_all,
                                         nms_obj_all = nms_data,
                                         classif_vars_template = classif_vars_template,
                                         absent_ok = absent_ok,
                                         ignore_age_cohort = ignore_age_cohort,
                                         omit_first_time = omit_first_time)
    ## return TRUE if all tests pass
    TRUE
}


## HAS_TESTS
## Check that the 'disp' argument in a data model is valid
## 'disp' - a data frame or numeric scalar
## 'data' - a data frame
## return value - TRUE or an error
check_disp_dm <- function(disp, data) {
    check_arg_dm(arg = disp,
                 nm_arg = "disp",
                 data = data,
                 neg_ok = FALSE,
                 zero_ok = FALSE)
}


## HAS_TESTS
## Check that the 'dominant' argument is
## a valid string.
## 'dominant' - a character vector of length 1
## return value - TRUE or an error
check_dominant <- function(dominant) {
    ## is character
    if (!is.character(dominant))
        stop("'dominant' is non-character")
    ## has length 1
    if (!identical(length(dominant), 1L))
        stop("'dominant' does not have length 1")
    ## not NA
    if (is.na(dominant))
        stop("'dominant' is NA")
    ## not blank
    if (!nzchar(dominant))
        stop("'dominant' is blank")
    TRUE
}


## HAS_TESTS
## Check that  accounting identities are satisfied.
## 'account' - a demographic account
## return value - TRUE or a warning (not an error)
check_identities <- function(account) {
    increments <- to_increments(account)
    if (any(increments$discrepancy != 0))
        warning("accounting identities not satisfied")
    TRUE
}


## HAS_TESTS
## Check that a data model refers to births or deaths
## 'x' - a data model
## TRUE or an error
check_is_births_deaths <- function(x) {
    nm_series <- x$nm_series
    if (!(nm_series %in% c("births", "deaths")))
        stop("data model has class '", class(x)[[1L]],
             "' but series is '", nm_series, "'")
    TRUE
}


## HAS_TESTS
## Check that a data model does not refer to births or deaths
## 'x' - a data model
## TRUE or an error
check_is_not_births_deaths <- function(x) {
    nm_series <- x$nm_series
    if (nm_series %in% c("births", "deaths"))
        stop("data model has class '", class(x)[[1L]],
             "' but series is '", nm_series, "'")
    TRUE
}


## HAS_TESTS
## Check that a measure variable is valid. The measure variable
## can be numeric or a list column: it is up to the calling
## function to check whether either form is appropriate.
## 'var' - the measure variable
## 'nm_var' - the name of the measure variable
## 'nm_obj' - the name of the data frame that 'var' was taken from
## 'na_ok' - whether 'var' can contain NAs
## 'frac_ok' - whether 'var' can contain non-integer values
## return value - TRUE or an error message
check_measure_var_valid <- function(var, nm_var, nm_obj, na_ok, frac_ok) {
    if (is.numeric(var))
        check_measure_var_valid_num(var = var,
                                    nm_var = nm_var,
                                    nm_obj = nm_obj,
                                    na_ok = na_ok,
                                    frac_ok = frac_ok)
    else if (is.list(var))
        check_measure_var_valid_list(var = var,
                                     nm_var = nm_var,
                                     nm_obj = nm_obj,
                                     na_ok = na_ok,
                                     frac_ok = frac_ok)
    else
        stop("'", nm_var, "' variable in '", nm_obj, "' is not numeric or list")
}


## HAS_TESTS
## Check that a measure variable is valid, given that
## the measure variable is a list of numeric vectors
## 'var' - the measure variable
## 'nm_var' - the name of the measure variable
## 'nm_obj' - the name of the data frame that 'var' was taken from
## 'na_ok' - whether 'var' can contain NAs
## 'frac_ok' - whether 'var' can contain non-integer values
## return value - TRUE or an error message
check_measure_var_valid_list <- function(var, nm_var, nm_obj, na_ok, frac_ok) {
    ## elements numeric
    is_numeric <- vapply(var, is.numeric, TRUE)
    if (!all(is_numeric))
        stop("'", nm_var, "' variable in '", nm_obj, "' has non-numeric elements")
    ## all elements have the same length
    if (length(var) > 1L) {
        lengths <- vapply(var, length, 1L)
        if (any(lengths[-1L] != lengths[[1L]]))
            stop("elements of '", nm_var, "' variable in '", nm_obj, "' have different lengths")
    }
    ## non-NA
    if (!na_ok) {
        has_na <- vapply(var, anyNA, TRUE)
        if (any(has_na))
            stop("'", nm_var, "' variable in '", nm_obj, "' has elements with NAs")
    }
    ## non-negative
    x_has_neg <- function(x) any(x[!is.na(x)] < 0)
    has_neg <- vapply(var, x_has_neg, TRUE)
    if (any(has_neg))
        stop("'", nm_var, "' variable in '", nm_obj, "' has elements with negative values")
    ## whole number
    if (!frac_ok) {
        x_has_frac <- function(x) any(x[!is.na(x)] != round(x[!is.na(x)]))
        has_frac <- vapply(var, x_has_frac, TRUE)
        if (any(has_frac))
            stop("'", nm_var, "' variable in '", nm_obj, "' has elements with non-integer values")
    }
    ## return
    TRUE
}


## HAS_TESTS
## Check that a measure variable is valid, given that
## the measure variable is a numeric vector
## 'var' - the measure variable
## 'nm_var' - the name of the measure variable
## 'nm_obj' - the name of the data frame that 'var' was taken from
## 'na_ok' - whether 'var' can contain NAs
## 'frac_ok' - whether 'var' can contain non-integer values
## return value - TRUE or an error message
check_measure_var_valid_num <- function(var, nm_var, nm_obj, na_ok, frac_ok) {
    ## non-NA
    if (!na_ok) {
        if (anyNA(var))
            stop("'", nm_var, "' variable in '", nm_obj, "' has NAs")
    }
    var <- var[!is.na(var)]
    ## non-negative
    if (any(var < 0))
        stop("'", nm_var, "' variable in '", nm_obj, "' has negative values")
    ## whole number
    if (!frac_ok) {
        if (any(var != round(var)))
            stop("'", nm_var, "' variable in '", nm_obj, "' has non-integer values")
    }
    ## return
    TRUE
}


## HAS_TESTS
## Check that 'n_particle' is a positive whole number
## 'n_particle' - a numeric vector of length 1
## return value - TRUE or an error
check_n_particle <- function(n_particle) {
    ## is numeric
    if (!is.numeric(n_particle))
        stop("'n_particle' is non-numeric")
    ## has length 1
    if (!identical(length(n_particle), 1L))
        stop("'n_particle' does not have length 1")
    ## not NA
    if (is.na(n_particle))
        stop("'n_particle' is NA")
    ## is an integer
    if (n_particle != round(n_particle))
        stop("'n_particle' is not an integer")
    ## is at least 1
    if (n_particle < 1)
        stop("'n_particle' is less than 1")
    TRUE
}

## HAS_TESTS
## Check that 'n_thin' is a positive whole number
## 'n_thin' - a numeric vector of length 1
## return value - TRUE or an error
check_n_thin <- function(n_thin) {
    ## is numeric
    if (!is.numeric(n_thin))
        stop("'n_thin' is non-numeric")
    ## has length 1
    if (!identical(length(n_thin), 1L))
        stop("'n_thin' does not have length 1")
    ## not NA
    if (is.na(n_thin))
        stop("'n_thin' is NA")
    ## is an integer
    if (n_thin != round(n_thin))
        stop("'n_thin' is not an integer")
    ## is at least 1
    if (n_thin < 1)
        stop("'n_thin' is less than 1")
    TRUE
}

## HAS_TESTS
## Check that 'n_thread' is NULL
## Will revise as part of introducing
## parallel processing.
check_n_thread <- function(n_thread) {
    if (!is.null(n_thread))
        warning("'n_thread' is currently ignored")
    TRUE
}


## HAS_TESTS
## Check that the column names of a data frame holding
## demographic rates or counts are valid.
## If an 'incl' argument is TRUE, then 'nms_vars' must include
## the associated name. If an 'incl' argument is FALSE, then
## 'nms_vars' must not include the associated name. If an
## 'incl' argument is NULL, then 'nms_vars' can either include or
## not include the associated name.
## If 'incl_sexgender' is TRUE, then the associated
## column can be called "sex" or "gender".
## 'nms_vars' - the column nms_vars of the data frame
## 'nm_obj' - the name of the data frame
## 'incl_cohort' - whether the nms_vars should include "cohort"
## 'incl_sexgender' - whether the nms_vars should include "sex" or "gender"
## 'incl_region' - whether the nms_vars should include "region"
check_names_df_valid <- function(nms_vars,
                                 nm_obj,
                                 incl_cohort,
                                 incl_sexgender,
                                 incl_region,
                                 nm_measure_var) {
    check_names_valid(names = nms_vars,
                      nm_obj = nm_obj)
    sets_names_classif_vars <- get_sets_names_classif_vars(incl_cohort = incl_cohort,
                                                           incl_sexgender = incl_sexgender,
                                                           incl_region = incl_region)
    sets_names_vars <- lapply(sets_names_classif_vars,
                              append,
                              values = nm_measure_var)
    found <- FALSE
    for (set_nms_vars in sets_names_vars) {
        found <- setequal(nms_vars, set_nms_vars)
        if (found)
            break
    }
    if (!found)
        stop("'", nm_obj, "' variables [", paste(nms_vars, collapse = ", "),
             "] incomplete or invalid")
    TRUE
}


## HAS_TESTS
## Check that a combination of series names is valid.
## Under all settings, "births" and "deaths" are required.
## 'names' - the series names to be checked
## 'nm_obj' - the name of the object that the names
##    were taken from
## 'incl_popn' - whether then names should include "population"
## 'incl_stock' - whether then names should include "stock"
## 'subset_ok' - whether 'names' can be a subset of the target names
##    or has to have every member
## return value - TRUE or an error
check_names_series_valid <- function(names,
                                     nm_obj,
                                     incl_popn,
                                     incl_stock,
                                     subset_ok) {
    check_names_valid(names = names,
                      nm_obj = nm_obj)
    sets_names_series <- get_sets_names_series(incl_popn = incl_popn,
                                               incl_stock = incl_stock)
    found <- FALSE
    for (set_names in sets_names_series) {
        if (subset_ok)
            found <- (all(names %in% set_names)
                && ("births" %in% names)
                && ("deaths" %in% names))
        else
            found <- setequal(names, set_names)
        if (found)
            break
    }
    if (!found)
        stop("'", nm_obj, "' series names [", paste(names, collapse = ", "),
             "] incomplete or invalid")
    TRUE
}


## HAS_TESTS
## Check that 'names' is non-NULL,
## and has no NAs, blanks, or duplicates
## 'names' - a vector of possible names, or NULL
## 'nm' - the name of the object that the names
##    were taken from
## return value - TRUE or an error
check_names_valid <- function(names, nm_obj) {
    if (is.null(names))
        stop("'", nm_obj, "' does not have names")
    ## names have no NAs
    if (anyNA(names))
        stop("names for '", nm_obj, "' have NAs")
    ## names have no blanks
    if (any(names == ""))
        stop("names for '", nm_obj, "' have blanks")
    ## names have no duplicates
    if (any(duplicated(names)))
        stop("names for '", nm_obj, "' have duplicates")
    TRUE
}

## HAS_TESTS
## Check that 'nm_data' is a valid dataset name
check_nm_data <- function(nm_data) {
    ## is character
    if (!is.character(nm_data))
        stop("'nm_data' is non-character")
    ## has length 1
    if (!identical(length(nm_data), 1L))
        stop("'nm_data' does not have length 1")
    ## is not NA
    if (is.na(nm_data))
        stop("'nm_data' is NA")
    ## not blank
    if (!nzchar(nm_data))
        stop("'nm_data' is blank")
    TRUE
}

## HAS_TESTS
## Check that 'nm_series' is a valid series name, which
## depends on whether 'names_data' includes "region"
check_nm_series <- function(nm_series, names_data) {
    ## is character
    if (!is.character(nm_series))
        stop("'nm_series' is non-character")
    ## has length 1
    if (!identical(length(nm_series), 1L))
        stop("'nm_series' does not have length 1")
    ## is not NA
    if (is.na(nm_series))
        stop("'nm_series' is NA")
    ## is valid series
    incl_internal <- "region" %in% names_data
    valid_nms_series <- get_names_series(incl_popn = TRUE,
                                         incl_stock = FALSE,
                                         one_imem = NULL,
                                         incl_internal = incl_internal)
    if (!(nm_series %in% valid_nms_series))
        stop("'nm_series' has invalid value [\"", nm_series, "\"]")
    TRUE
}

## HAS_TESTS
## Check that 'prob' is a number between 0 and 1 (exclusive)
## 'prob' - A numeric scalar
## return value - TRUE or an error
check_prob <- function(prob) {
    if (!is.numeric(prob))
        stop("'prob' is non-numeric")
    if (!identical(length(prob), 1L))
        stop("'prob' does not have length 1")
    if (is.na(prob))
        stop("'prob' is NA")
    if (prob <= 0)
        stop("'prob' is less than or equal to 0")
    if (prob >= 1)
        stop("'prob' is greater than or equal to 1")
    TRUE
}


## HAS_TESTS
## Check 'rates' argument.
## 'rates' - a list of data frames
## return value - TRUE or an error
check_rates <- function(rates) {
    ## 'rates' is list
    if (!is.list(rates))
        stop("'rates' is not a list")
    ## elements of 'rates' all data frames
    if (!all(vapply(rates, is.data.frame, TRUE)))
        stop("'rates' has elements that are not data frames")
    ## names for 'rates' are valid demographic series
    nms_rates <- names(rates)
    check_names_series_valid(names = nms_rates,
                             nm_obj = "rates",
                             incl_popn = FALSE,
                             incl_stock = FALSE,
                             subset_ok = FALSE)
    ## Has 'region' variable iff has 'internal_in'. Only need to
    ## check first data frame - if data frames inconsistent,
    ## this will be picked up later.
    check_region_iff_internal(nms_vars = names(rates[[1L]]),
                              nms_series = nms_rates)
    ## internal consistency of each data frame
    for (i in seq_along(rates)) {
        df <- rates[[i]]
        nm_obj <- nms_rates[[i]]
        nms_vars <- names(df)
        nm_measure_var <- "rate"
        measure_var <- df[[nm_measure_var]]
        nms_classif_vars <- setdiff(nms_vars, nm_measure_var)
        classif_vars <- df[nms_classif_vars]
        incl_cohort <- FALSE
        incl_sexgender <- nm_obj != "births"
        incl_region <- "internal_in" %in% nms_rates
        no_min_age <- nm_obj == "births"
        na_ok <- FALSE
        frac_ok <- TRUE
        tryCatch({
            check_names_df_valid(nms_vars = nms_vars,
                                 nm_obj = nm_obj,
                                 incl_cohort = incl_cohort,
                                 incl_sexgender = incl_sexgender,
                                 incl_region = incl_region,
                                 nm_measure_var = nm_measure_var)
            check_classif_vars_valid(classif_vars = classif_vars,
                                     nm_obj = nm_obj)
            check_classif_vars_complete(classif_vars = classif_vars,
                                        nm_obj = nm_obj,
                                        no_min_age = no_min_age)
            check_measure_var_valid(var = measure_var,
                                    nm_var = nm_measure_var,
                                    nm_obj = nm_obj,
                                    na_ok = na_ok,
                                    frac_ok = frac_ok)
        },
        error = function(e) {
            msg <- paste("problem with 'rates' :", e$message)
            stop(msg, call. = FALSE)
        })
    }
    ## consistency across data frames
    nms_classif_all <- lapply(rates, function(x) setdiff(names(x), nm_measure_var))
    classif_vars_all <- lapply(seq_along(rates), function(i) rates[[i]][nms_classif_all[[i]]])
    i_deaths <- match("deaths", nms_rates)
    nms_classif_template <- nms_classif_all[[i_deaths]]
    classif_vars_template <- rates[[i_deaths]][nms_classif_template]
    no_cohort <- character()
    no_sexgender <- "births"
    absent_ok <- FALSE
    ignore_age_cohort <- "births"
    check_classif_vars_consistent_names(nms_classif_all = nms_classif_all,
                                        nms_obj_all = nms_rates,
                                        nms_classif_template = nms_classif_template,
                                        no_cohort = no_cohort,
                                        no_sexgender = no_sexgender)
    check_classif_vars_consistent_levels(classif_vars_all = classif_vars_all,
                                         nms_obj_all = nms_rates,
                                         classif_vars_template = classif_vars_template,
                                         absent_ok = absent_ok,
                                         ignore_age_cohort = ignore_age_cohort)
    ## return TRUE if all tests pass
    TRUE
}


## HAS_TESTS
## Check that the 'ratio' argument in a data model is valid
## 'ratio' - a data frame or numeric scalar
## 'data' - a data frame
## return value - TRUE or an error
check_ratio_dm <- function(ratio, data) {
    check_arg_dm(arg = ratio,
                 nm_arg = "ratio",
                 data = data,
                 neg_ok = FALSE,
                 zero_ok = TRUE)
}


## HAS_TESTS
## Check that classification variables contain 'region' iff
## series include 'internal_in'.
## 'nms_vars' - names of variables
## 'nms_series' - names of demographic series
## return value - TRUE or an error message
check_region_iff_internal <- function(nms_vars, nms_series) {
    has_region <- "region" %in% nms_vars
    has_internal <- "internal_in" %in% nms_series
    ## if has 'internal_in', then all elements have "region" variable
    if (has_region && !has_internal)
        stop("have 'region' variable but do not have series",
             " 'internal_in' and 'internal_out'")
    if (!has_region & has_internal)
        stop("have series 'internal_in' and 'internal_out' but do not",
             " have 'region' variable")
    TRUE
}


## HAS_TESTS
## Check that 'threshold' is a number between 0 and 1 (inclusive)
## 'threshold' - a number
## return value - TRUE or an error
check_threshold <- function(threshold) {
    if (!is.numeric(threshold))
        stop("'threshold' is non-numeric")
    if (!identical(length(threshold), 1L))
        stop("'threshold' does not have length 1")
    if (is.na(threshold))
        stop("'threshold' is NA")
    if (threshold < 0)
        stop("'threshold' is negative")
    if (threshold > 1)
        stop("'threshold' is greater than 1")
    TRUE
}







    
ONSdigital/Bayesian-demographic-accounts documentation built on Jan. 10, 2022, 12:34 a.m.