R/make_inputs.R

Defines functions tidy_data_dm make_df_rates make_df_meta make_df_forecast make_df_extend make_df_estimate make_df_cdms make_df_birthdeath make_deaths make_classif_vars make_births_to make_births_of

## HAS_TESTS
## Create a data frame with classification vartiables 'cohort'
## and 'sex'/'gender', and with measure variables 'is_new_cohort',
## and 'births_of'. 'is_new_cohort' is a logical vector.
## 'births_of' is a list of scalars for national data,
## and a list of vectors for regional data,
## with NAs where 'is_new_cohort' is FALSE.
## 'births_of' is births of members of the cohort,
## ans is the initial population for a cohort born during the period.
make_births_of <- function(data_births, classif_vars) {
    nms_classif <- names(classif_vars)
    has_region <- "region" %in% nms_classif
    nms_classif_ans <- intersect(c("cohort", "sex", "gender"), nms_classif)
    nms_keep <- intersect(c("time", "sex", "gender", "region"), nms_classif)
    data_ag <- stats::aggregate(data_births["count"],
                                data_births[nms_keep],
                                sum)
    names(data_ag)[match("time", names(data_ag))] <- "cohort"
    if (has_region)
        data_ag <- sort_df(data_ag, ignore = "count") ## to sort regions
    data_ag <- to_list_col(df = data_ag,
                           nm_f = nms_classif_ans,
                           nm_x = "count")
    names(data_ag)[match("count", names(data_ag))] <- "counts_births_of"
    ans <- unique(classif_vars[nms_classif_ans])
    ans <- sort_df(ans)
    ans$is_new_cohort <- ans$cohort %in% data_ag$cohort
    ans <- merge(ans, data_ag,
                 by = nms_classif_ans,
                 all.x = TRUE,
                 sort = FALSE)
    ans <- sort_df(ans, ignore = c("counts_births_of", "is_new_cohort"))
    ans
}

## HAS_TESTS
## Create a data frame with classification variables "cohort",
## and "sex"/"gender", and with measure variable "births_to",
## which is a list of scalars for national data, and a list of vectors for
## regional data. The measurement is the number of births occurring
## to members of a cohort. The value is 0 where
## the sex/gender is not dominant.
make_births_to <- function(data_births, classif_vars, dominant) {
    nms_classif <- names(classif_vars)
    nms_classif_nosexgender <- setdiff(nms_classif, c("sex", "gender"))
    nm_sexgender <- intersect(nms_classif, c("sex", "gender"))
    nms_classif_ans <- c("cohort", nm_sexgender)
    has_region <- "region" %in% nms_classif
    data_ag <- stats::aggregate(data_births["count"],
                                data_births[nms_classif_nosexgender],
                                sum)
    data_ag[[nm_sexgender]] <- dominant
    data_ag <- sort_classif_cols(data_ag, ignore = "count")
    data_ag <- sort_df(data_ag, ignore = "count")
    data_ag <- to_list_col(df = data_ag,
                           nm_f = nms_classif_ans,
                           nm_x = "count")
    names(data_ag)[match("count", names(data_ag))] <- "counts_births_to"
    if (has_region) {
        n_region <- nlevels(classif_vars$region)
        data_ag$counts_births_to <- lapply(data_ag$counts_births_to, matrix, nrow = n_region)
    }
    ans <- unique(classif_vars[nms_classif_ans])
    is_dominant <- ans[[nm_sexgender]] == dominant
    is_in_repr_ages <- ans$cohort %in% data_ag$cohort
    ans$has_births_to <- is_dominant & is_in_repr_ages
    ans <- merge(ans, data_ag,
                 by = nms_classif_ans,
                 all.x = TRUE,
                 sort = FALSE)
    ans <- sort_df(ans,
                   ignore = c("has_births_to", "counts_births_to"))
    ans
}


## HAS_TESTS
## rates is a list of data frames that
## has been checked via 'check_rates_list',
## 'check_rates_single' and 'check_rates_consistent'
make_classif_vars <- function(rates) {
    deaths <- rates$deaths
    cols_base <- intersect(c("age", "sex", "gender", "time", "region"),
                           names(deaths))
    ans <- deaths[cols_base]
    ans <- rbind(ans, ans)
    is_upper <- rep(c(FALSE, TRUE), each = nrow(ans) / 2L)
    ans$cohort <- ans$time - ans$age - is_upper
    ans <- sort_classif_cols(ans)
    has_region <- "region" %in% names(ans)
    if (has_region)
        if (!is.factor(ans$region))
            ans$region <- factor(ans$region, levels = unique(ans$region))
    ans <- sort_df(ans)
    ans
}


## HAS_TESTS
## Create a data frame of deaths aligned exactly to 'classif_vars'.
## Assume that 'data_deaths' has been checked - including
## checking that it has all the combinations of variables
## in 'classif_vars'. The measurement variable, "deaths",
## in the return value is a list of matrices if
## the 'data_deaths' has a 'region' column, and
## otherwise a list of vectors.
make_deaths <- function(data_deaths, classif_vars) {
    nms_classif <- names(classif_vars)
    nms_classif_ans <- intersect(c("cohort", "sex", "gender"), nms_classif)
    has_region <- "region" %in% nms_classif
    ans <- sort_classif_cols(data_deaths, ignore = "count")
    ans <- sort_df(ans, ignore = "count") # 'region' varies fastest if present
    ans <- to_list_col(df = ans,
                       nm_f = nms_classif_ans,
                       nm_x = "count")
    names(ans)[match("count", names(ans))] <- "counts_deaths"
    if (has_region) {
        n_region <- nlevels(classif_vars$region)
        ans$counts_deaths <- lapply(ans$counts_deaths, matrix, nrow = n_region)
    }
    ans
}


## HAS_TESTS
## make counts of births and deaths
make_df_birthdeath <- function(data_models, classif_vars, dominant) {
    nm_series <- vapply(data_models, function(x) x$nm_series, "")
    data_births <- data_models[[match("births", nm_series)]]$data
    data_deaths <- data_models[[match("deaths", nm_series)]]$data
    nms_classif <- names(classif_vars)
    nms_classif_ans <- intersect(c("cohort", "sex", "gender"), nms_classif)
    births_of <- make_births_of(data_births = data_births,
                                classif_vars = classif_vars)
    births_to <- make_births_to(data_births = data_births,
                                classif_vars = classif_vars,
                                dominant = dominant)
    deaths <- make_deaths(data_deaths = data_deaths,
                          classif_vars = classif_vars)
    ans <- unique(classif_vars[nms_classif_ans])
    left_join <- function(x, y)
        merge(x, y, by = nms_classif_ans, sort = FALSE)
    ans <- left_join(ans, births_of)
    ans <- left_join(ans, births_to)
    ans <- left_join(ans, deaths)
    ans
}


## HAS_TESTS
## 'data-models' is a list of object of class "datamodel"
## 'classif_vars' is a data frame
make_df_cdms <- function(data_models, classif_vars) {
    nms_classif <- names(classif_vars)
    has_region <- "region" %in% nms_classif
    nms_classif_ans <- intersect(c("cohort", "sex", "gender"), nms_classif)
    nms_series_ans <- get_nms_series_cdms(nms_classif)
    nms_series_dm <- vapply(data_models, function(x) x$nm_series, "")
    is_birth_death <- nms_series_dm %in% c("births", "deaths")
    data_models <- data_models[!is_birth_death]
    nms_series_dm <- nms_series_dm[!is_birth_death]
    ans <- unique(classif_vars[nms_classif_ans])
    for (i in seq_along(data_models)) {
        nm <- nms_series_dm[[i]]
        if (identical(nm, "population")) {
            nms_series_dm[[i]] <- "stock"
            cohort <- with(data_models[[i]]$data, time - age)
            data_models[[i]]$data$cohort <- cohort
        }
    }
    nms_series_dm[grep("^immigration$", nms_series_dm)] <- "immigration1"
    nms_series_dm[grep("^emigration$", nms_series_dm)] <- "emigration1"
    l <- lapply(data_models,
                make_list_cdm,
                classif_vars = classif_vars)
    constructor <- if (has_region) new_CdmsWithreg else new_CdmsNoreg
    for (nm in nms_series_ans) {
        i_series <- grep(nm, nms_series_dm)
        n_series <- length(i_series)
        if (n_series > 0L) {
            l_series <- l[i_series]
            l_series <- do.call(function(...) mapply(list, ..., SIMPLIFY = FALSE),
                                args = l_series)
            l_series <- lapply(l_series, constructor)
        }
        else
            l_series <- rep(list(constructor()), times = nrow(ans))
        cdms_nm <- paste("cdms", nm, sep = "_")
        ans[[cdms_nm]] <- l_series
    }
    ans
}


make_df_estimate <- function(rates,
                             data_models,
                             dominant) {
    ## 'classif_vars' is a data frame with cols "cohort", "sex"/"gender",
    ## "time", "age", and, optionally, "region"
    classif_vars <- make_classif_vars(rates)
    nms_classif <- names(classif_vars)
    nms_classif_ans <- intersect(c("cohort", "sex", "gender"), nms_classif)
    ## 'df_birthdeath' is a data frame with cols "cohort", "sex"/"gender",
    ## "is_new_cohort", "counts_births_of", "has_births_to", "counts_births_to",
    ## "counts_deaths"
    df_birthdeath <- make_df_birthdeath(data_models = data_models,
                                        classif_vars = classif_vars,
                                        dominant = dominant)
    ## 'df_rates' is a data frame with cols "cohort", "sex"/"gender",
    ## "rates_births", "rates_deaths", "rates_immigration1",
    ## "rates_emigration1", "rates_immigration2", "rates_emigration2",
    ## and possibly "rates_internal_in", "rates_internal_out"
    df_rates <- make_df_rates(rates = rates,
                              classif_vars = classif_vars,
                              dominant = dominant)
    ## 'df_cdms' is a data frame with cols "cohort", "sex"/"gender",
    ## "cdms_stock", "cdms_immigration1", "cdms_emigration1",
    ## "cdms_immigration2", "cdms_emigration2", and possibly
    ## "cdms_internal_in", "cdms_internal_out"
    df_cdms <- make_df_cdms(data_models = data_models,
                            classif_vars = classif_vars)
    ## store metadata
    df_meta <- make_df_meta(classif_vars = classif_vars)
    ## combine 'df_birthdeath', 'df_rates', 'df_cdms' to create
    ## single data frame with all inputs
    df <- merge(df_birthdeath, df_rates,
                by = nms_classif_ans,
                sort = FALSE)
    df <- merge(df, df_cdms,
                by = nms_classif_ans,
                sort = FALSE)
    ## final processing
    i_sexgender <- grep("^sex$|^gender$", names(df))
    df$sexgender <- df[[i_sexgender]]
    df$initial_stock_fixed <- df$is_new_cohort
    df$n_interval <- vapply(df$counts_deaths, length, 1L)
    ## return
    df
}


make_df_extend <- function(account,
                           rates,
                           data_models,
                           dominant) {
    stop("not written yet")
}


make_df_forecast <- function(account,
                             rates,
                             dominant) {
    stop("not written yet")
}


## HAS_TESTS
make_df_meta <- function(classif_vars) {
    classif_vars <- sort_classif_cols(classif_vars)
    classif_vars <- sort_df(classif_vars)
    nms_classif <- names(classif_vars)
    nms_classif_ans <- intersect(c("cohort", "sex", "gender"), nms_classif)
    nms_meta <- setdiff(nms_classif, nms_classif_ans)
    ans <- unique(classif_vars[nms_classif_ans])
    rownames(ans) <- NULL
    meta <- split(x = classif_vars[nms_meta],
                  f = classif_vars[nms_classif_ans],
                  lex.order = TRUE)
    names(meta) <- NULL
    meta <- lapply(meta, function(x) {rownames(x) <- NULL; x})
    ans$meta <- meta
    ans
}    


## HAS_TESTS
make_df_rates <- function(rates, classif_vars, dominant) {
    ans <- classif_vars
    nms_classif <- names(classif_vars)
    nms_classif_by <- setdiff(nms_classif, "cohort") # rates do not have cohort variable
    nms_classif_ans <- intersect(c("cohort", "sex", "gender"), nms_classif)
    has_region <- "region" %in% nms_classif
    left_join <- function(x, y)
        merge(x, y, by = nms_classif_by, sort = FALSE, all.x = TRUE)
    nms_measure <- c("births", "deaths",
                     "immigration1", "emigration1",
                     "immigration2", "emigration2")
    if (has_region)
        nms_measure <- c(nms_measure, "internal_in", "internal_out")
    nms_measure <- paste("rates", nms_measure, sep = "_")
    ## rename 'rates' columns to names of series
    for (i in seq_along(rates))
        names(rates[[i]])[match("rate", names(rates[[i]]))] <-
            paste("rates", names(rates)[[i]], sep = "_")
    ## births - unlike other series, birth rates do not
    ## have a sex/gender dimension.
    births <- rates[["births"]]
    sexgender <- intersect(c("sex", "gender"), nms_classif)
    births[[sexgender]] <- dominant
    ans <- left_join(ans, births)
    ans$rates_births[is.na(ans$rates_births)] <- 0
    ## deaths
    deaths <- rates[["deaths"]]
    ans <- left_join(ans, deaths)
    ## internal migration (if present)
    if ("internal_in" %in% names(rates)) {
        for (name in c("internal_in", "internal_out")) {
            val <- rates[[name]]
            ans <- left_join(ans, val)
        }
    }
    ## add immigration/emigration
    if ("immigration" %in% names(rates)) {
        for (name in c("immigration", "emigration")) {
            val <- rates[[name]]
            name_measure <- paste("rates", name, sep = "_")
            names(val)[match(name_measure, names(val))] <- paste0(name_measure, 1)
            ans <- left_join(ans, val)
        }
        ans$rates_immigration2 <- 0
        ans$rates_emigration2 <- 0
    }
    else {
        for (name in c("immigration1", "emigration1", "immigration2", "emigration2")) {
            val <- rates[[name]]
            ans <- left_join(ans, val)
        }
    }
    ## sort columns and rows
    ans <- sort_classif_cols(ans, ignore = nms_measure)
    ans <- sort_df(ans, ignore = nms_measure)
    ## nest rates within 'cohort' and 'sex'/'gender' and return
    ans <- to_list_col(ans,
                       nm_f = nms_classif_ans,
                       nm_x = nms_measure)
    if (has_region) {
        n_region <- nlevels(classif_vars$region)
        for (nm in nms_measure)
            ans[[nm]] <- lapply(ans[[nm]], matrix, nrow = n_region)
    }
    ans
}


## HAS_TESTS
tidy_data_dm <- function(data) {
    integer_colnames <- c("age", "cohort", "time")
    numeric_colnames <- "count"
    cols_int <- intersect(colnames(data), integer_colnames)
    cols_num <- intersect(colnames(data), numeric_colnames)
    data[cols_int] <- lapply(data[cols_int], as.integer)
    data[cols_num] <- lapply(data[cols_num], as.numeric)
    data
}
ONSdigital/Bayesian-demographic-accounts documentation built on Jan. 10, 2022, 12:34 a.m.