R/helper-functions.R

Defines functions valid_age_time_cohort to_list_col split_into_rows sort_df sort_classif_cols make_list_dm_df is_lower_triangle fake_classif_vars_popn fake_classif_vars add_initial_to_classif

## HAS_TESTS
## Add an extra combinations to classification
## variables, to represent initial values.
## For existing cohorts, this means the population
## at the end the period before the main estimation period;
## for new cohorts, this means accession to age 0, ie birth.
add_initial_to_classif <- function(classif_vars) {
    time_min <- min(classif_vars$time)
    is_time_min <- with(classif_vars, time == min(time))
    is_age_min <- with(classif_vars, age == 0L)
    is_upper <- with(classif_vars, cohort == time - age - 1L)
    ## existing cohorts
    initial_existing <- classif_vars[is_time_min & is_upper, ]
    initial_existing$time <- initial_existing$time - 1L
    ## new_cohorts
    initial_new <- classif_vars[is_age_min & !is_upper, ]
    initial_new$age <- -1L
    ## result
    classif_vars <- rbind(initial_existing,
                          initial_new, 
                          classif_vars)
    classif_vars
}


## HAS_TESTS
fake_classif_vars <- function(n_age = 3L,
                              n_time = 3L,
                              sex = TRUE,
                              n_region = 0L) {
    args <- list(age = seq.int(from = 0L, to = n_age - 1L),
                 time = seq.int(from = 2000L, length.out = n_time),
                 is_lower = c(TRUE, FALSE))
    if (sex)
        sexgender <- list(sex = c("Female", "Male"))
    else
        sexgender <- list(gender = c("Female", "Male", "Diverse"))
    args <- c(args, sexgender)
    if (n_region > 0L)
        args <- c(args, list(region = seq_len(n_region)))
    ans <- expand.grid(args,
                       stringsAsFactors = FALSE,
                       KEEP.OUT.ATTRS = FALSE)
    if (n_region > 0L)
        ans$region <- factor(ans$region, levels = seq_len(n_region))
    ans$cohort <- ans$time - ans$age - 1L + ans$is_lower
    ans <- ans[-match("is_lower", names(ans))]
    ans <- sort_classif_cols(ans)
    ans <- sort_df(ans)
    ans    
}


## HAS_TESTS
fake_classif_vars_popn <- function(n_age = 3L,
                                   n_time = 3L,
                                   sex = TRUE,
                                   n_region = 0L) {
    args <- list(age = seq.int(from = 0L, to = n_age - 1L),
                 time = seq.int(from = 1999L, length.out = n_time + 1L))
    if (sex)
        sexgender <- list(sex = c("Female", "Male"))
    else
        sexgender <- list(gender = c("Female", "Male", "Diverse"))
    args <- c(args, sexgender)
    if (n_region > 0L)
        args <- c(args, list(region = seq_len(n_region)))
    ans <- expand.grid(args,
                       stringsAsFactors = FALSE,
                       KEEP.OUT.ATTRS = FALSE)
    if (n_region > 0L)
        ans$region <- factor(ans$region, levels = seq_len(n_region))
    ans <- sort_classif_cols(ans)
    ans <- sort_df(ans)
    ans    
}


## HAS_TESTS
is_lower_triangle <- function(age, time, cohort) {
    (time - cohort) == age
}


## HAS_TESTS
## Make list of vectors or matrices of values from a
## data model that align to 'classif_vars'. 
## 'df' - a data frame extracted from an object of class "data_model"
## 'classif_vars' - a data frame
## 'is_popn' - whether the data model refers to population
## return value - a list of vectors or matrices
make_list_dm_df <- function(df, classif_vars, is_popn) {
    if (is_popn) {
        df$cohort <- df$time - df$age
        classif_vars <- add_initial_to_classif(classif_vars)
    }
    nms_classif_vars <- names(classif_vars)
    nm_measure_var <- setdiff(names(df), nms_classif_vars)
    merged <- merge(classif_vars,
                    df,
                    by = nms_classif_vars,
                    all.x = TRUE,
                    sort = FALSE)
    merged <- sort_classif_cols(merged, ignore = nm_measure_var)
    merged <- sort_df(merged, ignore = nm_measure_var)
    nm_f <- intersect(c("cohort", "sex", "gender"), names(merged))
    ans_df <- to_list_col(df = merged,
                          nm_f = nm_f,
                          nm_x = nm_measure_var)
    has_region <- "region" %in% names(merged)
    if (has_region) {
        n_region <- nlevels(classif_vars$region)
        ans_df[[nm_measure_var]] <- lapply(ans_df[[nm_measure_var]], matrix, nrow = n_region)
    }
    ans_df[[nm_measure_var]]
}


## HAS_TESTS
sort_classif_cols <- function(x, ignore = NULL) {
    cols_order <- c("cohort", "sex", "gender", "time", "age", "region")
    has_ignore <- !is.null(ignore)
    if (has_ignore) {
        i_ignore <- match(ignore, names(x), nomatch = 0L)
        i_invalid <- match(0L, i_ignore, nomatch = 0L)
        if (i_invalid > 0L)
            stop("\"", ignore[[i_invalid]], "\" is not a valid column name")
        cols_ignore <- x[i_ignore]
        x <- x[-i_ignore]
    }
    cols_classif <- names(x)
    is_valid <- cols_classif %in% cols_order
    i_invalid <- match(FALSE, is_valid, nomatch = 0L)
    if (i_invalid > 0L)
        stop("invalid name for classification variable : \"",
             cols_classif[[i_invalid]], "\"")
    cols <- intersect(cols_order, cols_classif)
    ans <- x[cols]
    if (has_ignore)
        ans <- data.frame(ans, cols_ignore)
    ans
}

    
## HAS_TESTS
## Sort the rows of a data froame, with the left-most columns
## varying slowest and right-most fastest.
sort_df <- function(x, ignore = NULL) {
    if (is.null(ignore))
        args <- x
    else {
        i_ignore <- match(ignore, names(x), nomatch = 0L)
        i_invalid <- match(0L, i_ignore, nomatch = 0L)
        if (i_invalid > 0L)
            stop("\"", ignore[[i_invalid]], "\" is not a valid column name")
        args <- x[-i_ignore]
    }
    ord <- do.call(order, args = args)
    ans <- x[ord, ]
    rownames(ans) <- NULL
    ans
}

## NO_TESTS
## 'df' is a data frame
split_into_rows <- function(df) {
    f <- seq_len(nrow(df))
    split(x = df,
          f = f)
}


## HAS_TESTS
## Create data frame from 'df' with classifying variables
## specified by 'nm_f' and measure variable(s) 'nm_x',
## where the measure variables are put into list columns,
## each entry of which can have any length (not necessarily
## the same). This is an internal function,
## and we assume that all arguments are valid.
## df - a data frame
## nm_f - the names of 1 or more columns of 'df'
## nm_x - the names of 1 or more columns of 'df'
to_list_col <- function(df, nm_f, nm_x) {
    f <- df[nm_f]
    f_unique <- unique(f)
    vals_f <- do.call(paste, f)
    vals_f_unique <- do.call(paste, f_unique)
    indices <- lapply(vals_f_unique, grep, x = vals_f)
    ans <- f_unique
    rownames(ans) <- NULL
    for (nm in nm_x) {
        vals <- df[[nm]]
        get_indexed_vals <- function(index) vals[index]
        ans[[nm]] <- lapply(indices, get_indexed_vals)
    }
    ans
}


## HAS_TESTS
valid_age_time_cohort <- function(age, time, cohort) {
    diff <- (time - cohort) - age
    is_valid <- diff %in% c(0L, 1L)
    i_invalid <- match(FALSE, is_valid, nomatch = 0L)
    if (i_invalid > 0L)
        stop("invalid combination of age [", age[[i_invalid]],
             "], time [", time[[i_invalid]], "], and cohort [",
             cohort[[i_invalid]], "]")
    TRUE
}
ONSdigital/Bayesian-demographic-accounts documentation built on Jan. 10, 2022, 12:34 a.m.