#### raking functions ----
#### read.inputs ----
#' Read xlsx or csv file
#'
#' reads in xlsx or csv input files, detecting those file types from file extension, from inputs
#' folder. This is a helper function used in \code{\link{dbRake}}.
#'
#' @param inputFile is a string of the name of an xlsx or csv file to be read in
#' @family raking helpers
#' @seealso The overall raking function: \code{\link{dbRake}}()
read.inputs <- function(inputFile) {
if(stringr::str_detect(inputFile, ".xlsx")) {
myFile <- openxlsx::read.xlsx(here::here("inputs", inputFile))
}
if(stringr::str_detect(inputFile, ".csv")) {
myFile <- readr::read_csv(here::here("inputs", inputFile))
}
if(stringr::str_detect(inputFile, ".xlsx", negate = TRUE) & stringr::str_detect(inputFile, ".csv", negate = TRUE)) {
message("Please only read in an xlsx or csv file.")
} else {
myFile
}
}
#### rounded ----
#' Round a number
#'
#' For some reason, R rounds to even (i.e., 12.5 rounds to 12), but we want to round to 0
#' (i.e., 12.5 rounds to 13). This is a helper function used in \code{\link{dbRake}}.
#'
#' @param x Number to be rounded
#' @return Rounded number
#' @examples
#' rounded(1.2345) ## 1
#' rounded(12.5) ## 13
#' @family raking helpers
#' @seealso The overall raking function: \code{\link{dbRake}}()
#' @author Stephanie Yurchak, BC Stats
#' @export
rounded <- function(x) { trunc(x + 0.5) }
#### rename.age.grps ----
#' Rename negative columns as 5 year age groups
#'
#' Some databases may have 5 year age groups as -4, -9, -14, ... meaning 0-4, 5-9, 10-14, ....
#' However, the \code{\link{dbRake}} function requires depends on age group column names as 'X-Y'.
#' This is a helper function used in \code{\link{dbRake}}.
#'
#' @param data A dataset with -X1, -X2, -X3, ... column names
#' @param VarRegion Name of Region variable in all files (e.g., "LHA")
#' @param VarSex Name of Sex variable in all files (e.g., "Sex")
#' @return data with X-Y col names
#' @family raking helpers
#' @seealso The overall raking function: \code{\link{dbRake}}()
rename.age.grps <- function(data, VarRegion, VarSex) {
## prep
notAges <- c("Year", "Type", "TypeID", {{VarRegion}}, {{VarSex}}, "TOTAL", "Total", "999", "-999")
ageLast <- max(as.numeric(names(data)[!names(data) %in% notAges]))
# ageLast <- names(data)[ncol(data)-1] ## assumes age are ordered and Total is last column
othCols <- c(notAges, {{ageLast}})
# othCols <- c({{VarRegion}}, {{VarSex}}, {{ageLast}}, "TOTAL")
negs <- data %>% dplyr::select(tidyselect::starts_with(match = "-")) %>% names()
## need to create 5yr age group names; have -4, -9, -14, etc, need 0-4, 5-9, etc
## https://stackoverflow.com/questions/36143119/how-to-get-last-subelement-of-every-element-of-a-list-in-r
negCols <- names(data)[!(names(data) %in% othCols) & stringr::str_detect(names(data), "-")]
ageEnds <- sapply(stringr::str_split(negCols, pattern = "-"), utils::tail, n = 1) %>% as.numeric()
ageOth <- negCols[which(as.numeric(negCols) %% 5 == 0)] ## numbers evenly divisible by 5 (i.e., end in 0 or 5)
othCols <- c(othCols, ageOth) %>% unique()
ageOth <- stringr::str_replace(ageOth, pattern = "-", replacement = "") ## drop neg sign to find in ageEnds
ageEnds <- ageEnds[!(ageEnds %in% ageOth)]
AgeGrps5Yr <- rep(NA, length(ageEnds))
for (i in 1:length(AgeGrps5Yr)) {
AgeGrps5Yr[i] <- paste0((ageEnds[i]-4), "-", ageEnds[i])
} ## AgeGrps5Yr: "0-4" "5-9" "10-14" ... "85-89" "90-94" "95-99"
AgeGrps5Yr <- AgeGrps5Yr[AgeGrps5Yr != "NA-NA"] ## created when there are no 5 year age groups
## rename {{data}} columns to match AgeGrps5Yr
colnames(data)[!colnames(data) %in% othCols & colnames(data) %in% negs] <- AgeGrps5Yr
## drop any remaining negative column names from {{data}}
if(any(names(data) %in% negs)) { data <- data %>% dplyr::select(-tidyselect::any_of(negs)) }
return(data)
}
#### real.to.int ----
#' Converts real numbers to integers
#'
#' Converts real numbers (fractions) to integer numbers while preserving rounded sum.
#' This is a helper function used in \code{\link{dbRake}}.
#'
#' @param realNums a vector of real (fraction) number
#' @return a vector of integer numbers that sums to the rounded sum of realNums
#' @family raking helpers
#' @seealso The overall raking function: \code{\link{dbRake}}()
#' @export
real.to.int <- function(realNums) {
## difference between rounded sum of realNums and rounded sum of just integer portion of realNums
myDiff <- rounded(sum(realNums)) - rounded(sum(floor(realNums)))
## create vector of decimals (e.g., 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.10)
x <- (1:length(realNums))/(100*length(realNums))
## randomize decimals, no duplicates (replace = FALSE)
randNums <- sample(x = x, size = length(realNums), replace = FALSE)
## add randomized decimals to fractions
if(any(class(realNums) == "data.frame")) {
fracRand <- realNums %>% dplyr::bind_rows(- floor(realNums) + randNums) %>% colSums(.)
} else {
## when realNums is a vector
fracRand <- realNums - floor(realNums) + randNums
}
## get DECREASING rank order
sortOrder <- abs(rank(fracRand) - (1+length(realNums)))
## get floor of realNums vector
intNums <- floor(realNums)
## add 1 to largest fracRand for number needed to increase to rounded sum of realNums
for(i in which(sortOrder <= myDiff)) {
intNums[i] <- floor(realNums[i]) + 1
}
## return integer numbers vector
intNums
}
#### calc.cols ----
#' Calculate necessary columns
#'
#' calculates necessary columns (Sum, Ctrl_TOTAL, Diff, adj_value) in preparation for prorating rows
#' or raking. First, it calculates the actual sum (of columns), then adds in VarRow control totals,
#' then calculates the difference, and finally calculates the adjustment value (difference divided by
#' number of groups). This is a helper function used in \code{\link{dbRake}}, and before calling \code{\link{prorate.row}}.
#'
#' This function is first called in dbRake Part 1 to update initial initial estimates of male/female
#' regional total values (before prorating rows, and again before raking), then again in Part 2 to
#' update initial 5 year age group and maximum age group estimates by Sex (before prorating rows,
#' and again before raking), and finally in Part 3 to update initial single year of age estimates
#' by Sex (before prorating rows, and again before raking).
#'
#' @param data a dataframe of initial population counts that need to be adjusted to control totals
#' (e.g., columns: a region variable ("VarRow"), 1 (for Males), 2 (for Females))
#' @param temp a dataframe of control totals to adjust data to (e.g., region control totals)
#' @param VarRow the name of the variable to join temp to data (e.g., VarRow for "LHA")
#' @param n_colGrps the number of groups to adjust over (e.g., number of Sexes, 2); this is
#' essentially the number of columns in data minus 1 (for the VarRow column)
#' @return the original dataframe with four new columns (Sum, Ctrl_TOTAL, Diff, adj_value)
#' @family raking helpers
#' @seealso The overall raking function: \code{\link{dbRake}}()
calc.cols <- function(data, temp, VarRow, n_colGrps) {
data %>%
## calc Sum of VarRow per VarCol (-1 to exclude VarRow column)
dplyr::mutate(Sum = rowSums(data[ , -1])) %>%
## add in VarRow Control Totals, rename as Ctrl_TOTAL
dplyr::left_join(temp, by = "VarRow") %>%
dplyr::rename(Ctrl_TOTAL = TOTAL) %>%
## calc difference between Sum and Ctrl_TOTAL, and adjusted value
dplyr::mutate(Diff = Ctrl_TOTAL - Sum,
adj_value = Diff / n_colGrps)
}
#### prorate.row ----
#' Adjust data row by row to sum to control totals
#'
#' Reconcile row by row, when using region control totals, while updating initial estimates of
#' male/female regional total values if difference is NOT zero, adjust actual data. This function
#' will be run iteratively through each row of a dataframe already prepared by running through
#' \code{\link{calc.cols}} while the row's difference is not zero. For example, for 1:n_Sex, prorate
#' so that region totals sum to region control totals. This is a helper function used in \code{\link{dbRake}}.
#'
#' @param CurrRow a dataframe of a row to prorate
#' @param n_colGrps the number of groups to adjust over (e.g., number of groups in region)
#' @param allowNegatives a logical of whether negative population values are allowed (usually FALSE)
#' @return original dataframe of row, that is now prorated by the adjustment values
#' @family raking helpers
#' @seealso The overall raking function: \code{\link{dbRake}}()
#' @export
prorate.row <- function(CurrRow, n_colGrps, allowNegatives) {
myVars <- names(CurrRow)[2:(n_colGrps+1)]
CurrRow <- CurrRow %>% dplyr::mutate(dplyr::across(tidyselect::all_of(myVars), `+`, CurrRow$adj_value))
## if negatives are NOT allowed, replace any negative adjusted data with zero
if(allowNegatives == FALSE) {
## loop through data columns 2:(n_colGrps+1)
for(j in 2:(n_colGrps+1)) {
if (CurrRow[ ,j] < 0) { CurrRow[ ,j] <- 0 }
}
}
## calc new (adjusted) sum, new difference, and new adjustment value
CurrRow$Sum <- sum(CurrRow[ ,2:(n_colGrps+1)])
CurrRow$Diff <- CurrRow$Ctrl_TOTAL - CurrRow$Sum
# if(abs(CurrRow$Diff) < 0.001) { CurrRow$Diff <- 0 }
if(abs(CurrRow$Diff) < 0.000001) { CurrRow$Diff <- 0 }
CurrRow$adj_value <- CurrRow$Diff/n_colGrps
CurrRow
}
#### prorate ----
#' Adjust data to sum to control totals via prorate.row
#'
#' Prorating is for adjusting data to control totals in only 1 dimension. Use dbRake to adjust data in 2 dimensions.
#'
#' This function iteratively adjusts each row of data by running \code{\link{prorate.row}} which
#' first prepares rows by running through \code{\link{calc.cols}} while the row's difference is not zero.
#' For example, for 1:n_Sex, prorate
#' so that region totals sum to region control totals. This is a helper function used in \code{\link{dbRake}}.
#'
#' @param data a dataframe to prorate (one row per VarRow)
#' @param control a dataframe of control totals (one row per VarRow), column names Year, TOTAL
#' @param VarRow a character, the name of the variable to prorate on (e.g., when prorating Sex for LHA to region control totals, VarRow = "Region")
#' @param allowNegatives a logical of whether negative population values are allowed (usually FALSE)
#' @return original dataframe, that is now prorated by the adjustment values
#' @family raking helpers
#' @seealso The overall raking function: \code{\link{dbRake}}()
#' @note This function not currently integrated into dbRake
#' @export
prorate <- function(data, control, VarRow, allowNegatives = FALSE) {
## make sure both dataframes have the joining variable renames VarRow for calc.cols
names(data)[names(data) == VarRow] <- "VarRow"
names(control)[names(control) == VarRow] <- "VarRow"
## Step 1: calc actual sum, add in VarRow (i.e., Region) control totals, calc difference
## Step 2: calc adjustment value (difference divided by number of groups)
n_colGrps <- ncol(data) - 1
n_rows <- nrow(data)
data <- calc.cols(data = data, control, VarRow, n_colGrps)
## this has columns: VarRow, all non-Total Sexes, Sum, Ctrl_TOTAL, Diff, adj_value for all regions
## 1E. reconcile row by row (i.e., for 1:n_Sex, prorate so region totals sum to region control totals)
for (i in 1:n_rows) {
## Step 3: add/subtract adjustment value to/from actual data to get first interim value
## Step 4: repeat Steps 1 through 3 while difference is not zero
CurrRow <- data[i, ]
## WHILE difference is NOT zero, adjust actual data
while(abs(CurrRow$Diff) > 0.0000000001) {
CurrRow <- prorate.row(CurrRow, n_colGrps, allowNegatives)
}
## ensure all numbers are integers (i.e., no fractional people allowed)
CurrRow[, 2:(n_colGrps + 1)] <- real.to.int(realNums = CurrRow[, 2:(n_colGrps + 1)])
## replace actual data with adjusted data in CurrRow
data[i, ] <- CurrRow
}
names(data)[names(data) == "VarRow"] <- VarRow
data
}
#### prep.prorate.col ----
#' Calculate necessary rows
#'
#' calculates necessary rows (Sum, Ctrl_TOTAL, Diff, adj_value) in preparation for prorating rows
#' or raking. First, it calculates the actual sum (of rows), then adds in VarRow control totals,
#' then calculates the difference, and finally calculates the adjustment value (difference divided by
#' number of groups). This is a helper function used in \code{\link{dbRake}}, and before calling \code{\link{prorate.col}}.
#'
#' This function is first called in dbRake Part 1 to update initial initial estimates of male/female
#' regional total values (after prorating rows before prorating columns), then again in Part 2 to
#' update initial 5 year age group and maximum age group estimates by Sex (after prorating rows
#' before prorating columns) for the older population (vector called AgeGrpsOldest (e.g., "75-59"
#' "80-84" "85-89" "90-94" "95-99" "100") determined by AgeGrpMax) and then again for (remaining)
#' younger populations, and finally in Part 3 to update initial single year of age estimates
#' by Sex (after prorating rows before prorating columns).
#'
#' @param data a dataframe of inital population counts that need to be adjusted to control totals
#' (e.g., columns: a region variable ("VarRow"), 1 (for Males), 2 (for Females))
#' @param n_rowGrps the number of groups to adjust over (e.g., number of groups in region)
#' @param colGrps the number of groups to adjust over (e.g., number of Sexes, 2; number of Age
#' Groups); this is essentially the number of columns in data minus 1 (for the VarRow column) or
#' minus 2 (for the VarRow & Sex columns)
#' @param ctrl_total_row a row of control totals to adjust data to (e.g., region control totals)
#' @param AgeGrpMax age of the older population that will be prorated and raked separately from
#' other 5 year age groups. AgeGrpMax will include all ages, including itself, through the remainder
#' of the population. Default = NULL. If AgeGrpMax is not set, the function will use 75 and up
#' (not necessarily the oldest age; that is, the oldest age is usually 100, meaning 100 and up).
#' The BC Stats Demographics team determined that 75 was the best age for AgeGrpMax to ensure that
#' distortion in older populations is minimized. When prep.prorate.col is called in Part 1 & 3,
#' AgeGrpMax is NULL. AgeGrpMax is set to an age when prep.prorate.col is called in Part 2 when
#' prorating older population, but is NULL when prorating remaining younger population.
#' @param ageLast oldest age value. Default = NULL. When prep.prorate.col is called in Part 1 & 3,
#' ageLast is NULL. ageLast is set when prep.prorate.col is called in Part 2 when prorating older
#' population, but is NULL when prorating remaining younger population.
#' @return original dataframe, that is now prorated by the adjustment values
#' @family raking helpers
#' @seealso The overall raking function: \code{\link{dbRake}}()
prep.prorate.col <- function(data, n_rowGrps, colGrps, ctrl_total_row, AgeGrpMax = NULL, ageLast = NULL){
## Step 0: get appropriate rows and columns
out <- data[1:n_rowGrps, colGrps]
## Step 1: calc actual sum, add in VarRow control totals, calc difference
out[nrow(out)+1, 1] <- "Sum"
## if running on age columns
if(!is.null(AgeGrpMax) & !is.null(ageLast)) {
## ALSO if AgeGrpsMax is a single (non-grouped) age, use 'sum()'
if(AgeGrpMax == ageLast) {
out[nrow(out), -1] <- sum(out[, -1], na.rm = TRUE)
} else {
## use colSums() to sum across multiple age columns
out[nrow(out), -1] <- as.list(colSums(out[, -1], na.rm = TRUE))
}
} else {
## if running on columns other than age, use colSums() to sum across multiple columns (e.g., sex)
out[nrow(out), -1] <- as.list(colSums(out[, -1], na.rm = TRUE))
}
out[nrow(out)+1, ] <- ctrl_total_row
out[nrow(out)+1, 1] <- "Diff"
out[nrow(out), -1] <- (out[which(out[, 1] == "Ctrl_TOTAL"), -1]
- out[which(out[, 1] == "Sum"), -1])
## Step 2: calc adjustment value (difference divided by number of groups)
out[nrow(out)+1, 1] <- "adj_value"
out[nrow(out), -1] <- (out[which(out[, 1] == "Diff"), -1] / n_rowGrps)
out
}
#### prorate.col ----
#' Adjust data column by column to sum to row totals
#'
#' Reconcile column by column, when using region control totals, while updating initial estimates of
#' male/female regional total values if difference is NOT zero, adjust actual data. This function
#' will be run iteratively through each column of a dataframe already prepared by running through
#' \code{\link{prep.prorate.col}} while the column's difference is not zero. For example, for 1:n_Sex,
#' prorate so that region totals sum to region control totals. This is a helper function used in \code{\link{dbRake}}.
#'
#' @param CurrCol a dataframe of a column to prorate
#' @param n_rowGrps the number of groups to adjust over (e.g., number of groups in region)
#' @param allowNegatives a logical of whether negative population values are allowed (usually FALSE)
#' @return original dataframe of column, that is now prorated by the adjustment values
#' @family raking helpers
#' @seealso The overall raking function: \code{\link{dbRake}}()
prorate.col <- function(CurrCol, n_rowGrps, allowNegatives) {
myCol <- colnames(CurrCol)[2]
adj_value <- CurrCol %>% dplyr::filter(VarRow == "adj_value") %>%
dplyr::select(tidyselect::all_of(myCol)) %>% dplyr::pull()
CurrCol[1:n_rowGrps, -1] <- CurrCol[1:n_rowGrps, -1] + adj_value
## if negatives are NOT allowed, replace any negative adjusted data with zero
if(allowNegatives == FALSE) {
## loop through data rows 1:n_rowGrps
for(j in 1:n_rowGrps) {
if (CurrCol[j, -1] < 0) { CurrCol[j, -1] <- 0 }
}
}
## calc new (adjusted) sum, new difference, and new adjustment value
CurrCol[which(CurrCol[, 1] == "Sum"), -1] <- sum(CurrCol[1:n_rowGrps, -1], na.rm = TRUE)
CurrCol[which(CurrCol[, 1] == "Diff"), -1] <- (CurrCol[which(CurrCol[, 1] == "Ctrl_TOTAL"), -1]
- CurrCol[which(CurrCol[, 1] == "Sum"), -1])
if(abs(CurrCol[which(CurrCol[, 1] == "Diff"), -1]) < 0.001) { CurrCol[which(CurrCol[, 1] == "Diff"), -1] <- 0 }
CurrCol[which(CurrCol[, 1] == "adj_value"), -1] <- (CurrCol[which(CurrCol[, 1] == "Diff"), -1] / n_rowGrps)
CurrCol
}
#### add.random.fraction.to.cols ----
#' Add random fraction for sorting
#'
#' Add a random number to a specified column, "my_col", then sort rows based on my_col, with the
#' random fraction used to break any ties. This is a helper function used in \code{\link{dbRake}}
#' within the raking algorithm functions (\code{\link{allowNegsnoMargin}}, \code{\link{noNegsnoMargin}},
#' and \code{\link{noNegsneedMargin}}), when there are more than two row groups (e.g., Regions, 5-year
#' Age Groups, Ages, more than two sexes (when Stats Can adds more than Male and Female)).
#'
#' @param df a dataframe (e.g., VarRow (e.g., for "LHA"), whichever columns being raked over, "Min")
#' @param my_col a column in df (i.e., "Min" which is the minimum value for each row). This is the
#' column that the rows need to be sorted by. That is, the df needs to be sorted with the VarRow with
#' the smallest minimum value to the one with the largest minimum value, in preparation for raking.
#' @return original dataframe without original my_col, but with a new sort_rows column with the
#' order needed to sort rows
#' @family raking helpers
#' @seealso The overall raking function: \code{\link{dbRake}}()
add.random.fraction.to.cols <- function(df, my_col) {
## assumes df has VarRow as first col, multiple columns, then my_col which is col needing to sort rows by
df <- dplyr::rename(df, this_col = {{my_col}})
## 1. create random column, based on number of rows in df
df$random <- sample(x = (1:dim(df)[1]), size = dim(df)[1], replace = FALSE)
## 2. add random numbers as fractions to col needing to break tie (so my_col will be considered first then random to break ties)
upPower10 <- 10*(10^ceiling(log10(dim(df)[1]))) ## round up to the nearest power of 10
df$col_rand <- (df$this_col + df$random/upPower10)
## 3. sort rows in descending order col_rand, so adjustments always made to largest row
## subtract rank (where 1 = smallest) from (1 + dim(df)[1]) so 1 = largest
df$sort_rows <- 1 + dim(df)[1] - rank(df$col_rand)
## 4. delete unnecessary columns
df$random <- NULL
df$col_rand <- NULL
df$this_col <- NULL
## return df with new sort_rows column replacing original my_col
df
}
#### adjustSex ----
#' Adjust population counts by Sex, when additional prorating is needed before raking.
#'
#' @description
#' The APL Estimates Breakdown System (EBS) was unable to rake population estimates of CHSAs
#' by Age and Sex, unless Sex was adjusted (aka, prorated) before it was raked (even more so than
#' the prorating included within the raking function). (EBS first used CHSAs in 2019.)
#'
#' @details
#' This function takes InputData and adjusts Sex by TypeID counts, compared to CtrlPopTotals (i.e.,
#' BC age sex population counts). InputData and CtrlPopTotals are assumed to already be in the
#' environment.
#' Historically, EBS had been run on the 89 LHAs (local health areas). EBS was first run on CHSAs
#' in 2019 to break down the 218 CHSA estimate totals to age sex estimates. The results were erratic
#' in that many young age children (0-5) were removed during the raking process rendering some CHSAs
#' with very low or no children aged 0-5. This problem was fixed by first prorating the EBS output
#' by sex (i.e., this adjustSex() function) using the BC level sex totals, then raking as usual.
#' @param InputData Data variable containing the database to be adjusted. Expects data to be in
#' data.frame with columns: Year, Type, TypeID, Sex, ages 0:100, TOTAL. If InputData's Age is wide,
#' it will pivot the age columns long to column "Age". If any BC/overall counts are included (i.e.,
#' TypeID == 0), they are dropped.
#' @param CtrlPopTotals Data variable containing the overall BC database to be adjusted against.
#' Expects data to be in data.frame with columns: Year, Sex, ages 0:100, TOTAL. The Sex variable is
#' assumed to be numeric, with the largest Sex value representing Total sex.
#' @param years Vector of Years to adjust. Default = NULL. If NULL, the function will iterate over
#' all years found in InputData (assuming such years are also in CtrlPopTotals).
#' @param VarSex Name of Sex variable in both files (e.g., "Sex"). \strong{Note: Sex must be a
#' numeric variable (e.g., 1,2,3) where the Total is the maximum number (e.g., 3.}
#' @return Database with Sex adjusted to CtrlPopTotals, by TypeID.
#' \code{\link{dbConvert}}
#' @examples
#' \dontrun{ adjustSex(InputData = pop, CtrlPopTotals = BC_AGE_SEX, years = NULL, VarSex = "Sex") }
#' \dontrun{ adjustSex(InputData = pop, CtrlPopTotals = BC_AGE_SEX) }
#' @family raking helpers
#' @seealso Overall package documentation: \code{\link{dbutils}}()
#' @export
adjustSex <- function(InputData, CtrlPopTotals, years = NULL, VarSex = "Sex") {
### * PREP
## ensure InputData has "Year", "TypeID" and {{VarSex}} columns
if(!all(c("Year", "Type", "TypeID", VarSex) %in% names(InputData))) {
stop(paste0("InputData is missing (or has misnamed) variables: Year, Type, TypeID and/or ", VarSex, "."))
}
## ensure CtrlPopTotals has "Year" and {{VarSex}} columns
if(!all(c("Year", VarSex) %in% names(CtrlPopTotals))) {
stop(paste0("CtrlPopTotals is missing (or has misnamed) variables: Year and/or ", VarSex, "."))
}
## ensure InputData's Age is long; if wide, pivot it; if no Age data, throw error
if(!("age" %in% tolower(names(InputData)))) {
## no "Age" column in InputData
namesTotal <- c("total", "Total", "TOTAL")
if(any(namesTotal %in% names(InputData))) {
InputData <- InputData %>% dplyr::select(-tidyselect::any_of(namesTotal))
}; rm(namesTotal)
## get namesAges
namesAges <- names(InputData)[(names(InputData) %in% -999:999)]
if(length(namesAges) == 0) {
stop("InputData has no Age data (i.e., no 'Age' nor individual age (0, 1, 2, ...) columns).")
}
## lengthen InputData with new "Age" column from original 0, 1, 2, ... columns
InputData <- InputData %>%
tidyr::pivot_longer(tidyselect::all_of(namesAges), names_to = "Age", values_to = "pop") %>%
dplyr::mutate(Age = as.numeric(Age))
}
## ensure InputData and CtrlPopTotals both have {{VarSex}}, it is numeric, and values are same in files
InputData <- InputData %>% dplyr::rename(VarSex = {{VarSex}})
if(class(InputData$VarSex) != "numeric") {
InputData <- InputData %>% dplyr::mutate(VarSex = as.numeric(VarSex))
}
sexes <- unique(InputData$VarSex)
VarSexTotal <- max(sexes)
CtrlPopTotals <- CtrlPopTotals %>% dplyr::rename(VarSex = {{VarSex}})
if(class(CtrlPopTotals$VarSex) != "numeric") {
CtrlPopTotals <- CtrlPopTotals %>% dplyr::mutate(VarSex = as.numeric(VarSex))
}
if(any(sort(unique(CtrlPopTotals$VarSex)) != sort(sexes))) {
stop(paste0(VarSex, " values do not match between InputData and CtrlPopTotals."))
}
## drop any BC/overall totals from InputData
InputData <- InputData %>% dplyr::filter(TypeID != 0)
## if 'years' not set (i.e., NULL), pull from InputData; check that all years are in InputData & CtrlPopTotals
if(is.null(years)) {
years <- unique(InputData$Year)
} else if(any(!(years %in% unique(InputData$Year)))) {
stop("One or more years are not in InputData.")
}
if(any(!(years %in% unique(CtrlPopTotals$Year)))) {
stop("One or more years are not in CtrlPopTotals.")
}
### * ADJUST SEX
## create placeholder for adjusted data
adjInputData <- InputData[0,] %>% dplyr::rename(adjpop = pop)
## iterate over years
for(yr in seq_along(years)) {
data <- InputData %>% dplyr::filter(Year == years[yr])
dataBC <- CtrlPopTotals %>%
dplyr::filter(Year == years[yr]) %>%
dplyr::select(-Year, -TOTAL) %>%
tidyr::pivot_longer(-VarSex, names_to = "Age", values_to = "BC") %>%
dplyr::mutate(Age = as.numeric(Age)) %>%
dplyr::arrange(Age)
## get all non-total sexes
sexGrps <- sexes[sexes != VarSexTotal]
## get adjusted counts for each (non-total) sex
for(i in seq_along(sexGrps)) {
## A. get tempA (Year, (Type), TypeID, VarSex, Age, pop) where VarSex == sexGrps[i]
tempA <- data %>% dplyr::filter(VarSex == sexGrps[i]) %>%
dplyr::select(TypeID, Age, pop)
## B. get tempB (Age, pop) for VarSex == sexGrps[i]
tempB <- tempA %>%
dplyr::group_by(Age) %>%
dplyr::summarize(pop = sum(pop)) %>%
dplyr::arrange(Age)
## C. get tempC (TypeID, Age, pop.x (tempA pop), pop.y (tempB pop), pop)
tempC <- tempA %>%
dplyr::left_join(tempB, by = "Age") %>%
dplyr::mutate(pop = pop.x / pop.y)
## D. calculate diff of tempC from dataBC
diff <- dataBC %>% dplyr::filter(VarSex == sexGrps[i]) %>% dplyr::select(-VarSex) %>%
dplyr::left_join(tempB, by = "Age") %>%
dplyr::mutate(diff = BC - pop) %>%
dplyr::select(Age, diff)
## E. get tempD (TypeID, Age, pop.x, pop.y, pop, diff, pop_diff, adjpop)
tempD <- tempC %>%
dplyr::left_join(diff, by = "Age") %>%
dplyr::mutate(pop_diff = pop * diff) %>%
dplyr::mutate(adjpop = pop.x + pop_diff)
## F. get whole numbers
tempE <- tempD %>%
dplyr::mutate(adjpop = floor(0.5 + adjpop), VarSex = i) %>%
dplyr::select(TypeID, Age, VarSex, adjpop)
## G. save tempE as adjSex#
assign(x = paste0("adjSex", sexGrps[i]), value = tempE)
rm(tempA, tempB, tempC, tempD, tempE, diff)
}; rm(i)
## combine adjSex# data
adjSexes <- paste0("adjSex", sexGrps)
total <- get(adjSexes[1])
for(s in 2:length(adjSexes)) {
total <- total %>% dplyr::bind_rows(get(adjSexes[s]))
}; rm(s)
## sum adjpop across all sexes, and join into total
temp <- total %>% dplyr::group_by(TypeID, Age) %>%
dplyr::summarize(adjpop = sum(adjpop), .groups = "drop") %>%
dplyr::mutate(VarSex = VarSexTotal) %>%
dplyr::select(TypeID, Age, VarSex, adjpop)
total <- total %>% dplyr::bind_rows(temp)
rm(temp)
## add back Year and Type variables
total <- total %>%
dplyr::mutate(Year = years[yr], Type = unique(data$Type)) %>%
dplyr::select(Year, Type, tidyselect::everything())
## add adjusted data into adjInputData
adjInputData <- adjInputData %>% dplyr::bind_rows(total)
rm(data, dataBC, sexGrps, total, adjSexes)
}
### * OUTPUT
adjInputData <- adjInputData %>%
dplyr::mutate(VarSex = as.numeric(VarSex)) %>%
tidyr::pivot_wider(names_from = "Age", values_from = "adjpop") %>%
dplyr::select(Year, Type, TypeID, VarSex, tidyselect::everything()) %>%
janitor::adorn_totals(where = "col", name = "TOTAL", -c(Year, Type, TypeID, VarSex)) %>%
dplyr::rename({{VarSex}} := VarSex)
rm(namesAges, sexes, yr, VarSexTotal)
return(adjInputData)
}
#### raking algorithm function A: allowNegsnoMargin ----
#' Raking Algorithm Function A: when negative values are allowed
#'
#' When raking, the appropriate algorithm is chosen (negative values allowed or not, margin needed
#' or not), "whole people" adjustments to be made to selected cells in a row are calculated, then
#' "residual people" adjustments to be made to selected cells in a row are calculated, then column
#' control totals are reconciled. Raking is run iteratively row-by-row. Regardless of the algorithm
#' needed, five arguments are needed. A sixth argument, "needMargin" is only needed for algorithm C
#' (\code{\link{noNegsneedMargin}}). This is a helper function used in \code{\link{dbRake}} when
#' negative values ARE allowed.
#'
#' @param CurrRow a subset of data with 3 rows and columns: XXX, n_colGrps, Sum, Ctrl_TOTAL,
#' Diff, adj_value, where XXX is Sex in Part 1, Region in Part 2, and Age in Part 3. The rows
#' are the one XXX row currently being worked on, Adjustments, and AdjCurrRow.
#' @param CurrRow_value a counter to iterate through rows in data
#' @param data a dataframe of rows to be iteratively raked with columns: XXX, n_colGrps, Sum,
#' Ctrl_TOTAL, Diff, adj_value, where XXX is the variable being raked (Sex in Part 1, Region in Part 2,
#' Age in Part 3). Rows are the rows of XXX as well as three summary rows: Sum, Ctrl_TOTAL, Diff.
#' @param n_colGrps the number of column groups to rake over (e.g., 89 LHAs, number of Age Groups, etc.)
#' @param n_rowGrps the number of rows groups to rake over (e.g., 2 Sexes, 89 LHAs, etc.)
#' @param RowAdj vector of zeros, of length n_colGrps (e.g., 89 zeros, etc.)
#' @return original dataframe, but with CurrRow's Diff now 0, and any adjustments made to CurrRow
#' reversed in the row below that has the largest minimum value
#' @family raking helpers
#' @seealso The overall raking function: \code{\link{dbRake}}()
allowNegsnoMargin <- function(CurrRow, CurrRow_value, data, n_colGrps, n_rowGrps, RowAdj) {
### ********************** A. when negatives are allowed ********************** ###
### ** calc "whole people" adjustments to be made to all cells in the row (see p.12)
## Get column number(s) of valid groups (i.e., all groups)
valid_grps <- (1:n_colGrps) + 1
## calc number of "whole people" to adjust by (multiply by sign (neg or pos) b/c take abs of Diff)
adj_whole <- floor(abs(CurrRow$Diff[1]) / length(valid_grps)) * sign(CurrRow$Diff[1])
## no looping needed, as all groups are valid b/c negatives are allowed
CurrRow[2, 2:(n_colGrps+1)] <- adj_whole
## determine # people actually allocated
## (when CurrRow < adj_whole, not all of adj_whole will be allocated)
sum_adj <- sum(CurrRow[2, 2:(n_colGrps+1)])
## update Diff
CurrRow[1, "Diff"] <- CurrRow[1, "Diff"] - sum_adj
## update AdjCurrRow row (i.e., AdjCurrRow = AdjCurrRow + Adjustments), which is subtracting, as adj_whole is neg
CurrRow[3, 2:(n_colGrps+1)] <- CurrRow[3, 2:(n_colGrps+1)] + CurrRow[2, 2:(n_colGrps+1)]
### ** calc "residual people" adjustments to be made to selected cells in row
### (valid cells to adjust if negatives values are allowed) (see p.16)
## (FYI: below process is the same as when negs are not allowed and values need to be taken away)
## add adjustments to date to original cell values (i.e., replace orig row with AdjCurrRow)
CurrRow[1, 2:(n_colGrps + 1)] <- CurrRow[3, 2:(n_colGrps + 1)]
## update valid groups (i.e., updated CurrRow[1,] > 0)
## p.16: "Note that no adjusments are made to cells that have a population of 0." Negs allowed, so why not?
valid_grps <- which(CurrRow[1, 2:(n_colGrps+1)] > 0) + 1
## add in blank NA Margin row
CurrRow[4, 1] <- "Margin"
CurrRow[4, 2:(n_colGrps+1)] <- NA
## create random numbers to sort descending (updated) AdjCurrRow row in case 2+ are the same
CurrRow[5, 1] <- "Random"
CurrRow[5, 2:(n_colGrps+1)] <- as.list(sample(x = (1:n_colGrps), size = n_colGrps, replace = FALSE))
## add random numbers as fractions to AdjCurrRow values (so AdjCurrRow will be considered first and random to break ties)
upPower10 <- 10*(10^ceiling(log10(n_colGrps))) ## round up to the nearest power of 10
CurrRow[6, 1] <- "AdjCurrRow + Random"
CurrRow[6, 2:(n_colGrps+1)] <- CurrRow[3, 2:(n_colGrps+1)] + (CurrRow[5, 2:(n_colGrps+1)])/upPower10
## sort cells in descending order AdjCurrRow size, so adjustments always made to largest margins
## (use unlist() to get CurrRow row as a non-dataframe vector)
## subtract rank (where 1 = smallest) from (1+n_colGrps) so 1 = largest
CurrRow[7, 1] <- "Sort"
CurrRow[7, 2:(n_colGrps+1)] <- as.list(1 + n_colGrps - rank(unlist(CurrRow[6, 2:(n_colGrps+1)])))
## only adjust cells for which we have residual people (i.e., adjust where AdjCurrRow largest)
for(i in which(CurrRow[7, ] <= abs(CurrRow$Diff[1]))) {
CurrRow[1, i] <- CurrRow[1, i] + (1*sign(CurrRow$Diff[1]))
CurrRow[2, i] <- CurrRow[2, i] + (1*sign(CurrRow$Diff[1])) ## also update Adjustments row to adjust RowAdj for end
}
## check: sum(CurrRow[1, 2:(n_colGrps+1)]) == CurrRow$Ctrl_TOTAL[1]
## update row 1's columns: Sum & Diff, and RowAdj
CurrRow[1, "Sum"] <- sum(CurrRow[1, 2:(n_colGrps+1)])
CurrRow[1, "Diff"] <- CurrRow[1, "Sum"] - CurrRow[1, "Ctrl_TOTAL"]
RowAdj <- RowAdj + unlist(CurrRow[2, 2:(n_colGrps+1)])
### ** reconcile column control totals (p.18-23)
## p.18&19: Negatives allowed, values added to current row, subtract values from lower rows
## p.20: Negatives allowed, values subtracted from current row, add values to lower rows
## replace original data row with updated CurrRow[1,]
data[CurrRow_value, ] <- CurrRow[1, ]
## clean up
rm(upPower10, i, CurrRow)
## update CurrRow_value
CurrRow_value <- CurrRow_value + 1
if(n_rowGrps == 2) {
## if only two Sexes, reconciling is literally subtracting RowAdj (which should = updated Diff) from other Sex
## drop unneeded columns at end: Ctrl_TOTAL, Diff
data$Ctrl_TOTAL <- NULL
data$Diff <- NULL
## update Sum and Diff rows
data[data$VarRow == "Sum", 2:(n_colGrps+1)] <- as.list(colSums(data[1:n_rowGrps, 2:(n_colGrps+1)]))
data[data$VarRow == "Diff", 2:(n_colGrps+1)] <- (data[data$VarRow == "Sum", 2:(n_colGrps+1)]
- data[data$VarRow == "Ctrl_TOTAL", 2:(n_colGrps+1)]) ## RowAdj
## subtract Diff from other Sex (i.e., row above Sum row)
data[which(data$VarRow == "Sum")-1, 2:(n_colGrps+1)] <- (data[which(data$VarRow == "Sum")-1, 2:(n_colGrps+1)]
- data[data$VarRow == "Diff", 2:(n_colGrps+1)])
## check that this adjustment makes Diff now all equal 0
data[data$VarRow == "Sum", 2:(n_colGrps+1)] <- as.list(colSums(data[1:n_rowGrps, 2:(n_colGrps+1)]))
data[data$VarRow == "Diff", 2:(n_colGrps+1)] <- (data[data$VarRow == "Sum", 2:(n_colGrps+1)]
- data[data$VarRow == "Ctrl_TOTAL", 2:(n_colGrps+1)])
}
if(n_rowGrps > 2) {
## TO-DO: add this in
stop("The 'dbRake' function does not have functionality yet to handle more than 2 groups (e.g., Sexes) when negatives are allowed.")
}
#### ************************************ Check Point ************************************** ####
## At this point, columns (Regions) should sum to their control totals, AND
## rows (Sexes) should sum to their control totals
# sum(data[data$Sex == "Sum", 2:(n_Regions+1)]) == CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 3]
# sum(data[1, 2:(n_Regions+1)]) == CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 1]
# sum(data[2, 2:(n_Regions+1)]) == CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 2]
# sum(data[data$Sex == "Diff", -1]) ## should be zero
#### ************************************************************************************** ####
data
}
#### raking algorithm function B: noNegsnoMargin ----
#' Raking Algorithm Function B: when negative values are NOT allowed and NO margin is needed
#'
#' When raking, the appropriate algorithm is chosen (negative values allowed or not, margin needed
#' or not), "whole people" adjustments to be made to selected cells in a row are calculated, then
#' "residual people" adjustments to be made to selected cells in a row are calculated, then column
#' control totals are reconciled. Raking is run iteratively row-by-row. Regardless of the algorithm
#' needed, five arguments are needed. A sixth argument, "needMargin" is only needed for algorithm C
#' (\code{\link{noNegsneedMargin}}). This is a helper function used in \code{\link{dbRake}} when
#' negative values are NOT allowed AND no margin is needed. Specifically, values need to be
#' subtracted from cells in current row, and added to lower rows. Because adjustments are added to
#' lower rows, no margin is needed.
#'
#' @param CurrRow a subset of data with 3 rows and columns: XXX, n_colGrps, Sum, Ctrl_TOTAL,
#' Diff, adj_value, where XXX is Sex in Part 1, Region in Part 2, and Age in Part 3. The rows
#' are the one XXX row currently being worked on, Adjustments, and AdjCurrRow.
#' @param CurrRow_value a counter to iterate through rows in data
#' @param data a dataframe of rows to be iteratively raked with columns: XXX, n_colGrps, Sum,
#' Ctrl_TOTAL, Diff, adj_value, where XXX is the variable being raked (Sex in Part 1, Region in Part 2,
#' Age in Part 3). Rows are the rows of XXX as well as three summary rows: Sum, Ctrl_TOTAL, Diff.
#' @param n_colGrps the number of column groups to rake over (e.g., 89 LHAs, number of Age Groups, etc.)
#' @param n_rowGrps the number of rows groups to rake over (e.g., 2 Sexes, 89 LHAs, etc.)
#' @param RowAdj vector of zeros, of length n_colGrps (e.g., 89 zeros, etc.)
#' @return original dataframe, but with CurrRow's Diff now 0, and any adjustments made to CurrRow
#' reversed in the row below that has the largest minimum value
#' @family raking helpers
#' @seealso The overall raking function: \code{\link{dbRake}}()
noNegsnoMargin <- function(CurrRow, CurrRow_value, data, n_colGrps, n_rowGrps, RowAdj) {
### ************************ B. when needMargin is FALSE *********************** ###
### ** Subtract values from cells in current row, add to lower rows, negs not allowed ** ###
### ** calc "whole people" adjustments to be made to all cells in the row (see p.13 & 14)
## Get column number(s) of valid groups (i.e., Adjusted Region > 0)
valid_grps <- which(CurrRow[3, 2:(n_colGrps+1)] > 0) + 1
## For adjustments, repeat looping until Diff < number of valid_grps (i.e., until no "whole" people left)
while(abs(CurrRow[1, "Diff"]) >= length(valid_grps) & length(valid_grps != 0) ) {
## (re)set Adjustments row to zero
CurrRow[2, 2:(n_colGrps+1)] <- 0
## calc number of "whole people" who need to be subtracted from each valid group
adj_whole <- (floor(abs(CurrRow$Diff[1]) / length(valid_grps))) * sign(CurrRow$Diff[1])
## fill adjustment row CurrRow[2, ], with minimum of AdjCurrRow or adj_whole, IFF a valid_grp
for(a in valid_grps) {
## if AdjCurrRow (i.e., CurrRow[3, ]) < abs(adj_whole), only take AdjCurrRow amount, else take adj_whole
if (CurrRow[3, a] < abs(adj_whole)) {
CurrRow[2, a] <- CurrRow[3, a] * sign(CurrRow$Diff[1])
} else {
## set adjustment
CurrRow[2, a] <- adj_whole
}
}
## determine # people actually allocated
## (when CurrRow < abs(adj_whole), not all of adj_whole will be allocated)
sum_adj <- sum(CurrRow[2, 2:(n_colGrps+1)])
## update Diff
CurrRow[1, "Diff"] <- CurrRow[1, "Diff"] - sum_adj
## update AdjCurrRow row (i.e., AdjCurrRow = AdjCurrRow + Adjustments), which is subtracting, as adj_whole is neg
CurrRow[3, 2:(n_colGrps+1)] <- CurrRow[3, 2:(n_colGrps+1)] + CurrRow[2, 2:(n_colGrps+1)]
## update RowAdj
RowAdj <- RowAdj + unlist(CurrRow[2, 2:(n_colGrps+1)])
## update valid groups (i.e., updated Margin > 0)
valid_grps <- which(CurrRow[3, 2:(n_colGrps+1)] > 0) + 1
}
### ** calc "residual people" adjustments to be made to selected cells in row
### (valid cells to adjust if values need to be taken away, negatives not allowed) (see p.16)
## add adjustments to date to original cell values (i.e., replace orig row with AdjCurrRow)
CurrRow[1, 2:(n_colGrps + 1)] <- CurrRow[3, 2:(n_colGrps + 1)]
## add in blank NA Margin row
CurrRow[4, 1] <- "Margin"
CurrRow[4, 2:(n_colGrps+1)] <- NA
## create random numbers to sort descending (updated) AdjCurrRow row in case 2+ are the same
CurrRow[5, 1] <- "Random"
CurrRow[5, 2:(n_colGrps+1)] <- as.list(sample(x = (1:n_colGrps), size = n_colGrps, replace = FALSE))
## add random numbers as fractions to AdjCurrRow values (so AdjCurrRow will be considered first and random to break ties)
upPower10 <- 10*(10^ceiling(log10(n_colGrps))) ## round up to the nearest power of 10
CurrRow[6, 1] <- "AdjCurrRow + Random"
CurrRow <- CurrRow %>% dplyr::mutate(dplyr::across(-VarRow, ~ as.double(.x)))
CurrRow[6, 2:(n_colGrps+1)] <- CurrRow[3, 2:(n_colGrps+1)] + (CurrRow[5, 2:(n_colGrps+1)])/upPower10
## sort cells in descending order AdjCurrRow size, so adjustments always made to largest margins
## (use unlist() to get CurrRow row as a non-dataframe vector)
## subtract rank (where 1 = smallest) from (1+n_colGrps) so 1 = largest
CurrRow[7, 1] <- "Sort"
CurrRow[7, 2:(n_colGrps+1)] <- as.list(1 + n_colGrps - rank(unlist(CurrRow[6, 2:(n_colGrps+1)])))
## only adjust cells for which we have residual people (i.e., subtract where AdjCurrRow largest)
for(i in which(CurrRow[7, ] <= abs(CurrRow$Diff[1]))) {
CurrRow[1, i] <- CurrRow[1, i] - 1
CurrRow[2, i] <- CurrRow[2, i] - 1 ## also update Adjustments row to add to RowAdj for end
}
## check: sum(CurrRow[1, 2:(n_colGrps+1)]) == CurrRow$Ctrl_TOTAL[1]
## update row 1's columns: Sum & Diff, and RowAdj
CurrRow[1, "Sum"] <- sum(CurrRow[1, 2:(n_colGrps+1)])
CurrRow[1, "Diff"] <- CurrRow[1, "Sum"] - CurrRow[1, "Ctrl_TOTAL"]
## do NOT Add RowAdj back to self as this has been accounted for by adding residual to whole adjustments in for loop above
RowAdj <- unlist(CurrRow[2, 2:(n_colGrps+1)])
### ** reconcile column control totals (p.18-23)
## p.22&23: Negatives not allowed, values subtracted from current row, add values to lower rows
## replace original data row with updated CurrRow[1,]
data[CurrRow_value, ] <- CurrRow[1, ]
## clean up
rm(upPower10, i, CurrRow)
## update original data Sum & Diff rows
data[which(data[ ,1] == "Sum"), 2:(n_colGrps+1)] <- as.list(colSums(data[1:n_rowGrps, 2:(n_colGrps+1)]))
data[which(data[ ,1] == "Diff"), 2:(n_colGrps+1)] <- (data[which(data[ ,1] == "Sum"), 2:(n_colGrps+1)]
- data[which(data[ ,1] == "Ctrl_TOTAL"), 2:(n_colGrps+1)])
## update CurrRow_value
CurrRow_value <- CurrRow_value + 1
if(n_rowGrps == 2) {
### p.18-23: Reconcile column control totals (when only 2 rows)
## if only two Sexes, reconciling is literally subtracting RowAdj (which should = updated Diff) from other Sex
## update Sum and Diff rows
data[data$VarRow == "Sum", 2:(n_colGrps+1)] <- as.list(colSums(data[1:n_rowGrps, 2:(n_colGrps+1)]))
data[data$VarRow == "Diff", 2:(n_colGrps+1)] <- (data[data$VarRow == "Sum", 2:(n_colGrps+1)] -
data[data$VarRow == "Ctrl_TOTAL", 2:(n_colGrps+1)]) ## i.e., RowAdj
## subtract Diff from other Sex (i.e., row above Sum row)
data[which(data$VarRow == "Sum")-1, 2:(n_colGrps+1)] <- (data[which(data$VarRow == "Sum")-1, 2:(n_colGrps+1)] -
data[data$VarRow == "Diff", 2:(n_colGrps+1)])
## check that this adjustment makes Diff now all equal 0
data[data$VarRow == "Sum", 2:(n_colGrps+1)] <- as.list(colSums(data[1:n_rowGrps, 2:(n_colGrps+1)]))
data[data$VarRow == "Diff", 2:(n_colGrps+1)] <- (data[data$VarRow == "Sum", 2:(n_colGrps+1)]
- data[data$VarRow == "Ctrl_TOTAL", 2:(n_colGrps+1)])
}
if(n_rowGrps > 2) {
### p.18-23: Reconcile column control totals (when more than 2 rows)
## p.22&23: Negatives not allowed, values subtracted from current row, add values to lower rows
## i.e., sum(RowAdj) is negative
## p.18 "In our example, adjustments were made to all columns, so all columns will have to have
## values adjusted in lower rows to ensure columns once again sum to column control totals."
## 1. make temp with all non-adjusted data rows and subtract RowAdj (which is neg) from each row (p.22)
temp <- data[CurrRow_value:n_rowGrps, 1:(n_colGrps+1)]
adj_matrix <- rbind(RowAdj, RowAdj, deparse.level = 0)
while (dim(adj_matrix)[1] < dim(temp)[1]) {
adj_matrix <- rbind(adj_matrix, RowAdj, deparse.level = 0)
}
temp[, -1] <- temp[, -1] - adj_matrix
rm(adj_matrix)
## 2. subtract temp from Ctrl_TOTAL, for each row (p.23, bullet 1)
## create adjustment matrix to subtract from temp data
Ctrl_TOTAL <- data[which(data[, 1] == "Ctrl_TOTAL"), 2:(n_colGrps+1)]
if (dim(temp)[1] > 1) {
adj_matrix <- rbind(Ctrl_TOTAL, Ctrl_TOTAL, deparse.level = 0)
} else {
adj_matrix <- rbind(Ctrl_TOTAL, deparse.level = 0)
}
while (dim(adj_matrix)[1] < dim(temp)[1]) {
adj_matrix <- rbind(adj_matrix, Ctrl_TOTAL, deparse.level = 0)
}
temp[, -1] <- adj_matrix - temp[, -1]
rm(adj_matrix, Ctrl_TOTAL)
## 3. determine min value in each row (p.23, bullet 2)
temp$Min <- apply(temp[, -1], 1, min, na.rm = TRUE)
## 4. get row sort order descending by Min column (using random fractional part to break ties)
temp <- add.random.fraction.to.cols(df = temp, my_col = "Min")
maxVarRow <- paste0(temp[(which(temp$sort_rows == min(temp$sort_rows))), 1]) ## i.e., VarRow (col 1) with largest Min value
rm(temp)
## 5. change only the maxVarRow (i.e., subtract RowAdj from only maxVarRow); data[ ,1] is data$VarRow
data[which(data[ ,1] == maxVarRow), 2:(n_colGrps + 1)] <- (data[which(data[ ,1] == maxVarRow), 2:(n_colGrps + 1)]
- RowAdj)
## 6. adjust that one row's Sum and Diff
data$Sum[which(data[ ,1] == maxVarRow)] <- sum(data[which(data[ ,1] == maxVarRow), 2:(n_colGrps + 1)])
data$Diff[which(data[ ,1] == maxVarRow)] <- (data$Ctrl_TOTAL[which(data[ ,1] == maxVarRow)]
- data$Sum[which(data[ ,1] == maxVarRow)])
## 7. update (again) original data Sum & Diff rows
data[which(data[ ,1] == "Sum"), 2:(n_colGrps+1)] <- as.list(colSums(data[1:n_rowGrps, 2:(n_colGrps+1)]))
data[which(data[ ,1] == "Diff"), 2:(n_colGrps+1)] <- (data[which(data[ ,1] == "Sum"), 2:(n_colGrps+1)]
- data[which(data[ ,1] == "Ctrl_TOTAL"), 2:(n_colGrps+1)])
}
#### ************************************ Check Point ************************************** ####
## At this point, columns (Regions) should sum to their control totals, AND
## rows (Sexes) should sum to their control totals
# sum(data[data$VarRow == "Sum", 2:(n_Regions+1)]) == CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 3]
# sum(data[1, 2:(n_Regions+1)]) == CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 1]
# sum(data[2, 2:(n_Regions+1)]) == CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 2]
# sum(data[data$VarRow == "Diff", -1]) ## should be zero
#### ************************************************************************************** ####
data
}
#### raking algorithm function C: noNegsneedMargin ----
#' Raking Algorithm Function C: when negative values are NOT allowed and margin IS needed
#'
#' When raking, the appropriate algorithm is chosen (negative values allowed or not, margin needed
#' or not), "whole people" adjustments to be made to selected cells in a row are calculated, then
#' "residual people" adjustments to be made to selected cells in a row are calculated, then column
#' control totals are reconciled. Raking is run iteratively row-by-row. Regardless of the algorithm
#' needed, five arguments are needed. A sixth argument, "needMargin" is only needed for algorithm C
#' (noNegsneedMargin). This is a helper function used in \code{\link{dbRake}} when negative values
#' are NOT allowed AND margin IS needed. Specifically, values need to be added to cells in current
#' row, and subtracted from lower rows. Because adjustments are subtracted from lower rows but
#' negative values are not allowed, a margin is needed.
#'
#' @param CurrRow a subset of data with 3 rows and columns: XXX, n_colGrps, Sum, Ctrl_TOTAL,
#' Diff, adj_value, where XXX is Sex in Part 1, Region in Part 2, and Age in Part 3. The rows
#' are the one XXX row currently being worked on, Adjustments, and AdjCurrRow.
#' @param CurrRow_value a counter to iterate through rows in data
#' @param data a dataframe of rows to be iteratively raked with columns: XXX, n_colGrps, Sum,
#' Ctrl_TOTAL, Diff, adj_value, where XXX is the variable being raked (Sex in Part 1, Region in Part 2,
#' Age in Part 3). Rows are the rows of XXX as well as three summary rows: Sum, Ctrl_TOTAL, Diff.
#' @param n_colGrps the number of column groups to rake over (e.g., 89 LHAs, number of Age Groups, etc.)
#' @param n_rowGrps the number of rows groups to rake over (e.g., 2 Sexes, 89 LHAs, etc.)
#' @param RowAdj vector of zeros, of length n_colGrps (e.g., 89 zeros, etc.)
#' @param needMargin a logical (TRUE or FALSE) of whether a margin is needed. The margin is the
#' current row's Sum - CurrRow values - Adjustments - all rows above.
#' @return original dataframe, but with CurrRow's Diff now 0, and any adjustments made to CurrRow
#' reversed in the row below that has the largest minimum value
#' @family raking helpers
#' @seealso The overall raking function: \code{\link{dbRake}}()
noNegsneedMargin <- function(CurrRow, CurrRow_value, data, n_colGrps, n_rowGrps, RowAdj, needMargin) {
### ************************* C. when needMargin is TRUE *********************** ###
### ** Add values to cells in current row, subtract from lower rows, negs not allowed ** ###
## additional set-up for when needMargin is TRUE
if(needMargin == TRUE) {
## if needed, add in Margin (Margin = Sum - CurrRow values - Adjustments - all rows above)
CurrRow[4, 1] <- "Margin"
CurrRow[4, 2:(n_colGrps+1)] <- (data[data$VarRow == "Sum", 2:(n_colGrps+1)] -
data[CurrRow_value, 2:(n_colGrps+1)] -
CurrRow[2, 2:(n_colGrps+1)])
## when CurrRow_value > 1, also subtract all rows above in data
if(CurrRow_value > 1) {
for(aa in 1:(CurrRow_value-1)) {
CurrRow[4, 2:(n_colGrps+1)] <- (CurrRow[4, 2:(n_colGrps+1)] - data[aa, 2:(n_colGrps+1)])
}
}
}
### ** calc "whole people" adjustments to be made to all cells in the row (see p.14 & 15)
## Get column number(s) of valid groups (i.e., Margin > 0)
valid_grps <- which(CurrRow[4, 2:(n_colGrps+1)] > 0) + 1
## For adjustments, repeat looping until Diff < number of valid_grps (i.e., until no "whole" people left)
while(CurrRow[1, "Diff"] >= length(valid_grps) & length(valid_grps != 0) ) {
## (re)set Adjustments row to zero
CurrRow[2, 2:(n_colGrps+1)] <- 0
## calc number of "whole people" who need to be added to each valid group and removed from other row(s)
adj_whole <- floor(CurrRow$Diff[1] / length(valid_grps))
## fill adjustment row CurrRow[2, ], with minimum of Margin or adj_whole, IFF a valid_grp
for(a in valid_grps) {
## if Margin (i.e., CurrRow[4, ]) < adj_whole, only take Margin amount, else take adj_whole
if (CurrRow[4, a] < adj_whole) {
CurrRow[2, a] <- CurrRow[4, a] * sign(CurrRow$Diff[1])
} else {
## set adjustment
CurrRow[2, a] <- adj_whole
}
}
## determine # people actually allocated
## (when CurrRow < adj_whole, not all of adj_whole will be allocated)
sum_adj <- sum(CurrRow[2, 2:(n_colGrps+1)])
## update Diff
CurrRow[1, "Diff"] <- CurrRow[1, "Diff"] - sum_adj
## update Margin row (i.e., Margin = Margin - Adjustments)
CurrRow[4, 2:(n_colGrps+1)] <- CurrRow[4, 2:(n_colGrps+1)] - CurrRow[2, 2:(n_colGrps+1)]
## update AdjCurrRow row (i.e., AdjCurrRow = AdjCurrRow + Adjustments)
CurrRow[3, 2:(n_colGrps+1)] <- CurrRow[3, 2:(n_colGrps+1)] + CurrRow[2, 2:(n_colGrps+1)]
## update RowAdj
RowAdj <- RowAdj + unlist(CurrRow[2, 2:(n_colGrps+1)])
## update valid groups (i.e., updated Margin > 0)
valid_grps <- which(CurrRow[4, 2:(n_colGrps+1)] > 0) + 1
}
### ** calc "residual people" adjustments to be made to selected cells in row
### (valid cells to adjust if values need to be added, negatives not allowed) (see p.17)
## add adjustments to date to original cell values (i.e., replace orig row with AdjCurrRow)
CurrRow[1, 2:(n_colGrps + 1)] <- CurrRow[3, 2:(n_colGrps + 1)]
## create random numbers to sort descending (updated) margin row in case 2+ are the same
CurrRow[5, 1] <- "Random"
CurrRow[5, 2:(n_colGrps+1)] <- as.list(sample(x = (1:n_colGrps), size = n_colGrps, replace = FALSE))
## add random numbers as fractions to margin (so margin will be considered first and random to break ties)
upPower10 <- 10*(10^ceiling(log10(n_colGrps))) ## round up to the nearest power of 10
CurrRow[6, 1] <- "Margin + Random"
CurrRow[6, 2:(n_colGrps+1)] <- (CurrRow[5, 2:(n_colGrps+1)])/upPower10 + CurrRow[4, 2:(n_colGrps+1)]
## sort cells in descending order Margin size, so adjustments always made to largest margins
## (use unlist() to get CurrRow row as a non-dataframe vector)
## subtract rank (where 1 = smallest) from (1+n_colGrps) so 1 = largest
CurrRow[7, 1] <- "Sort"
CurrRow[7, 2:(n_colGrps+1)] <- as.list(1 + n_colGrps - rank(unlist(CurrRow[6, 2:(n_colGrps+1)])))
## only adjust cells for which we have residual people (i.e., add where Margin largest)
for(i in which(CurrRow[7, ] <= CurrRow$Diff[1])) {
CurrRow[1, i] <- CurrRow[1, i] + 1
CurrRow[2, i] <- CurrRow[2, i] + 1 ## also update Adjustments row to add to RowAdj for end
}
## check: sum(CurrRow[1, 2:(n_colGrps+1)]) == CurrRow$Ctrl_TOTAL[1]
## update row 1's columns: Sum & Diff, and RowAdj
CurrRow[1, "Sum"] <- sum(CurrRow[1, 2:(n_colGrps+1)])
CurrRow[1, "Diff"] <- CurrRow[1, "Sum"] - CurrRow[1, "Ctrl_TOTAL"]
RowAdj <- RowAdj + unlist(CurrRow[2, 2:(n_colGrps+1)])
### ** reconcile column control totals (p.18-23)
## p.20-22: Negatives not allowed, values added to current row, subtract values from lower rows
## replace original data row with adjusted CurrRow[3,]
data[CurrRow_value, ] <- CurrRow[1, ]
## clean up
rm(upPower10, i, CurrRow)
## update CurrRow_value
CurrRow_value <- CurrRow_value + 1
if(n_rowGrps == 2) {
## if only two Sexes, reconciling is literally subtracting RowAdj (which should = updated Diff) from other Sex
## drop unneeded columns at end: Ctrl_TOTAL, Diff
data$Ctrl_TOTAL <- NULL
data$Diff <- NULL
## update Sum and Diff rows
data[data$VarRow == "Sum", 2:(n_colGrps+1)] <- as.list(colSums(data[1:n_rowGrps, 2:(n_colGrps+1)]))
data[data$VarRow == "Diff", 2:(n_colGrps+1)] <- (data[data$VarRow == "Sum", 2:(n_colGrps+1)]
- data[data$VarRow == "Ctrl_TOTAL", 2:(n_colGrps+1)]) ## RowAdj
## subtract Diff from other VarRow (i.e., row above Sum row)
data[which(data$VarRow == "Sum")-1, 2:(n_colGrps+1)] <- (data[which(data$VarRow == "Sum")-1, 2:(n_colGrps+1)]
- data[data$VarRow == "Diff", 2:(n_colGrps+1)])
## update Sum and Diff again (all diffs should be 0 now)
data[data$VarRow == "Sum", 2:(n_colGrps+1)] <- as.list(colSums(data[1:n_rowGrps, 2:(n_colGrps+1)]))
data[data$VarRow == "Diff", 2:(n_colGrps+1)] <- (data[data$VarRow == "Sum", 2:(n_colGrps+1)]
- data[data$VarRow == "Ctrl_TOTAL", 2:(n_colGrps+1)])
}
if(n_rowGrps > 2) {
### p.18-23: Reconcile column control totals (when more than 2 rows)
## p.20-22: Negatives not allowed, values added to current row, subtract values from lower rows
## i.e., sum(RowAdj) is positive
## 1. make temp with all non-adjusted data rows and subtract RowAdj (which is pos) from each row (p.21, bullet 1)
temp <- data[CurrRow_value:n_rowGrps, 1:(n_colGrps+1)]
adj_matrix <- rbind(RowAdj, RowAdj, deparse.level = 0)
while (dim(adj_matrix)[1] < dim(temp)[1]) {
adj_matrix <- rbind(adj_matrix, RowAdj, deparse.level = 0)
}
temp[, -1] <- temp[, -1] - adj_matrix
rm(adj_matrix)
## 2. determine min value in each row (p.21, bullet 2)
temp$Min <- apply(temp[, -1], 1, min, na.rm = TRUE)
## 3. get row sort order descending by Min column (using random fractional part to break ties)
temp <- add.random.fraction.to.cols(df = temp, my_col = "Min")
maxVarRow <- paste0(temp$VarRow[(which(temp$sort_rows == min(temp$sort_rows)))]) ## i.e., VarRow with largest Min value
rm(temp)
## 4. change only the maxVarRow (i.e., subtract RowAdj from only maxVarRow); data[ ,1] is data$VarRow
data[data$VarRow == maxVarRow, 2:(n_colGrps + 1)] <- (data[data$VarRow == maxVarRow, 2:(n_colGrps + 1)]
- RowAdj)
## 5. adjust that one row's Sum & Diff
data$Sum[data$VarRow == maxVarRow] <- sum(data[data$VarRow == maxVarRow, 2:(n_colGrps + 1)])
data$Diff[data$VarRow == maxVarRow] <- (data$Ctrl_TOTAL[data$VarRow == maxVarRow]
- data$Sum[data$VarRow == maxVarRow])
}
#### ************************************ Check Point ************************************** ####
## At this point, columns (Regions) should sum to their control totals, AND
## rows (Sexes) should sum to their control totals
# sum(data[data$Sex == "Sum", 2:(n_Regions+1)]) == CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 3]
# sum(data[1, 2:(n_Regions+1)]) == CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 1]
# sum(data[2, 2:(n_Regions+1)]) == CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 2]
# sum(data[data$Sex == "Diff", -1]) ## should be zero
#### ************************************************************************************** ####
data
}
#### dbRake ----
#' Rake population database
#'
#' @description
#' Reads a population database file (population, migration) and saves a population database file
#' with Region values raked for each of Age and Sex. Raking can be run with user-provided region
#' control totals, or without region control totals. Negative population values may be allowed or
#' not (default).
#'
#' This function assumes input files (e.g., InputData, CtrlPopTotals, etc.) are in an "inputs"
#' folder. The raked output will save to an "outputs" folder (which will be created if one does
#' not exist). If chosen, interim files are also saved to an "interim_files" folder within
#' "outputs" (this will be created if it does not exist and saveInterimFiles is TRUE). dbRake()
#' is a large function that takes a few minutes to run, and depends on multiple smaller functions.
#'
#' @details
#' dbRake is a large function with three main parts. \strong{Part 1} prorates and rakes Sex values
#' by Region. \strong{Part 2} prorates and rakes 5-year Age Group values by Region and Sex.
#' \strong{Part 3} prorates and rakes single-year Age values by Region and Sex. Throughout, checks
#' are performed and, if chosen, results are written to \strong{raking_log.csv} in the "outputs"
#' folder (regardless of whether raking succeeds or fails). As well, interim files may be saved to
#' an "interim_files" folder in "outputs" for future viewing. If raking succeeds, the final raked
#' data file is saved to "outputs".
#'
#' dbRake was originally an APL process (not in R). A PDF documenting that process, which holds
#' true for most of the underlying assumptions and procedures in dbRake, is available on BC Stats'
#' I drive (S152\\S52004) in \strong{Documentation > Raking > Methodology-Raking_Final.pdf}.
#'
#' @param InputData Name of database in environment that contains input data to be raked. If
#' `readFiles` is TRUE, this is the name of the .xlsx or .csv in the "inputs" folder to be read in.
#' This file is assumed to have Region (e.g., LHA) by Sex (e.g., 1, 2, 3) as rows, and
#' Ages (e.g., 0, 1, 2, ..., TOTAL (not '-999')) as columns. Values are population counts.
#' @param CtrlPopTotals Name of database in environment that contains overall control totals. If
#' `readFiles` is TRUE, this is the name of the .xlsx or .csv in the "inputs" folder to be read in
#' (e.g., "BC AS TOTALS.xlsx"). This file is assumed to have Sex (e.g., 1, 2, 3) as rows and
#' Ages (e.g., 0, 1, 2, ..., TOTAL (not '-999')) as columns. Values are population counts.
#' This file typically has dimensions of 3 (obs) by 103 variables.
#' @param CtrlRegionTotals Name of database in environment that contains overall control totals. If
#' `readFiles` is TRUE, this is the name of the .xlsx or .csv in the "inputs" folder to be read in
#' (e.g., "LHA TOTALS.xlsx"). Default = NULL. This file is assumed to have Region (e.g., 89 LHAs)
#' as the first column and TOTAL (population counts) as the second column; this file is not broken out
#' by Sex or Age. This file typically has dimensions of n (obs) by 2 variables, where "n" is the
#' number of individual regions (e.g., 89 for LHA). If no name is provided (i.e., NULL), then region
#' control totals are not used. Instead, the InputData will be used to generate "control" totals.
#' @param CtrlAgeGrpsTotals Name of database in environment that contains initial 5 year age group
#' totals. If `readFiles` is TRUE, this is the name of the .xlsx or .csv in the "inputs" folder to
#' be read in. Default = NULL. In virtually all cases, this variable will be NULL. In these cases,
#' the InputData will be used to generate "control" totals at 5-year groupings (e.g., 0-4, 5-9,
#' 10-14, etc). If age groups are of format -X1, -X2, ..., they will be transformed to "X-Y" format.
#' @param VarRegion Name of Region variable in all files (e.g., "LHA").
#' @param VarSex Name of Sex variable in all files (e.g., "Sex"). \strong{Note: Sex must be a
#' numeric variable (e.g., 1,2,3) where the Total is the maximum number (e.g., 3.}
#' @param VarSexTotal Value that corresponds to Total (e.g., 3, when 1 and 2 are Male and Female).
#' @param AgeGrpMax Age of the older population that will be prorated and raked separately from
#' other 5 year age groups. AgeGrpMax will include all ages, including itself, through the remainder
#' of the population. Default = NULL. If AgeGrpMax is not set, the function will use 75 and up
#' (not necessarily the oldest age; that is, the oldest age is usually 100, meaning 100 and up).
#' The BC Stats Demographics team determined that 75 was the best age for AgeGrpMax to ensure that
#' distortion in older populations is minimized.
#' @param allowNegatives Logical value for whether or not negative population values are allowed.
#' Default = FALSE. Only migration data should be allowed to have negative values.
#' @param saveInterimFiles Logical value for whether or not interim files (.csvs) should be saved
#' throughout the process. Default = FALSE. If saved, they will be saved in "interim_files" within
#' "outputs" folder. This folder will be created if it does not exist and is needed.
#' @param writeRakingLog Logical value for whether or not a log file (raking_log.csv) should be
#' written. Default = FALSE. If written, it will be saved in "outputs" folder.
#' @param writeOutputFile Logical value for whether or not final output file (.csv) should be written.
#' Default = FALSE. If TRUE, the final raked data will be saved as "RakedData.csv" to "outputs"
#' folder. Regardless of whether saved or not, the raked data returns to R's environment. Setting
#' to TRUE reduces a step (\code{\link{dbWrite}}). Setting to TRUE is not useful when raking
#' multiple years of data, as the output file will be overwritten for each successive year. In that
#' case, call the raking function from \code{\link{multiRake}}.
#' @param readFiles Logical value for whether or not input files (InputData, CtrlPopTotals,
#' CtrlRegionTotals, CtrlAgeGrpsTotals) need to be read in. Default = FALSE. If FALSE, files are
#' already in environment, likely by being called or created through another function (e.g.,
#' \code{\link{dbConvert}}, \code{\link{dbRead}}).
#' @return RakedData.csv will be saved to "outputs" folder (which will be created if one does not
#' already exist). If set to TRUE, various interim files will be saved in an "interim_files" folder
#' within "outputs". If set to TRUE, a log file ("raking_log.csv") will also be saved to the
#' "outputs" folder.
#' @examples
#' \dontrun{ dbRake(InputData = "POPHAE19.xlsx", CtrlPopTotals = "BC AS TOTALS.xlsx",
#' CtrlRegionTotals = "LHA TOTALS.xlsx", CtrlAgeGrpsTotals = NULL,
#' VarRegion = "LHA", VarSex = "Sex", VarSexTotal = 3, AgeGrpMax = NULL,
#' allowNegatives = FALSE, saveInterimFiles = FALSE, writeRakingLog = FALSE,
#' writeOutputFile = FALSE, readFiles = TRUE) }
#' \dontrun{ ## if dbRake() is called in \code{\link{dbConvert}}(), which brings in inputs
#' dbRake(InputData = ToDB, CtrlPopTotals = control_totals,
#' CtrlRegionTotals = region_totals, CtrlAgeGrpsTotals = NULL,
#' VarRegion = "LHA", VarSex = "Sex", VarSexTotal = 3, AgeGrpMax = NULL,
#' allowNegatives = FALSE, saveInterimFiles = FALSE, writeRakingLog = TRUE,
#' writeOutputFile = TRUE, readFiles = FALSE) }
#' @seealso Raking helpers include: \code{\link{rounded}}(), \code{\link{read.inputs}}(),
#' \code{\link{real.to.int}}(), \code{\link{calc.cols}}(), \code{\link{prorate.row}}(),
#' \code{\link{prep.prorate.col}}(), \code{\link{prorate.col}}(), and raking algorithm functions A, B, C:
#' \code{\link{allowNegsnoMargin}}(), \code{\link{noNegsnoMargin}}(), \code{\link{noNegsneedMargin}}()
#' @author Julie Hawkins, BC Stats
#' @export
dbRake <- function(InputData, CtrlPopTotals, CtrlRegionTotals = NULL, CtrlAgeGrpsTotals = NULL,
VarRegion, VarSex, VarSexTotal, AgeGrpMax = NULL, allowNegatives = FALSE,
saveInterimFiles = FALSE, writeRakingLog = FALSE, writeOutputFile = FALSE,
readFiles = FALSE) {
#### 0. Prep ----
## A. check for required folder(s); create if needed and doesn't exist
if(any(writeOutputFile == TRUE, saveInterimFiles == TRUE, writeRakingLog == TRUE) &
!file.exists(here::here("outputs"))) {
dir.create(here::here("outputs"))
message(paste0("An 'outputs' folder has been created at '", here::here(), "/'."))
}
if(saveInterimFiles == TRUE) {
if(!file.exists(here::here("outputs", "interim_files"))) {
dir.create(here::here("outputs", "interim_files"))
message(paste0("An 'interim_files' subfolder has been created at '", here::here("outputs"), "/'."))
} else {
message("Interim files were saved to '", here::here("outputs", "interim_files"), "/'.")
}
}
## B. create log file, if required
if(writeRakingLog == TRUE) {
raking_log <- data.frame(message = as.character(), stringsAsFactors = FALSE)
}
## C. read data, if needed
if(readFiles == TRUE) {
if(!file.exists(here::here("inputs"))) {
stop(paste0("The 'inputs' folder cannot be found. It should be at: '", here::here(), "/inputs'."))
}
## C1. read in population control totals
CtrlPopTotals <- read.inputs(inputFile = CtrlPopTotals)
## C2. read in region control totals, if they exist; set UseControlRegionTotals to TRUE or FALSE accordingly
if(!is.null(CtrlRegionTotals)) {
CtrlRegionTotals <- read.inputs(inputFile = CtrlRegionTotals)
UseControlRegionTotals <- TRUE
} else {
UseControlRegionTotals <- FALSE
}
## C3. read in age control totals, if they exist; set have5yrAgeGrps to TRUE or FALSE accordingly
if(!is.null(CtrlAgeGrpsTotals)) {
CtrlAgeGrpsTotals <- read.inputs(inputFile = CtrlAgeGrpsTotals)
have5yrAgeGrps <- TRUE
} else {
have5yrAgeGrps <- FALSE
}
## C4. read in input data; this is what needs to be raked
InputData <- read.inputs(inputFile = InputData)
}
if(readFiles == FALSE) {
## C2. set UseControlRegionTotals to TRUE or FALSE accordingly
if(!is.null(CtrlRegionTotals)) {
UseControlRegionTotals <- TRUE
} else {
UseControlRegionTotals <- FALSE
}
## C3. set have5yrAgeGrps to TRUE or FALSE accordingly
if(!is.null(CtrlAgeGrpsTotals)) {
have5yrAgeGrps <- TRUE
} else {
have5yrAgeGrps <- FALSE
}
}
## C5. update raking_log, if required
if(writeRakingLog == TRUE) {
if(UseControlRegionTotals == TRUE) {
## add message to raking_log
raking_log[nrow(raking_log)+1, 1] <- "Prep: 'UseControlRegionTotals' is set to TRUE because you named a Control Region Totals file."
} else {
raking_log[nrow(raking_log)+1, 1] <- "Prep: 'UseControlRegionTotals' is set to FALSE because you did not name a Control Region Totals file."
}
if(have5yrAgeGrps == TRUE) {
## add message to raking_log
raking_log[nrow(raking_log)+1, 1] <- "Prep: 'have5yrAgeGrps' is set to TRUE because you named a Control Age Totals file."
} else {
raking_log[nrow(raking_log)+1, 1] <- "Prep: 'have5yrAgeGrps' is set to FALSE because you did not name a Control Age Totals file."
}
}
## D. check for negative values if allowNegatives is FALSE
if(allowNegatives == FALSE) {
if(any(InputData < 0)) {
stop("You set 'allowNegatives' as FALSE, but there is at least one negative value in the data. Fix this.")
} else {
if(writeRakingLog == TRUE) {
## add message to raking_log
raking_log[nrow(raking_log)+1, 1] <- "Prep: 'allowNegatives' is set to FALSE."
}
}
}
## E. check for single Year of data
if("Year" %in% names(InputData)) {
if(length(unique(InputData$Year)) != 1) {
stop("InputData should have only one year of data. To rake over multiple years, use `multiRake()`.")
}
## save Year variable and remove temporarily from data
yr_InputData <- unique(InputData$Year)
InputData <- InputData %>% dplyr::select(-Year)
}
if("Year" %in% names(CtrlPopTotals)) {
if(length(unique(CtrlPopTotals$Year)) != 1) {
stop("CtrlPopTotals should have only one year of data. To rake over multiple years, use `multiRake()`.")
}
yr_CtrlPop <- unique(CtrlPopTotals$Year)
CtrlPopTotals <- CtrlPopTotals %>% dplyr::select(-Year)
}
## works even if CtrlRegionTotals is NULL (i.e., names are NULL so no "Year")
if("Year" %in% names(CtrlRegionTotals)) {
if(length(unique(CtrlRegionTotals$Year)) != 1) {
stop("CtrlRegionTotals should have only one year of data. To rake over multiple years, use `multiRake()`.")
}
## save Year variable and remove temporarily from data
yr_CtrlReg <- unique(CtrlRegionTotals$Year)
CtrlRegionTotals <- CtrlRegionTotals %>% dplyr::select(-Year)
}
if("Year" %in% names(CtrlAgeGrpsTotals)) {
if(length(unique(CtrlAgeGrpsTotals$Year)) != 1) {
stop("CtrlAgeGrpsTotals should have only one year of data. To rake over multiple years, use `multiRake()`.")
}
## save Year variable and remove temporarily from data
yr_CtrlAgeGrps <- unique(CtrlAgeGrpsTotals$Year)
CtrlAgeGrpsTotals <- CtrlAgeGrpsTotals %>% dplyr::select(-Year)
}
if(any(exists("yr_InputData"), exists("yr_CtrlPop"), exists("yr_CtrlReg"), exists("yr_CtrlAgeGrps"))) {
yrCheck <- paste(ls(pattern = "yr_"))
if(length(yrCheck) > 1) {
temp <- get(yrCheck[1])
for(i in 2:length(yrCheck)) {
if(temp != get(yrCheck[i])) { stop("The Year does not match in all files.") }
}
yr_OutputData <- temp
rm(i, temp)
} else { yr_OutputData <- yrCheck }
rm(yrCheck)
if(writeRakingLog == TRUE) {
## add message to raking_log
raking_log[nrow(raking_log)+1, 1] <- "Prep: The 'Year' column has been temporarily removed from the data."
}
}
## F. rename negative column names as 5yr age groups where necessary (this is why -999 MUST be called TOTAL)
InputData <- rename.age.grps(data = InputData, VarRegion, VarSex)
CtrlPopTotals <- rename.age.grps(data = CtrlPopTotals, VarRegion, VarSex)
if(!is.null(CtrlAgeGrpsTotals)) {
CtrlAgeGrpsTotals <- rename.age.grps(data = CtrlAgeGrpsTotals, VarRegion, VarSex)
}
## G. ensure that region variable is character
InputData <- InputData %>%
dplyr::rename(temp = {{VarRegion}}) %>%
dplyr::mutate(temp = as.character(temp)) %>%
dplyr::rename({{VarRegion}} := temp)
if(!is.null(CtrlRegionTotals)) {
CtrlRegionTotals <- CtrlRegionTotals %>%
dplyr::rename(temp = {{VarRegion}}) %>%
dplyr::mutate(temp = as.character(temp)) %>%
dplyr::rename({{VarRegion}} := temp)
}
## H. create OutputData from InputData, that will be updated with changes
OutputData <- InputData
if(writeRakingLog == TRUE) {
## F. add message to raking_log re: set values
raking_log[nrow(raking_log)+1, 1] <- paste0("Prep: you set '", VarRegion, "' as the region, and '",
VarSexTotal, "' as the value for Sex Total; also, '",
AgeGrpMax, "' is the maximum age, meaning this age and ",
"older will be prorated, but not raked, to minimize ",
"distortion in older populations.")
}
message("Raking process has begun...")
#### Part 1 ----
## Part 1. Updating initial estimates of male/female regional total values
## Part 1.0: prep ----
## 1A. redistribute input data as wide data, without Sex TOTAL info
## assumes InputData has a Region column, a Sex column, many age columns, and a TOTAL column
## want data with Region column; pivot "long" Sex as "wide" columns with just the TOTAL data, no age data
data <- InputData %>%
dplyr::rename(Sex = {{VarSex}}, Region = {{VarRegion}}) %>%
dplyr::filter(Sex != VarSexTotal) %>%
dplyr::select(VarRow = Region, Sex, TOTAL) %>%
tidyr::pivot_wider(names_from = "Sex", values_from = "TOTAL")
## this is now columns VarRow, 1, 2 with one row for each region, values are total counts (not by age)
## 1B. calc number of Regions and number of (non-Total) Sexes; to determine # of groups to adjust over
n_Regions <- dim(data)[1]
n_Sex <- dim(CtrlPopTotals)[1] - 1
## Part 1.1: prorate rows ----
if(UseControlRegionTotals == TRUE) {
## !!!! If there are CtrlRegionTotals !!!!
## 1C. rename CtrlRegionTotals's VarRegion as "VarRow" so left_join in calc.cols() works
temp <- dplyr::rename(CtrlRegionTotals, VarRow = {{VarRegion}})
## 1D. calc necessary columns: Sum, Ctrl_TOTAL, Diff, adj_value
## Step 1: calc actual sum, add in VarRow (i.e., Region) control totals, calc difference
## Step 2: calc adjustment value (difference divided by number of groups)
n_colGrps <- n_Sex
n_rowGrps <- n_Regions
data <- calc.cols(data = data, temp, VarRow = VarRegion, n_colGrps)
## this has columns: VarRow, all non-Total Sexes, Sum, Ctrl_TOTAL, Diff, adj_value for all regions
## 1E. reconcile row by row (i.e., for 1:n_Sex, prorate so region totals sum to region control totals)
for (i in 1:n_rowGrps) {
## Step 3: add/subtract adjustment value to/from actual data to get first interim value
## Step 4: repeat Steps 1 through 3 while difference is not zero
CurrRow <- data[i, ]
## WHILE difference is NOT zero, adjust actual data
while(abs(CurrRow$Diff) > 0.0000000001) {
CurrRow <- prorate.row(CurrRow, n_colGrps, allowNegatives)
}
## ensure all numbers are integers (i.e., no fractional people allowed)
CurrRow[, 2:(n_colGrps + 1)] <- real.to.int(realNums = CurrRow[, 2:(n_colGrps + 1)])
## replace actual data with adjusted data in CurrRow
data[i, ] <- CurrRow
}
## 1F. remove no-longer-needed objects
rm(i, CurrRow, temp)
### ************************************ Check Point ************************************** ###
## At this point, Region rows sum to their control total, BUT
## Sex columns do NOT (necessarily) sum to their control totals
# sum(data$Sum) == CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 3] ## should be TRUE
# sum(data$`1`) - CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 1] ## likely not zero
# sum(data$`2`) - CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 2] ## likely not zero
### *************************************************************************************** ###
if(trunc(sum(data$Sum)) != CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 3]) {
stop("Error 1.1 (prorate rows): Region rows should sum to their control total, but they do not.
Specifically, sum(data$Sum) differs from the overall CtrlPopTotals' TOTAL.")
} else {
if(writeRakingLog == TRUE) {
## add message to raking_log
if(UseControlRegionTotals == TRUE) {
raking_log[nrow(raking_log)+1, 1] <- "Part 1.1 (prorate rows): Region rows now sum to their control total."
} else {
raking_log[nrow(raking_log)+1, 1] <- "Part 1.1 (prorate rows): Region Control Totals were not provided, so will be created next."
}
}
}
}
## Part 1.2: prorate columns ----
### ** Rake down columns to match column totals, aka, prorate **
## 1G. prep for prorating columns
## get sex Ctrl_TOTALs
temp <- CtrlPopTotals %>%
dplyr::select(Sex, TOTAL) %>%
dplyr::filter(Sex != VarSexTotal) %>%
tidyr::pivot_wider(names_from = "Sex", values_from = "TOTAL") %>%
dplyr::mutate(VarRow = "Ctrl_TOTAL") %>%
dplyr::select(VarRow, tidyselect::everything())
n_colGrps <- n_Sex
n_rowGrps <- n_Regions
data <- data %>% dplyr::mutate(VarRow = as.character(VarRow))
## 1H. calculate necessary rows: Sum (colSums), Ctrl_TOTAL (temp), Diff (subtraction), adj_value (division)
## Step 1: calc actual sum, add in VarRow control totals, calc difference
## Step 2: calc adjustment value (difference divided by number of groups)
dataCols <- prep.prorate.col(data, n_rowGrps, colGrps = 1:(n_colGrps+1),
ctrl_total_row = temp, AgeGrpMax = NULL, ageLast = NULL)
## this has dropped unneeded cols (Sum, Ctrl_TOTAL, Diff, adj_value used in 1.1) and added needed rows
## 1I. prorate column by column (i.e., over n_Regions)
for (i in 2:(n_colGrps+1)) {
## Step 3: add/subtract adjustment value to/from actual data to get first interim value
## Step 4: repeat Steps 1 through 3 while difference is not zero
CurrCol <- as.data.frame(dataCols[ ,c(1, i)])
## WHILE difference is NOT zero, adjust actual data
while(abs(CurrCol[which(CurrCol[, 1] == "Diff"), -1]) > 0.0000000001) {
CurrCol <- prorate.col(CurrCol, n_rowGrps, allowNegatives)
}
## ensure all numbers are integers (i.e., no fractional people allowed)
CurrCol[1:n_rowGrps, -1] <- real.to.int(realNums = CurrCol[1:n_rowGrps, -1]) ## this is a vector
## replace actual data with adjusted data in CurrCol
dataCols[ ,i] <- CurrCol[,-1]
}
### ************************************ Check Point ************************************** ###
## At this point, Region rows sum to their control total, AND
## Sex columns should also sum to their control totals
# sum(dataCols[1:89, -1]) == CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 3] ## should be TRUE
# sum(dataCols$`1`[1:89]) - CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 1] ## should be zero
# sum(dataCols$`2`[1:89]) - CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 2] ## should be zero
## when region is CHSA
# sum(dataCols[1:218, -1]) == CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 3] ## should be TRUE
# sum(dataCols$`1`[1:218]) - CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 1] ## should be zero
# sum(dataCols$`2`[1:218]) - CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 2] ## should be zero
## when fewer ages
# sum(dataCols[1:29, -1]) == CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 3] ## should be TRUE
# sum(dataCols$`1`[1:29]) - CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 1] ## should be zero
# sum(dataCols$`2`[1:29]) - CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 2] ## should be zero
### *************************************************************************************** ###
check1.13 <- sum(dataCols[1:n_Regions, -1]) == CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 3]
check1.11 <- sum(dataCols[1:n_Regions, 2]) - CtrlPopTotals$TOTAL[1]
check1.12 <- sum(dataCols[1:n_Regions, 3]) - CtrlPopTotals$TOTAL[2]
check1.1 <- check1.13 == TRUE & identical(check1.11, check1.12, 0)
if(check1.1 != TRUE) {
stop("Error 1.2 (prorate cols): Sex columns should sum to their control total, but they do not.
Specifically, sum(dataCols[1:n_Regions, -1]) differs from the overall CtrlPopTotals' TOTAL,
OR, one or more of the individual sexes do not sum to their Ctrl_Pop_Total.")
} else {
if(writeRakingLog == TRUE) {
## add message to raking_log
raking_log[nrow(raking_log)+1, 1] <- "Part 1.2 (prorate columns): Sex columns now sum to their control total."
}
}
## remove no-longer-needed objects
rm(i, CurrCol, temp, check1.13, check1.11, check1.12, check1.1)
## Part 1.3: rake prorated data ----
if(UseControlRegionTotals == FALSE) {
## !!!! If there are NO CtrlRegionTotals !!!!
## need to create "Sum" column that would have been made if we ran steps 1J through 1Q
dataCols <- dataCols %>% dplyr::mutate(Sum = rowSums(dataCols[ , -1]))
## need to create "Ctrl_Regions_Total" file (to be used later in checks) now that we have created them
CtrlRegionTotals <- dataCols[1:n_Regions, ] %>% dplyr::select({{VarRegion}} := VarRow, TOTAL = Sum)
}
if(UseControlRegionTotals == TRUE) {
## !!!! If there are CtrlRegionTotals !!!!
## 1J. calc necessary columns: Sum, Ctrl_TOTAL, Diff, adj_value
## Step 1: calc actual sum, add in VarRow control totals, calc difference
## Step 2: calc adjustment value (difference divided by number of groups)
## Region Ctrl_TOTALs
temp <- data %>% dplyr::select(VarRow, TOTAL = Ctrl_TOTAL)
dataCols <- calc.cols(data = dataCols, temp, VarRow, n_colGrps)
dataCols$adj_value <- NULL
rm(temp)
### ** Reorder rows, if necessary **
## 1K. sort (data) rows in ascending order (min -> max) of row totals (i.e., Sum column)
dataCols[1:n_rowGrps, ] <- dplyr::arrange(dataCols[1:n_rowGrps, ], Sum)
### ** Rake the data: **PrivateRakePreprocessedData, aka, rake **
### rake over all rows except for last row (it should be done by default)
## At this point, n_colGrps = n_Regions and n_rowGrps = n_Sex
## 1L. set CurrRow_value, and pull CurrRow from data to work on, iterate over rows
CurrRow_value <- 1 ## first row in data first time through; CurrRow[1,]
while(CurrRow_value < n_rowGrps) {
CurrRow <- dataCols[CurrRow_value, ]
## set up new row for Adjustments; initially, all adjustments = 0
CurrRow[2, 1] <- "Adjustments"
CurrRow[2, 2:(n_colGrps+1)] <- 0
## set up new row for ADJUSTED current row; = CurrRow + Adjustments
CurrRow[3, 1] <- "AdjCurrRow"
CurrRow[3, 2:(n_colGrps+1)] <- CurrRow[1, 2:(n_colGrps+1)] + CurrRow[2, 2:(n_colGrps+1)]
## create RowAdj to save all adjustments made that need to be made at end to the last row/reflected in data
RowAdj <- rep(0, times = n_colGrps)
## determine whether need Margin or not
if(allowNegatives == FALSE) {
## if negatives NOT allowed, determine whether we need Margin or not
if(CurrRow$Sum[1] < CurrRow$Ctrl_TOTAL[1]) {
needMargin <- TRUE ## algorithm C: if values need to be added & negatives not allowed
} else {
needMargin <- FALSE ## algorithm B: if values need to be subtracted & negatives not allowed
}
} else {
## else if negatives ARE allowed, Margin is not needed
needMargin <- FALSE ## algorithm A: if negatives ARE allowed
}
## 1M. choose and run appropriate algorithm:
## 1N. calc "whole people" adjustments to be made to all cells in the row
## 1O. calc "residual people" adjustments to be made to selected cells in row
## 1P. reconcile column control totals
if(allowNegatives == TRUE) {
## p.12: A. Adjust cells in the current row if negative values are allowed
dataCols <- allowNegsnoMargin(CurrRow, CurrRow_value, data = dataCols, n_colGrps, n_rowGrps, RowAdj)
} else {
if(needMargin == FALSE) {
## p.13: B. Adjust cells in the current row if values need to be taken away, negatives not allowed
## "If the sum of ages is greater than the control total, then values need to be taken away from
## age groups for the current row and added to age groups for rows below the current row."
dataCols <- noNegsnoMargin(CurrRow, CurrRow_value, data = dataCols, n_colGrps, n_rowGrps, RowAdj)
} else {
## i.e., needMargin == TRUE, allowNegatives == TRUE
## p.14: C. Adjust cells in the current row if values need to be added, negatives not allowed
## "If the sum of ages is less than the control total, then values needed to be added to age
## groups for the current row, and taken away from age groups for rows below the current row."
dataCols <- noNegsneedMargin(CurrRow, CurrRow_value, data = dataCols, n_colGrps, n_rowGrps, RowAdj, needMargin)
}
}
## update CurrRow_value
CurrRow_value <- CurrRow_value + 1
}
## 1Q. adjust last row and update Sum & Diff rows
dataCols[CurrRow_value, 2:(n_colGrps+1)] <- (dataCols[CurrRow_value, 2:(n_colGrps+1)]
- dataCols[which(dataCols[, 1] == "Diff"), 2:(n_colGrps+1)])
dataCols$Sum[CurrRow_value] <- rowSums(dataCols[CurrRow_value, 2:(n_colGrps+1)])
dataCols$Diff[CurrRow_value] <- dataCols$Ctrl_TOTAL[CurrRow_value] - dataCols$Sum[CurrRow_value]
dataCols[which(dataCols[, 1] == "Sum"), 2:(n_colGrps+1)] <- as.list(colSums(dataCols[1:n_rowGrps, 2:(n_colGrps+1)]))
dataCols[which(dataCols[, 1] == "Diff"), 2:(n_colGrps+1)] <- (dataCols[which(dataCols[, 1] == "Ctrl_TOTAL"), 2:(n_colGrps+1)]
- dataCols[which(dataCols[, 1] == "Sum"), 2:(n_colGrps+1)])
rm(CurrRow, CurrRow_value, needMargin, RowAdj)
}
## 1R. update OutputData with changes and save as interim file
## get just data (and Sum) rows
temp <- dataCols[1:(n_rowGrps), 1:(n_colGrps+2)]
## flip data back to long format
temp <- temp %>%
tidyr::pivot_longer(c(-VarRow), names_to = "VarSex", values_to = "TOTAL") %>%
dplyr::rename(VarRegion = VarRow) %>%
dplyr::mutate(VarSex = dplyr::case_when(VarSex == "Sum" ~ as.character(VarSexTotal), TRUE ~ as.character(VarSex))) %>%
# dplyr::mutate(VarSex = dplyr::case_when(VarSex == "Sum" ~ "3", TRUE ~ as.character(VarSex))) %>%
dplyr::mutate(VarSex = as.numeric(VarSex))
### ** Put the data back in order, if necessary **
## re-sort rows by InputData order
rows_order <- InputData %>%
dplyr::select(VarRegion = (which(names(InputData) == VarRegion)),
VarSex = (which(names(InputData) == VarSex))) %>%
dplyr::mutate(row_order = dplyr::row_number())
temp <- dplyr::left_join(temp, rows_order, by = c("VarRegion", "VarSex"))
temp <- dplyr::arrange(temp, row_order)
temp$row_order <- NULL
temp <- dplyr::rename(temp, {{VarRegion}} := VarRegion, {{VarSex}} := VarSex)
## drop original TOTAL from OutputData (it was a copy from InputData)
OutputData$TOTAL <- NULL
## join to OutputData the now raked estimates of male/female regional total values
OutputData <- OutputData %>%
dplyr::left_join(temp, by = c({{VarRegion}}, {{VarSex}}))
if(saveInterimFiles == TRUE) {
## save as interim file
readr::write_csv(OutputData, here::here("outputs", "interim_files",
"OutputData_1_updated_initial_estimates_Sex_Region_Totals.csv"))
}
## clean up
rm(temp, data, dataCols, n_colGrps, n_rowGrps, rows_order)
if(writeRakingLog == TRUE) {
## add message to raking_log
raking_log[nrow(raking_log)+1, 1] <- "Part 1: Total Sex values have been updated for each Region. See 'outputs\\interim_files' if interested."
}
#### Part 2 ----
## Part 2. Updating initial 5 year age group and maximum age group estimates, by Sex
## Part 2.0: prep ----
## if they don't exist, create 5-year age groups (i.e., "CtrlAgeGrpsTotals" and "OutputData5")
if(have5yrAgeGrps == FALSE) {
## find all ages ending in 0 and 5, merge in single vector, in numeric order; get name of last age
age0s <- names(CtrlPopTotals[tidyselect::ends_with(match = "0", vars = names(CtrlPopTotals))])
age5s <- names(CtrlPopTotals[tidyselect::ends_with(match = "5", vars = names(CtrlPopTotals))])
ageStarts <- sort(as.numeric(c(age0s, age5s)))
ageLast <- as.character(max(as.numeric(age0s)))
rm(age0s, age5s)
## AgeGrps5Yr names (ignore last age0s which should be last age group in data (e.g., 100, 90))
AgeGrps5Yr <- rep(NA, length(ageStarts)-1)
for (i in 1:length(AgeGrps5Yr)) {
AgeGrps5Yr[i] <- paste0(ageStarts[i], "-", (ageStarts[i]+4))
}; rm(i)
# AgeGrps5Yr: "0-4" "5-9" "10-14" ... "85-89" "90-94" "95-99"
## create 5 year & maximum age groups **control totals**
CtrlAgeGrpsTotals <- CtrlPopTotals
for (i in 1:length(AgeGrps5Yr)) {
CtrlAgeGrpsTotals <- CtrlAgeGrpsTotals %>%
dplyr::mutate(temp = rowSums(CtrlAgeGrpsTotals[
which(names(CtrlAgeGrpsTotals) == ageStarts[i]):(which(names(CtrlAgeGrpsTotals) == ageStarts[i])+4)
])) %>%
dplyr::rename(!!AgeGrps5Yr[i] := temp)
}; rm(i)
CtrlAgeGrpsTotals <- CtrlAgeGrpsTotals %>%
dplyr::select(Sex, tidyselect::all_of(AgeGrps5Yr), tidyselect::all_of(ageLast), TOTAL)
## for some reason age group columns might be named (bad), drop these names
for(j in seq_along(names(CtrlAgeGrpsTotals))) {
attributes(CtrlAgeGrpsTotals[,j]) <- NULL
}; rm(j)
## create 5 year & maximum age groups **sample data**
OutputData5 <- OutputData %>% dplyr::rename(Region = {{VarRegion}}, Sex = {{VarSex}})
for (i in 1:length(AgeGrps5Yr)) {
OutputData5 <- OutputData5 %>%
dplyr::mutate(temp = rowSums(OutputData5[
which(names(OutputData5) == ageStarts[i]):(which(names(OutputData5) == ageStarts[i])+4)
])) %>%
dplyr::rename(!!AgeGrps5Yr[i] := temp)
}; rm(i)
OutputData5 <- OutputData5 %>% dplyr::select(Region, Sex, tidyselect::all_of(AgeGrps5Yr), tidyselect::all_of(ageLast), TOTAL)
} else {
ageLast <- names(CtrlAgeGrpsTotals)[ncol(CtrlAgeGrpsTotals)-1]
## AgeGrps5Yr names (ignore last age0s which should be last age group in data (e.g., 100, 90))
AgeGrps5Yr <- names(CtrlAgeGrpsTotals)[stringr::str_detect(names(CtrlAgeGrpsTotals), "-")]
## OutputData5: prep 5 year & maximum age groups data
OutputData5 <- OutputData %>% dplyr::rename(Region = {{VarRegion}}, Sex = {{VarSex}}) %>%
dplyr::select(Region, Sex, tidyselect::all_of(AgeGrps5Yr), tidyselect::all_of(ageLast), TOTAL)
}
## 2A. identify Sexes (e.g., 1, 2, 3), set CurrSex as first element,
## id cols of 5 yr age groups and max age group (ageLast)
Sexes <- InputData %>% dplyr::pull({{VarSex}}) %>% unique()
CurrSex <- Sexes[1]
AgeGrps5Yr <- CtrlAgeGrpsTotals %>%
dplyr::select(which(stringr::str_detect(string = names(CtrlAgeGrpsTotals), pattern = "-"))) %>% names()
## either create AgeGrpMax or check that it is the beginning of a 5YrGrp
if(is.null(AgeGrpMax)) {
## if AgeGrpMax does not yet exist, set as 75+
AgeGrpMax <- 75
if(!(75 %in% names(InputData))) {
## if age 75 does not exist (e.g., birth data), get max age group (second-to-last column name)
AgeGrpMax <- names(InputData)[ncol(InputData)-1]
}
} else {
## otherwise use what AgeGrpMax was set as in raking arguments
## check that AgeGrpMax is beginning of a 5YrGrp; if not, have user re-set the value
AgeGrpsStarts <- unique(stringr::str_sub(AgeGrps5Yr, start = 1, end = (stringr::str_locate(AgeGrps5Yr, pattern = "-")-1)))
if(!(AgeGrpMax %in% AgeGrpsStarts) & AgeGrpMax != ageLast){
stop("Error 2.0 (prep 5yr age groups): The value you set for AgeGrpMax is not the beginning of a 5-year age group.
Please choose another value and restart the code.")
}
rm(AgeGrpsStarts)
}
## set/sum oldest age group(s)
if(AgeGrpMax != ageLast) {
## if AgeGrpMax is NOT the last column, create AgeGrpsOldest as sum from AgeGrpMax thru ageLast
temp <- which(unique(stringr::str_sub(AgeGrps5Yr, start = 1, end = (stringr::str_locate(AgeGrps5Yr, pattern = "-")-1))) == AgeGrpMax)
AgeGrpsOldest <- c(AgeGrps5Yr[temp:length(AgeGrps5Yr)], ageLast)
# AgeGrpsOldest: "75-59" "80-84" "85-89" "90-94" "95-99" "100"
rm(temp)
} else {
## if AgeGrpMax is the last column, set AgeGrpsOldest the same as ageLast
AgeGrpsOldest <- ageLast
}
n_AgeGrps <- dim(CtrlAgeGrpsTotals)[2] - 2 ## -2 to not count Sex and TOTAL columns
## Part 2.1-2.4: raking done in while() loop for non-total sexes ----
## 2B. repeat for all Sexes, except Total
while(CurrSex < VarSexTotal) {
## Part 2.1: prorate AgeGrpsOldest columns ----
if(have5yrAgeGrps == FALSE) {
### when OutputData does not originally have 5 year age groups already (i.e., have5yrAgeGrps == FALSE)
## 2C. filter data by CurrSex
data <- OutputData5 %>%
dplyr::filter(Sex == CurrSex)
## 2D. add total level data as last row
temp <- CtrlAgeGrpsTotals %>%
dplyr::rename(Sex = {{VarSex}}) %>%
dplyr::filter(Sex == CurrSex) %>%
dplyr::mutate(Region = "Ctrl_TOTAL") %>%
dplyr::select(Region, Sex, tidyselect::everything())
data <- rbind(data, temp); rm(temp)
} else {
### when OutputData has 5 year age groups already (i.e., have5yrAgeGrps == TRUE)
## 2C. filter data by CurrSex
data <- OutputData %>%
dplyr::rename(Sex = {{VarSex}}, Region = {{VarRegion}}) %>%
dplyr::filter(Sex == CurrSex)
## 2D. add total level data as last row
temp <- CtrlPopTotals %>%
dplyr::rename(Sex = {{VarSex}}) %>%
dplyr::filter(Sex == CurrSex) %>%
dplyr::mutate(Region = "Ctrl_TOTAL") %>%
dplyr::select(Region, Sex, tidyselect::everything())
data <- rbind(data, temp); rm(temp)
data <- dplyr::select(data, Region, Sex, tidyselect::all_of(AgeGrps5Yr), tidyselect::all_of(ageLast), TOTAL)
}
## rename Region as VarRow
data <- dplyr::rename(data, VarRow = Region)
## set n_rowGrps
n_rowGrps <- n_Regions
## 2E. prorate AgeGrpsOldest (i.e., prorate maximum age group(s))
## prep for prorating AgeGrpsOldest columns
temp <- data[data$VarRow == "Ctrl_TOTAL", c("VarRow", AgeGrpsOldest)]
# temp: 1 obs, ~7 vars, with Ctrl_TOTALs for age groups: "VarRow" "75-79" "80-84" "85-89" "90-94" "95-99" "100"
MaxAge <- prep.prorate.col(data, n_rowGrps, colGrps = c("VarRow", AgeGrpsOldest),
ctrl_total_row = temp, AgeGrpMax, ageLast)
# MaxAge: obs = each region + 4 (Sum row, Ctrl_TOTAL row, Diff row, adj_value row) of ~7 vars
## call prorate function; ensure all numbers are integers (i.e., no fractional people allowed)
for (i in 2:ncol(MaxAge)) {
## add/subtract adjustment value to/from actual data to get first interim value
## repeat while difference is not zero
CurrCol <- as.data.frame(MaxAge[ ,c(1, i)])
## WHILE difference is NOT zero, adjust actual data
while(abs(CurrCol[which(CurrCol[, 1] == "Diff"), -1]) > 0.0000000001) {
CurrCol <- prorate.col(CurrCol, n_rowGrps, allowNegatives)
}
## ensure all numbers are integers (i.e., no fractional people allowed)
CurrCol[1:n_rowGrps, -1] <- real.to.int(realNums = CurrCol[1:n_rowGrps, -1]) ## this is a vector
## replace actual data with adjusted data in CurrCol
MaxAge[ ,i] <- CurrCol[,-1]
}; rm(i, CurrCol, temp)
check2.1 <- sum(MaxAge[1:n_Regions, -1]) == sum(CtrlAgeGrpsTotals[CtrlAgeGrpsTotals$Sex == CurrSex, AgeGrpsOldest])
if(check2.1 != TRUE) {
stop("Error 2.1 (prorate AgeGrpsOldest columns): MaxAge columns for AgeGrpsOldest should sum to the current sex's control total, but they do not.")
} else {
if(writeRakingLog == TRUE) {
## add message to raking_log
raking_log[nrow(raking_log)+1, 1] <- paste0("Part 2.1, Sex ", CurrSex,
" (prorate columns): MaxAge columns (",
paste(AgeGrpsOldest, collapse = ", "),
") now sum to their control total for this Sex.")
}
rm(check2.1)
}
## sum AgeGrpsOldest
if(AgeGrpMax != ageLast) {
## use rowSums() if AgeGrpsMax is more than one group (to get each row's sum)
MaxAge$TotalOldest <- rowSums(MaxAge[, -1])
} else {
MaxAge$TotalOldest <- MaxAge[,which(names(MaxAge) == AgeGrpMax)]
}
## Part 2.2: prorate non-AgeGrpsOldest rows ----
### ** Now, work on non-AgeGrpsOldest **
## 2F. subtract the prorated oldest age group(s) from Region totals to rake majority of 5 yr age groups
## set up: rows = regions w/ last row = Ctrl_TOTAL; cols = VarRow, Sex, 5 yr age groups, TOTAL
## Do NOT include oldest age group(s) in prorating rows of 5 year age groups! i.e., drop AgeGrpsOldest
data <- data %>%
dplyr::select(-tidyselect::all_of(AgeGrpsOldest)) %>%
dplyr::left_join(MaxAge, by = "VarRow") %>%
dplyr::mutate(TOTAL = TOTAL - TotalOldest) %>%
dplyr::select(-tidyselect::all_of(AgeGrpsOldest), -TotalOldest)
### ** Rake across rows to match row totals: PrivateReconcileTotalsByAdding, aka, prorate; on non-AgeGrpsOldest **
## 2G. calc necessary columns (raking by addition prep)
## Step 1: calc actual sum, add in VarRow (i.e., Region) control totals, calc difference
## Step 2: calc adjustment value (difference divided by number of groups)
## set n_colGrps; n_rowGrps is still = n_Regions
n_colGrps <- n_AgeGrps - length(AgeGrpsOldest) ## exclude AgeGrpsOldest
## get Region control totals for CurrSex and number of 5 year age groups
temp <- data %>% dplyr::select(VarRow, TOTAL)
## drop Sex & TOTAL from data
data <- data %>% dplyr::select(-Sex, -TOTAL)
data <- calc.cols(data, temp, VarRow, n_colGrps)
## this has cols: VarRow, non-Oldest 5yr age groups, Sum, Ctrl_TOTAL, Diff, adj_value
## 2H. prorate row by row (i.e., for 1:n_Regions, prorate so region age group totals sum to region control totals)
for (i in 1:n_rowGrps) {
## Step 3: add/subtract adjustment value to/from actual data to get first interim value
## Step 4: repeat Steps 1 through 3 while difference is not zero
CurrRow <- data[i, ]
## WHILE difference is NOT zero, adjust actual data
while(abs(CurrRow$Diff) > 0.0000000001) {
CurrRow <- prorate.row(CurrRow, n_colGrps, allowNegatives)
}
## ensure all numbers are integers (i.e., no fractional people allowed)
CurrRow[, 2:(n_colGrps + 1)] <- real.to.int(realNums = CurrRow[, 2:(n_colGrps + 1)])
## replace actual data with adjusted data in CurrRow
data[i, ] <- CurrRow
}
## remove no-longer-needed objects
rm(i, CurrRow, temp)
### ************************************ Check Point ************************************** ###
## At this point, Region rows (across 5 year age groups) sum to their control total, BUT
## 5 year age group columns do NOT (necessarily) sum to their control totals
## example: sum(data$`0-4`[-nrow(data)]) - CtrlAgeGrpsTotals$`0-4`[CtrlAgeGrpsTotals$Sex == CurrSex] ## likely not zero
### *************************************************************************************** ###
## Part 2.3: prorate non-AgeGrpsOldest columns ----
### ** Rake down columns to match column totals: PrivateReconcileTotalsByAdding, aka, prorate; on non-AgeGrpsOldest **
## get age Ctrl_TOTALs for CurrSex
temp <- data[nrow(data), 1:(n_colGrps+1)] ## Ctrl_TOTAL row
## 2I. calculate necessary rows: Sum (colSums), Ctrl_TOTAL (temp), Diff (subtraction), adj_value (division)
## Step 1: calc actual sum, add in VarRow control totals, calc difference
## Step 2: calc adjustment value (difference divided by number of groups)
dataCols <- prep.prorate.col(data, n_rowGrps, colGrps = 1:(n_colGrps+1),
ctrl_total_row = temp, AgeGrpMax = NULL, ageLast = NULL)
## 2J. prorate column by column
for (i in 2:(n_colGrps+1)) {
## Step 3: add/subtract adjustment value to/from actual data to get first interim value
## Step 4: repeat Steps 1 through 3 while difference is not zero
CurrCol <- dataCols[ ,c(1, i)]
## WHILE difference is NOT zero, adjust actual data
while(abs(CurrCol[which(CurrCol[, 1] == "Diff"), -1]) > 0.0000000001) {
CurrCol <- prorate.col(CurrCol, n_rowGrps, allowNegatives)
}
## ensure all numbers are integers (i.e., no fractional people allowed)
# CurrCol[1:n_rowGrps, -1] <- real.to.int(realNums = dplyr::pull(CurrCol[1:n_rowGrps, -1]))
## if no `pull` then real.to.int() does not work as intended (i.e., realNums is a single col df (bad), not a vector (good))
if("tbl" %in% class(CurrCol)) {
CurrCol[1:n_rowGrps, -1] <- real.to.int(realNums = dplyr::pull(CurrCol[1:n_rowGrps, -1]))
} else {
CurrCol[1:n_rowGrps, -1] <- real.to.int(realNums = CurrCol[1:n_rowGrps, -1])
}
## replace actual data with adjusted data in CurrCol
dataCols[ ,i] <- CurrCol[,-1]
}
### ************************************ Check Point ************************************** ###
## At this point, Region rows (across 5 year age groups) sum to their control total, and
## 5 year age group columns (EXCEPT for AgeGrpsOldest) do sum to their control totals; example:
## sum(dataCols$`0-4`[1:89]) - CtrlAgeGrpsTotals$`0-4`[CtrlAgeGrpsTotals$Sex == CurrSex] ## should be zero
## sum(dataCols$`70-74`[1:89]) - CtrlAgeGrpsTotals$`70-74`[CtrlAgeGrpsTotals$Sex == CurrSex] ## should be zero
### *************************************************************************************** ###
AgeGrpsNotOldest <- setdiff(AgeGrps5Yr, AgeGrpsOldest)
check2.3 <- sum(dataCols[1:n_Regions, -1]) == sum(CtrlAgeGrpsTotals[CtrlAgeGrpsTotals$Sex == CurrSex, AgeGrpsNotOldest])
if(check2.3 != TRUE) {
stop("Error 2.3 (prorate non-AgeGrpsOldest columns): 5 year age group columns (EXCEPT for AgeGrpsOldest) should sum to the current sex's control total, but they do not.")
} else {
if(writeRakingLog == TRUE) {
## add message to raking_log
raking_log[nrow(raking_log)+1, 1] <- paste0("Part 2.3, Sex ", CurrSex,
" (prorate columns): Remaining 5-year age group columns (",
AgeGrpsNotOldest[1], " through ",
AgeGrpsNotOldest[length(AgeGrpsNotOldest)],
") now sum to their control total for this Sex.")
}
}
## remove no-longer-needed objects
rm(i, CurrCol, temp, check2.3, AgeGrpsNotOldest) ## check2.3, AgeGrpsNotOldest
## Part 2.4: rake prorated data ----
## 2K. calc necessary columns: Sum, Ctrl_TOTAL, Diff, adj_value
## Step 1: calc actual sum, add in VarRow control totals, calc difference
## Step 2: calc adjustment value (difference divided by number of groups)
## Region Ctrl_TOTALs for CurrSex (AgeGrpOldest have been subtracted already)
temp <- data %>% dplyr::select(VarRow, TOTAL = Ctrl_TOTAL)
dataCols <- calc.cols(data = dataCols, temp, VarRow, n_colGrps)
dataCols$adj_value <- NULL
rm(temp)
### ** Reorder rows, if necessary **
## sort (data) rows in ascending order (min -> max) of row totals (i.e., Sum column)
dataCols[1:n_rowGrps, ] <- dplyr::arrange(dataCols[1:n_rowGrps, ], Sum)
### ** Rake the data: **PrivateRakePreprocessedData, aka, rake **
### rake over all rows except for last row (it should be done by default)
## At this point, n_colGrps = n_AgeGrps-1 and n_rowGrps = n_Regions
## 2L. set CurrRow_value, and pull CurrRow from data to work on, iterate over rows
CurrRow_value <- 1 ## first row in data first time through; CurrRow[1,]
## note: this does NOT run when there is only ONE n_rowGrps! (e.g., raking VI, VI_NO_CRD)
while(CurrRow_value < n_rowGrps) {
CurrRow <- dataCols[CurrRow_value, ]
## set up new row for Adjustments; initially, all adjustments = 0
CurrRow[2, 1] <- "Adjustments"
CurrRow[2, 2:(n_colGrps+1)] <- 0
## set up new row for ADJUSTED current row; = CurrRow + Adjustments
CurrRow[3, 1] <- "AdjCurrRow"
CurrRow[3, 2:(n_colGrps+1)] <- CurrRow[1, 2:(n_colGrps+1)] + CurrRow[2, 2:(n_colGrps+1)]
## create RowAdj to save all adjustments made that need to be made at end to the last row/reflected in data
RowAdj <- rep(0, times = n_colGrps)
## determine whether need Margin or not
if(allowNegatives == FALSE) {
## if negatives NOT allowed, determine whether we need Margin or not
if(CurrRow$Sum[1] < CurrRow$Ctrl_TOTAL[1]) {
needMargin <- TRUE ## algorithm C: if values need to be added & negatives not allowed
} else {
needMargin <- FALSE ## algorithm B: if values need to be subtracted & negatives not allowed
}
} else {
## else if negatives ARE allowed, Margin is not needed
needMargin <- FALSE ## algorithm A: if negatives ARE allowed
}
## 2M. choose and run appropriate algorithm for Part 2:
## 2N. calc "whole people" adjustments to be made to all cells in the row
## 2O. calc "residual people" adjustments to be made to selected cells in row
## 2P. reconcile column control totals (p.18-23)
if(allowNegatives == TRUE) {
## p.12: A. Adjust cells in the current row if negative values are allowed
dataCols <- allowNegsnoMargin(CurrRow, CurrRow_value, data = dataCols, n_colGrps, n_rowGrps, RowAdj)
} else {
if(needMargin == FALSE) {
## p.13: B. Adjust cells in the current row if values need to be taken away, negatives not allowed
## "If the sum of ages is greater than the control total, then values need to be taken away from
## age groups for the current row and added to age groups for rows below the current row."
dataCols <- noNegsnoMargin(CurrRow, CurrRow_value, data = dataCols, n_colGrps, n_rowGrps, RowAdj)
} else {
## i.e., needMargin == TRUE, allowNegatives == TRUE
## p.14: C. Adjust cells in the current row if values need to be added, negatives not allowed
## "If the sum of ages is less than the control total, then values needed to be added to age
## groups for the current row, and taken away from age groups for rows below the current row."
dataCols <- noNegsneedMargin(CurrRow, CurrRow_value, data = dataCols, n_colGrps, n_rowGrps, RowAdj, needMargin)
}
}
rm(CurrRow, RowAdj, needMargin)
## update CurrRow_value
CurrRow_value <- CurrRow_value + 1
}
## 2Q. adjust last row and update Sum & Diff rows
## updated Sum row
dataCols[which(dataCols[, 1] == "Sum"), 2:(n_colGrps+2)] <- as.list(colSums(dataCols[1:n_rowGrps, 2:(n_colGrps+2)]))
## update Diff row
dataCols[which(dataCols[, 1] == "Diff"), 2:(n_colGrps+2)] <- (dataCols[which(dataCols[, 1] == "Ctrl_TOTAL"), 2:(n_colGrps+2)]
- dataCols[which(dataCols[, 1] == "Sum"), 2:(n_colGrps+2)])
## update last row (add Diff to last row)
dataCols[CurrRow_value, 2:(n_colGrps+2)] <- (dataCols[CurrRow_value, 2:(n_colGrps+2)]
+ dataCols[which(dataCols[, 1] == "Diff"), 2:(n_colGrps+2)])
## update Ctrl_TOTAL's Diff cell
dataCols$Diff[CurrRow_value] <- dataCols$Ctrl_TOTAL[CurrRow_value] - dataCols$Sum[CurrRow_value]
## update Sum row to reflect Diffs added in
dataCols[which(dataCols[, 1] == "Sum"), 2:(n_colGrps+2)] <- as.list(colSums(dataCols[1:n_rowGrps, 2:(n_colGrps+2)]))
## update Diff row to reflect new Diffs
dataCols[which(dataCols[, 1] == "Diff"), 2:(n_colGrps+2)] <- (dataCols[which(dataCols[, 1] == "Ctrl_TOTAL"), 2:(n_colGrps+2)]
- dataCols[which(dataCols[, 1] == "Sum"), 2:(n_colGrps+2)])
### ** Put the data back in order, if necessary **
## 2R. re-sort rows by InputData order
rows_order <- InputData %>%
dplyr::filter(Sex == CurrSex) %>%
dplyr::select(VarRow = which(names(InputData) == VarRegion)) %>%
dplyr::mutate(row_order = dplyr::row_number(),
VarRow = as.character(VarRow))
dataCols <- dplyr::left_join(dataCols, rows_order, by = "VarRow")
dataCols[1:n_rowGrps, ] <- dplyr::arrange(dataCols[1:n_rowGrps, ], row_order)
dataCols$row_order <- NULL
## 2S. Insert the MaxAge values back into the data, and add them back on to the Region totals
## add back MaxAge data
dataCols <- dplyr::left_join(dataCols, MaxAge, by = "VarRow")
## update Sum with AgeGrpsOldest sum (i.e., TotalOldest)
dataCols <- dataCols %>%
dplyr::mutate(Sum = Sum + TotalOldest) %>%
dplyr::select(VarRow, tidyselect::all_of(AgeGrps5Yr), tidyselect::all_of(ageLast), Sum)
## 2T. update OutputData5 with changes for this CurrSex and save as interim file
n_colGrps <- n_AgeGrps
## get just data rows (and data and Sum columns)
temp <- dataCols[1:n_rowGrps,]
## rename VarRow & Sum columns as Region & TOTAL to match with OutputData5, add Sex column back in
temp <- temp %>%
dplyr::mutate(Sex = CurrSex) %>%
dplyr::rename(Region = VarRow, TOTAL = Sum) %>%
dplyr::select(Region, Sex, tidyselect::everything())
## replace in OutputData5 the now raked estimates of 5 year and maximum age group values for CurrSex
OutputData5 <- OutputData5 %>% dplyr::mutate(Region = as.character(Region)) ## in case region is numeric
OutputData5[OutputData5$Sex == CurrSex, ] <- temp
if(saveInterimFiles == TRUE) {
## save as interim file
readr::write_csv(OutputData5, here::here("outputs", "interim_files", paste0("OutputData5_2S_Sex", CurrSex,
"_updated_initial_estimates_Sex_Region_Age_Groups.csv")))
}
### ************************************ Check Point ************************************** ###
## At this point, Region rows sum to their control total, AND
## 5-year age group columns sum to their control totals, for the current sex
# sum(dataCols$Sum[1:n_rowGrps]) == CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == CurrSex] ## should be TRUE
### *************************************************************************************** ###
if(sum(dataCols$Sum[1:n_rowGrps]) != CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == CurrSex]) {
stop("Error 2.4 (raking AgeGrps data): 5-year age group columns rows for the current sex should sum to their control total, but they do not.")
} else {
if(writeRakingLog == TRUE) {
## add message to raking_log
raking_log[nrow(raking_log)+1, 1] <- paste0("Part 2, Sex ", CurrSex, ": 5-year age group values ",
"have been updated for each Region for this Sex.",
" See 'outputs\\interim_files' if interested.")
}
}
### clean up
rm(temp, data, CurrRow_value, dataCols, rows_order, n_colGrps, n_rowGrps, MaxAge)
## 2U. move to next Sex
CurrSex <- CurrSex + 1
}
## Part 2.5: update total Sex values, by 5-year age groups ----
## 2V. sum all Sexes to get updated total Sex values
TotalSex <- OutputData5 %>% dplyr::filter(Sex == 1) ## initiate with first Sex value
counter <- 2 ## set counter to second Sex value
ageGrpCols <- unique(c(AgeGrps5Yr, ageLast, "TOTAL"))
while(counter < VarSexTotal) {
## in case there are more than 2 non-total sexes
temp <- OutputData5 %>% dplyr::filter(Sex == counter)
TotalSex[, ageGrpCols] <- TotalSex[, ageGrpCols] + temp[, ageGrpCols] ## for all age groups, sum this sex in
counter <- counter + 1
TotalSex$Sex <- counter ## needed for when there are more than 2 non-total Sexes (e.g., 1+2+3 != 4)
rm(temp)
}
## 2W. replace in OutputData5 the updated estimates of 5 year and maximum age group regional total values for VarSexTotal
for(i in seq_along(AgeGrps5Yr)) {
OutputData5[OutputData5$Sex == counter, names(OutputData5) == AgeGrps5Yr[i]] <- TotalSex[, names(TotalSex) == AgeGrps5Yr[i]]
}
OutputData5[OutputData5$Sex == counter, names(OutputData5) == ageLast] <- TotalSex[, names(TotalSex) == ageLast]
if(saveInterimFiles == TRUE) {
## save as interim file
readr::write_csv(OutputData5, here::here("outputs", "interim_files",
paste0("OutputData5_2W_Sex", CurrSex,
"_updated_initial_estimates_Sex_Region_Age_Groups.csv")))
}
### ************************************ Check Point ************************************** ###
## At this point, Region rows (across 5 year age groups) sum to their control total, and
## 5 year age group columns sum to their control totals
## sum(OutputData5[OutputData5$Sex == 1, 3:23]) == CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 1] ## should be TRUE
## sum(OutputData5[OutputData5$Sex == 2, 3:23]) == CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 2] ## should be TRUE
## sum(OutputData5[OutputData5$Sex == 3, 3:23]) == CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 3] ## should be TRUE
## all(OutputData5[OutputData5$Sex == 3, "TOTAL"] == CtrlRegionTotals$TOTAL) ## should be TRUE
## sum(CtrlPopTotals[CtrlPopTotals$Sex == 1, 2:6]) - sum(OutputData5[OutputData5$Sex == 1, 3]) ## should be zero
## sum(CtrlPopTotals[CtrlPopTotals$Sex == 1, 102]) - sum(OutputData5[OutputData5$Sex == 1, 23]) ## should be zero
### *************************************************************************************** ###
## 2X. clean up
rm(CurrSex, i, TotalSex, counter)
if(writeRakingLog == TRUE) {
## 2Y. add message to raking_log
raking_log[nrow(raking_log)+1, 1] <- "Part 2: Now, 5 year and maximum age group values have been updated for each Sex and Region. See 'outputs\\interim_files' if interested."
}
#### Part 3 ----
## Part 3. Updating initial single year of age estimates, by Sex
## Part 3.0: prep ----
## 3A. set number of single ages (minus Sex and TOTAL columns) and CurrSex
## Already set Sexes, n_AgeGrps, AgeGrpMax, AgeGrpsOldest, ageLast, AgeGrps5Yr, etc. in Part 2
n_Ages <- dim(CtrlPopTotals)[2] - 2
CurrSex <- Sexes[1]
## Part 3.1-3.3: raking done in while() loop for non-total sexes, for() each AgeGrps5Yr ----
## 3B. repeat for all Sexes, except Total
while(CurrSex < VarSexTotal) {
# age <- 1 ## this is actually age GROUP (redundant with seq_along code below)
for(age in seq_along(AgeGrps5Yr)) {
## Part 3.1: prorate rows ----
## 3C. set relevant single ages: CurrAgeGrp and AgeSingles
CurrAgeGrp <- AgeGrps5Yr[age]
AgeBegin <- stringr::str_split(string = CurrAgeGrp, pattern = "-")[[1]][1]
AgeEnd <- stringr::str_split(string = CurrAgeGrp, pattern = "-")[[1]][2]
AgeSingles <- AgeBegin:AgeEnd
rm(AgeBegin, AgeEnd)
## 3D. filter input data by CurrSex and AgeSingles
## "OutputData" IS input data at this point as only the TOTAL column has been updated
data <- OutputData %>%
dplyr::rename(Sex = {{VarSex}}, VarRow = {{VarRegion}}) %>%
dplyr::filter(Sex == CurrSex) %>%
dplyr::select(VarRow, Sex, as.character(AgeSingles[1]), as.character(AgeSingles[2]),
as.character(AgeSingles[3]), as.character(AgeSingles[4]), as.character(AgeSingles[5]))
### ** Rake across rows to match row totals: PrivateReconcileTotalsByAdding, aka, prorate **
## 3E. calc necessary columns (raking by addition prep)
## set up: rows = regions w/ last row = Ctrl_TOTAL; cols = single ages, TOTAL (from Part2)
## Step 1: calc actual sum, add in VarRow (i.e., Region) control totals, calc difference
## Step 2: calc adjustment value (difference divided by number of groups)
## temporarily save CurrAgeGrp column from OutputData5 (i.e., Ctrl_TOTAL for current Sex and CurrAgeGrp)
## OutputData5 has been updated (in Part 2)
temp <- OutputData5 %>%
dplyr::rename(Sex = {{VarSex}}) %>%
dplyr::filter(Sex == CurrSex) %>%
dplyr::select(VarRow = Region, TOTAL = tidyselect::all_of(CurrAgeGrp)) ## this is 1:n_Regions X 2 cols (VarRow and updated 5-yr age group)
## drop Sex column for now (it's just all = CurrSex) so calc.cols() works properly
data <- data %>% dplyr::select(-{{VarSex}})
## set n_colGrps (batch of 5 single age cols) and n_rowGrps (region rows)
n_colGrps <- length(AgeSingles)
n_rowGrps <- n_Regions
## calc columns (Sum, Ctrl_TOTAL, Diff, adj_value)
data <- calc.cols(data, temp, VarRow, n_colGrps)
## this has cols: VarRow, five age columns, Sum, Ctrl_TOTAL, Diff, adj_value
## 3F. prorate row by row (i.e., for 1:n_Regions, prorate so region totals sum to region control totals)
for (i in 1:n_rowGrps) {
## Step 3: add/subtract adjustment value to/from actual data to get first interim value
## Step 4: repeat Steps 1 through 3 while difference is not zero
CurrRow <- data[i, ]
## WHILE difference is NOT zero, adjust actual data
while(abs(CurrRow$Diff) > 0.0000000001) {
CurrRow <- prorate.row(CurrRow, n_colGrps, allowNegatives)
}
## ensure all numbers are integers (i.e., no fractional people allowed)
CurrRow[, 2:(n_colGrps + 1)] <- real.to.int(realNums = CurrRow[, 2:(n_colGrps + 1)]) ## 1 row of cols ~ vector
## replace actual data with adjusted data in CurrRow
data[i, ] <- CurrRow
}
data$adj_value <- NULL
## clean up
rm(temp, i, CurrRow)
### ************************************ Check Point ************************************** ###
## At this point, columns across each row (Region) sum to their control total,
## BUT rows down each column (single ages) do NOT (necessarily) sum to their control totals (AgeGrp)
## sum(data[1:n_rowGrps, 2:(n_colGrps+1)]) - sum(OutputData5[1:n_Regions, CurrAgeGrp]) ## should be zero
## sum(data[,names(data) %in% AgeSingles]) - sum(CtrlPopTotals[CtrlPopTotals$Sex == CurrSex, names(CtrlPopTotals) %in% AgeSingles]) ## should be zero
## example: sum(data[,names(data) %in% AgeSingles[1]]) - CtrlPopTotals[CtrlPopTotals$Sex == CurrSex, names(CtrlPopTotals) == AgeSingles[1]] ## likely not zero
### *************************************************************************************** ###
if(allowNegatives == FALSE & !all(data[,names(data) %in% AgeSingles] >= 0)) {
message(paste0("Error 3.1: (prorating single age data, rows): Negatives are not allowed, ",
"but exist in Age group ", CurrAgeGrp, " after prorating rows."))
}
## Part 3.2: prorate columns ----
### ** Rake down columns to match column totals: PrivateReconcileTotalsByAdding, aka, prorate **
## 3G. prep to prorate columns
## single age Ctrl_TOTALs for CurrSex
temp <- CtrlPopTotals %>%
dplyr::filter(Sex == CurrSex) %>%
dplyr::mutate(VarRow = "Ctrl_TOTAL") %>%
dplyr::select(VarRow, which(names(CtrlPopTotals) %in% AgeSingles))
n_colGrps <- length(AgeSingles)
n_rowGrps <- n_Regions
## 3H. calculate necessary rows: Sum (colSums), Ctrl_TOTAL (temp), Diff (subtraction), adj_value (division)
## Step 1: calc actual sum, add in VarRow control totals, calc difference
## Step 2: calc adjustment value (difference divided by number of groups)
dataCols <- prep.prorate.col(data, n_rowGrps, colGrps = 1:(n_colGrps+1),
ctrl_total_row = temp, AgeGrpMax = NULL, ageLast = NULL)
## 3I. prorate column by column
for (i in 2:(n_colGrps+1)) {
## Step 3: add/subtract adjustment value to/from actual data to get first interim value
## Step 4: repeat Steps 1 through 3 while difference is not zero
CurrCol <- dataCols[ ,c(1, i)]
## WHILE difference is NOT zero, adjust actual data
while(abs(CurrCol[which(CurrCol[, 1] == "Diff"), -1]) > 0.0000000001) {
CurrCol <- prorate.col(CurrCol, n_rowGrps, allowNegatives)
}
## ensure all numbers are integers (i.e., no fractional people allowed)
if("tbl" %in% class(CurrCol)) {
CurrCol[1:n_rowGrps, -1] <- real.to.int(realNums = dplyr::pull(CurrCol[1:n_rowGrps, -1]))
} else {
CurrCol[1:n_rowGrps, -1] <- real.to.int(realNums = CurrCol[1:n_rowGrps, -1])
}
## replace actual data with adjusted data in CurrCol
dataCols[ ,i] <- CurrCol[,-1]
}
## remove no-longer-needed objects
rm(i, CurrCol, temp)
### ************************************ Check Point ************************************** ###
## At this point, columns across each row (Region) sum close to their control total,
## and rows down each column (single ages) sum to their control totals (age)
## sum(dataCols[1,-1]) - OutputData5[CurrSex, CurrAgeGrp] ## possibly no longer zero, but close
## sum(dataCols[1:n_Regions, names(dataCols) == AgeSingles[1]]) - CtrlPopTotals[CurrSex, names(CtrlPopTotals) == AgeSingles[1]] ## should be zero
## sum(dataCols[1:n_Regions, names(dataCols) == AgeSingles[2]]) - CtrlPopTotals[CurrSex, names(CtrlPopTotals) == AgeSingles[2]] ## should be zero
## sum(dataCols[1:n_Regions, names(dataCols) == AgeSingles[3]]) - CtrlPopTotals[CurrSex, names(CtrlPopTotals) == AgeSingles[3]] ## should be zero
## sum(dataCols[1:n_Regions, names(dataCols) == AgeSingles[4]]) - CtrlPopTotals[CurrSex, names(CtrlPopTotals) == AgeSingles[4]] ## should be zero
## sum(dataCols[1:n_Regions, names(dataCols) == AgeSingles[5]]) - CtrlPopTotals[CurrSex, names(CtrlPopTotals) == AgeSingles[5]] ## should be zero
### *************************************************************************************** ###
if(allowNegatives == FALSE & !all(dataCols[1:n_Regions, names(dataCols) %in% AgeSingles] >= 0)) {
stop(paste0("Error 3.2: (prorating single age data, rows): Negatives are not allowed, ",
"but exist in Age group ", CurrAgeGrp, " after prorating columns"))
}
## Part 3.3: rake prorated data ----
## 3J. calc necessary columns: Sum, Ctrl_TOTAL, Diff, adj_value
## Step 1: calc actual sum, add in VarRow control totals, calc difference
## Step 2: calc adjustment value (difference divided by number of groups)
## Region Ctrl_TOTALs for CurrSex and CurrAgeGrp
temp <- OutputData5 %>%
dplyr::filter(Sex == CurrSex) %>%
dplyr::select(VarRow = Region, TOTAL = tidyselect::all_of(CurrAgeGrp))
dataCols <- calc.cols(data = dataCols, temp, VarRow, n_colGrps)
dataCols$adj_value <- NULL
rm(temp)
### ** Reorder rows, if necessary **
## 3K. sort (data) rows in ascending order (min -> max) of row totals (i.e., Sum column)
dataCols[1:n_rowGrps, ] <- dplyr::arrange(dataCols[1:n_rowGrps, ], Sum)
### ** Rake the data: **PrivateRakePreprocessedData, aka, rake **
### rake over all rows except for last row (it should be done by default)
## At this point, n_colGrps = the five single ages and n_rowGrps = n_Regions
## 3L. set CurrRow_value, and pull CurrRow from data to work on, iterate over rows
CurrRow_value <- 1 ## first row in data first time through; CurrRow[1,]
## note: this does NOT run when there is only ONE n_rowGrps! (e.g., raking VI, VI_NO_CRD)
while(CurrRow_value < n_rowGrps) {
CurrRow <- dataCols[CurrRow_value, ]
## set up new row for Adjustments; initially, all adjustments = 0
CurrRow[2, 1] <- "Adjustments"
CurrRow[2, 2:(n_colGrps+1)] <- 0
## set up new row for ADJUSTED current row; = CurrRow + Adjustments
CurrRow[3, 1] <- "AdjCurrRow"
CurrRow[3, 2:(n_colGrps+1)] <- CurrRow[1, 2:(n_colGrps+1)] + CurrRow[2, 2:(n_colGrps+1)]
## create RowAdj to save all adjustments made that need to be made at end to the last row/reflected in data
RowAdj <- rep(0, times = n_colGrps)
## determine whether need Margin or not
if(allowNegatives == FALSE) {
## if negatives NOT allowed, determine whether we need Margin or not
if(CurrRow$Sum[1] < CurrRow$Ctrl_TOTAL[1]) {
needMargin <- TRUE ## algorithm C: if values need to be added & negatives not allowed
} else {
needMargin <- FALSE ## algorithm B: if values need to be subtracted & negatives not allowed
}
} else {
## else if negatives ARE allowed, Margin is not needed
needMargin <- FALSE ## algorithm A: if negatives ARE allowed
}
## 3M. choose and run appropriate algorithm:
## 3N. calc "whole people" adjustments to be made to all cells in the row
## 3O. calc "residual people" adjustments to be made to selected cells in row
## 3P. reconcile column control totals (p.18-23)
if(allowNegatives == TRUE) {
## p.12: A. Adjust cells in the current row if negative values are allowed
dataCols <- allowNegsnoMargin(CurrRow, CurrRow_value, data = dataCols, n_colGrps, n_rowGrps, RowAdj)
} else {
if(needMargin == FALSE) {
## p.13: B. Adjust cells in the current row if values need to be taken away, negatives not allowed
## "If the sum of ages is greater than the control total, then values need to be taken away from
## age groups for the current row and added to age groups for rows below the current row."
dataCols <- noNegsnoMargin(CurrRow, CurrRow_value, data = dataCols, n_colGrps, n_rowGrps, RowAdj)
} else {
## i.e., needMargin == TRUE, allowNegatives == TRUE
## p.14: C. Adjust cells in the current row if values need to be added, negatives not allowed
## "If the sum of ages is less than the control total, then values needed to be added to age
## groups for the current row, and taken away from age groups for rows below the current row."
dataCols <- noNegsneedMargin(CurrRow, CurrRow_value, data = dataCols, n_colGrps, n_rowGrps, RowAdj, needMargin)
}
}
rm(CurrRow, RowAdj, needMargin)
## update CurrRow_value
CurrRow_value <- CurrRow_value + 1
}
## 3Q. adjust last row and update Sum & Diff rows
dataCols[which(dataCols[, 1] == "Sum"), 2:(n_colGrps+1)] <- as.list(colSums(dataCols[1:n_rowGrps, 2:(n_colGrps+1)]))
dataCols[which(dataCols[, 1] == "Diff"), 2:(n_colGrps+1)] <- (dataCols[which(dataCols[, 1] == "Ctrl_TOTAL"), 2:(n_colGrps+1)]
- dataCols[which(dataCols[, 1] == "Sum"), 2:(n_colGrps+1)])
dataCols[CurrRow_value, 2:(n_colGrps+1)] <- (dataCols[CurrRow_value, 2:(n_colGrps+1)]
+ dataCols[which(dataCols[, 1] == "Diff"), 2:(n_colGrps+1)])
dataCols$Sum[CurrRow_value] <- rowSums(dataCols[CurrRow_value, 2:(n_colGrps+1)])
dataCols$Diff[CurrRow_value] <- dataCols$Ctrl_TOTAL[CurrRow_value] - dataCols$Sum[CurrRow_value]
dataCols[which(dataCols[, 1] == "Sum"), 2:(n_colGrps+1)] <- as.list(colSums(dataCols[1:n_rowGrps, 2:(n_colGrps+1)]))
dataCols[which(dataCols[, 1] == "Diff"), 2:(n_colGrps+1)] <- (dataCols[which(dataCols[, 1] == "Ctrl_TOTAL"), 2:(n_colGrps+1)]
- dataCols[which(dataCols[, 1] == "Sum"), 2:(n_colGrps+1)])
### ** Put the data back in order, if necessary **
## 3R. re-sort rows by InputData order
rows_order <- InputData %>%
dplyr::filter(Sex == CurrSex) %>%
dplyr::select(VarRow = which(names(InputData) == VarRegion)) %>%
dplyr::mutate(row_order = dplyr::row_number(),
VarRow = as.character(VarRow))
dataCols <- dplyr::left_join(dataCols, rows_order, by = "VarRow")
dataCols[1:n_rowGrps, ] <- dplyr::arrange(dataCols[1:n_rowGrps, ], row_order)
dataCols$row_order <- NULL
### ************************************ Check Point ************************************** ###
## At this point, Region rows (across 5 year age groups) sum to their control total, and
## 5 year age group columns sum to their control totals, AND now single age values for
## the CurrAgeGrp and CurrSex should sum to their control totals (for each region).
### *************************************************************************************** ###
testAll <- sum(dataCols[1:n_rowGrps, which(names(dataCols) %in% AgeSingles)]) == sum(OutputData5[OutputData5$Sex == CurrSex, AgeGrps5Yr[age]])
testAge1 <- sum(dataCols[1:n_rowGrps, which(names(dataCols) %in% AgeSingles[1])]) == CtrlPopTotals[CtrlPopTotals$Sex == CurrSex, which(names(CtrlPopTotals) == AgeSingles[1])]
testAge2 <- sum(dataCols[1:n_rowGrps, which(names(dataCols) %in% AgeSingles[2])]) == CtrlPopTotals[CtrlPopTotals$Sex == CurrSex, which(names(CtrlPopTotals) == AgeSingles[2])]
testAge3 <- sum(dataCols[1:n_rowGrps, which(names(dataCols) %in% AgeSingles[3])]) == CtrlPopTotals[CtrlPopTotals$Sex == CurrSex, which(names(CtrlPopTotals) == AgeSingles[3])]
testAge4 <- sum(dataCols[1:n_rowGrps, which(names(dataCols) %in% AgeSingles[4])]) == CtrlPopTotals[CtrlPopTotals$Sex == CurrSex, which(names(CtrlPopTotals) == AgeSingles[4])]
testAge5 <- sum(dataCols[1:n_rowGrps, which(names(dataCols) %in% AgeSingles[5])]) == CtrlPopTotals[CtrlPopTotals$Sex == CurrSex, which(names(CtrlPopTotals) == AgeSingles[5])]
if(allowNegatives == FALSE & !all(dataCols[1:n_rowGrps, which(names(dataCols) %in% AgeSingles)] >= 0)) {
message(paste0("Error 3.3 (raking single age data): Age group ", CurrAgeGrp,
" is NOT raked ok. Negatives are not allowed, but exist in raked data."))
}
if(all(testAll, testAge1, testAge2, testAge3, testAge4, testAge5) == TRUE) {
if(writeRakingLog == TRUE) {
## add message to raking_log
raking_log[nrow(raking_log)+1, 1] <- paste0("Part 3, Sex ", CurrSex, ": Single age values ",
"(for each region) in age group ", CurrAgeGrp,
" are raked ok.")
}
rm(testAll, testAge1, testAge2, testAge3, testAge4, testAge5)
} else {
message(paste0("Error 3.3 (raking single age data): Age group ", CurrAgeGrp,
" is NOT raked ok. Something does not balance to one or more Control Total(s)."))
}
## 3S. update OutputData with changes for this CurrSex & CurrAgeGrp
## get just data rows and columns
temp <- dataCols[1:n_rowGrps, 2:(n_colGrps+1)]
## replace in OutputData the now raked single age estimates for CurrSex & CurrAgeGrp
OutputData[OutputData$Sex == CurrSex, as.character(AgeSingles)] <- temp
## clean up
rm(temp, data, CurrRow_value, dataCols, rows_order, n_colGrps, n_rowGrps, CurrAgeGrp, AgeSingles)
## 3T. move to next age in AgeGrps5Yr
#age <- age + 1
} ## end for loop through AgeGrps5Yr
## replace ageLast values with those run in Part 2 (i.e., saved in OutputData5)
OutputData[OutputData$Sex == CurrSex, as.character(ageLast)] <- OutputData5[OutputData5$Sex == CurrSex, as.character(ageLast)]
if(saveInterimFiles == TRUE) {
## save as interim file
readr::write_csv(OutputData, here::here("outputs", "interim_files",
paste0("OutputData_3S_Sex", CurrSex,
"_raked_estimates_Sex_Region_Ages.csv")))
}
## 3U. move to next Sex
CurrSex <- CurrSex + 1
} ## end while loop through non-total Sexes
## Part 3.4: update total Sex values, by single year ages ----
## 3V. sum all Sexes to get updated total Sex values
TotalSex <- OutputData %>% dplyr::filter(Sex == 1) ## initiate with first Sex value
counter <- 2 ## set counter to second Sex value
ageSingleCols <- names(OutputData)[stringr::str_detect(names(OutputData), "-", negate = TRUE)]
ageSingleCols <- ageSingleCols[ageSingleCols %in% -999:999]
while(counter < VarSexTotal) {
## in case there are more than 2 non-total sexes
temp <- OutputData %>% dplyr::filter(Sex == counter)
TotalSex[, c(ageSingleCols, "TOTAL")] <- TotalSex[, c(ageSingleCols, "TOTAL")] + temp[, c(ageSingleCols, "TOTAL")] ## for all age groups, sum this sex in
rm(temp)
counter <- counter + 1
TotalSex$Sex <- counter ## needed for when there are more than 2 non-total Sexes (e.g., 1+2+3 != 4)
}
## 3W. replace in OutputData the updated estimates of single age group regional total values for VarSexTotal
OutputData[OutputData$Sex == counter, ] <- TotalSex
if(saveInterimFiles == TRUE) {
## save as interim file
readr::write_csv(OutputData, here::here("outputs", "interim_files",
paste0("OutputData_3S_Sex", CurrSex,
"_raked_estimates_Sex_Region_Ages.csv")))
}
## 3X. final check point
### ************************************ Check Point ************************************** ###
## At this point, columns across each row (Region) sum to their control total,
## AND rows down each column (single ages) sum to their control totals (AgeGrp),
## AND no cells are negative (if allowNegatives = FALSE).
## sum(OutputData[1:n_Regions, 3:(n_Ages+2)]) == CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == 1] ## TRUE
## OutputData$TOTAL[OutputData$Sex == VarSexTotal] == CtrlRegionTotals$TOTAL ## all TRUE
## any(OutputData < 0) ## need FALSE if allowNegatives = FALSE
## all(OutputData >= 0) ## need TRUE if allowNegatives = FALSE
### *************************************************************************************** ###
# ageSingleCols <- names(OutputData)[stringr::str_detect(names(OutputData), "-", negate = TRUE)]
# ageSingleCols <- ageSingleCols[ageSingleCols %in% -999:999]
testCols <- vector(length = 0)
for(i in seq_along(Sexes)) {
testCols[i] <- sum(OutputData[OutputData$Sex == Sexes[i], ageSingleCols]) ==
CtrlPopTotals$TOTAL[CtrlPopTotals$Sex == Sexes[i]]
}; rm(i)
testRows <- all((OutputData$TOTAL[OutputData$Sex == VarSexTotal] == CtrlRegionTotals$TOTAL) == TRUE)
if(allowNegatives == FALSE & any(OutputData < 0) == FALSE) {
testCells <- TRUE
} else {
if(allowNegatives == FALSE & any(OutputData < 0) == TRUE) {
testCells <- FALSE
} else {
testCells <- TRUE
}
}
if(all(c(testCols, testRows, testCells) == TRUE)) {
## add Year column if needed
if(exists("yr_OutputData")) {
OutputData <- OutputData %>% dplyr::mutate(Year = yr_OutputData) %>%
dplyr::select(Year, tidyselect::everything())
if(writeRakingLog == TRUE) {
message("The 'Year' column has been added back to the data.")
}
}
if(writeOutputFile == TRUE) {
## write final (raked) output file
readr::write_csv(OutputData, here::here("outputs", "RakedData.csv"))
message("Data has been successfully raked. See 'outputs' for 'RakedData.csv'.")
} else { message(" Data has been successfully raked.") }
if(writeRakingLog == TRUE) {
# add message to raking_log
raking_log[nrow(raking_log)+1, 1] <- "SUCCESS. Data has been successfully raked."
}
rm(testCols, testRows, testCells)
rm(CurrSex, TotalSex, counter, age)
} else {
if(writeRakingLog == TRUE) {
raking_log[nrow(raking_log)+1, 1] <- "FAIL. Something has gone wrong. Check tests."
for(i in seq_along(testCols)){
if(testCols[i] == FALSE) { raking_log[nrow(raking_log)+1, 1] <-
paste0("The sum of raked columns in Sex ", i, " does not match Population Control Total.") }
}
# if(testCols1 == FALSE) { raking_log[nrow(raking_log)+1, 1] <- ("The sum of raked columns in Sex 1 does not match Population Control Total.") }
# if(testCols2 == FALSE) { raking_log[nrow(raking_log)+1, 1] <- ("The sum of raked columns in Sex 2 does not match Population Control Total.") }
# if(testCols3 == FALSE) { raking_log[nrow(raking_log)+1, 1] <- ("The sum of raked columns in Sex 3 does not match Population Control Total.") }
if(testRows == FALSE) { raking_log[nrow(raking_log)+1, 1] <- ("One or more rows do not sum to its/their Region Control Total(s).") }
if(testCells == FALSE) { raking_log[nrow(raking_log)+1, 1] <- ("Negatives are NOT allowed, but one or more raked values are negative.") }
}
message("FAIL. Raking has stopped. One or more final checks did not pass: ")
for(i in seq_along(testCols)){
if(testCols[i] == FALSE) { raking_log[nrow(raking_log)+1, 1] <-
message(paste0("The sum of raked columns in Sex ", i, " does not match Population Control Total.")) }
}
# if(testCols1 == FALSE) { message("The sum of raked columns in Sex 1 does not match Population Control Total.") }
# if(testCols2 == FALSE) { message("The sum of raked columns in Sex 2 does not match Population Control Total.") }
# if(testCols3 == FALSE) { message("The sum of raked columns in Sex 3 does not match Population Control Total.") }
if(testRows == FALSE) { message("One or more rows do not sum to its/their Region Control Total(s).") }
if(testCells == FALSE) { message("Negatives are NOT allowed, but one or more raked values are negative.") }
utils::View(OutputData)
readr::write_csv(OutputData, here::here("outputs", "RakedData_failed.csv"))
}
if(writeRakingLog == TRUE) {
readr::write_csv(raking_log, here::here("outputs", "raking_log.csv"))
}
#### DONE ----
if(writeRakingLog == TRUE) {
return(list(RakedData = OutputData,
RakingLog = raking_log))
} else {
return(list(RakedData = OutputData))
}
}
#### multiRake ----
#' Rake over multiple years
#'
#' The raking function, \code{\link{dbRake}}, will run for one year of data but the data often needs
#' to be raked for more than one year. In \strong{multiRake}, arguments include \strong{Years} in
#' the data, any \strong{censusYears} that should not be raked as these are generally considered
#' definitive, name of \strong{InputData}, name of \strong{CtrlPopTotals}, either name of
#' \strong{CtrlRegionTotals} or default of NULL, name of region variable in all data, and whether
#' you want to change any arguments required by \code{\link{dbRake}}.
#' This is a helper function used to run \code{\link{dbRake}} for multiple years.
#'
#' @param years Vector of all years in the data (e.g., Years = 2011:2020).
#' @param censusYears Any year(s) that are a census year and should not be raked (e.g., 2011, 2016).
#' Default is FALSE. If all years need to be raked, set as FALSE.
#' @param InputData Name of .xlsx or .csv file that contains input data to be raked.
#' This file is assumed to have Region (e.g., "TypeID", "CHSA") by Sex (e.g., 1, 2, 3) as rows, and
#' Ages (e.g., 0, 1, 2, ..., TOTAL) as columns. Values are population counts.
#' @param CtrlPopTotals Name of .xlsx or .csv file that contains overall control totals
#' (e.g., "BC AS TOTALS.xlsx"). This file is assumed to have Sex (e.g., 1, 2, 3) as rows and
#' Ages (e.g., 0, 1, 2, ..., TOTAL) as columns. Values are population counts.
#' This file typically has dimensions of 3 (obs) by 103 variables.
#' @param CtrlRegionTotals Name of .xlsx or .csv file that contains overall control totals
#' (e.g., "CHSA TOTALS.xlsx"). Default = NULL. This file is assumed to have Region (e.g., 218 CHSAs)
#' as the first column and TOTAL (population counts) as the second column; this file is not broken out
#' by Sex or Age. This file typically has dimensions of n (obs) by 2 variables, where "n" is the
#' number of individual regions (e.g., 218 for CHSA). If no name is provided (i.e., NULL), then region
#' control totals are not used. Instead, the InputData will be used to generate "control" totals.
#' @param VarRegion Name of Region variable in all files (e.g., "TypeID", "CHSA").
#' @param change_rake_args Logical value whether any remaining raking argument defaults need to be
#' changed. Default = FALSE. If set to TRUE, user will be asked to set the following arguments:
#' \strong{CtrlAgeGrpsTotals} (default = NULL);
#' \strong{VarSex} (otherwise pre-specified as "Sex");
#' \strong{VarSexTotal} (otherwise pre-specified from data's column names);
#' \strong{AgeGrpMax} (default = NULL which would trigger \code{\link{dbRake}} to use age 75 if
#' exists; however, multiRake sets this to the strongly recommended age 75);
#' \strong{allowNegatives} (default = FALSE, should only be TRUE for migration data);
#' \strong{saveInterimFiles} (default = FALSE);
#' \strong{writeOutputFile} (default = FALSE);
#' \strong{writeRakingLog} (default = FALSE);
#' \strong{readFiles} (default = FALSE which will use files already in environment; if files need
#' to be read in, set to TRUE).
#' @return RakedData.csv will be saved to "outputs" folder (which will be created if one does not
#' already exist). If set to TRUE, various interim files will be saved in an "interim_files" folder
#' within "outputs". If set to TRUE, a log file ("raking_log.csv") will also be saved to the
#' "outputs" folder.
#' @examples
#' \dontrun{ ## if files need to be read in, set 'change_rake_args' to TRUE
#' multiRake(years = 2011:2020, censusYears = c(2011, 2016),
#' InputData = "POPHAE19.xlsx", CtrlPopTotals = "BC AS TOTALS.xlsx",
#' CtrlRegionTotals = "LHA TOTALS.xlsx", VarRegion = "LHA",
#' change_rake_args = TRUE) } ## two census years not to be raked
#' \dontrun{ multiRake(years = 2012:2016, censusYears = FALSE,
#' InputData = "POPHAE19.xlsx", CtrlPopTotals = "BC AS TOTALS.xlsx",
#' CtrlRegionTotals = "LHA TOTALS.xlsx", VarRegion = "LHA",
#' change_rake_args = FALSE) } ## all years need to be raked
#' @family raking helpers
#' @seealso The overall raking function: \code{\link{dbRake}}()
#' @author Julie Hawkins, BC Stats
#' @export
multiRake <- function(years, censusYears = FALSE, InputData, CtrlPopTotals, CtrlRegionTotals = NULL,
VarRegion, change_rake_args = FALSE) {
## PREP ----
## 1. get/set raking arguments
if(change_rake_args == TRUE) {
## ask user for raking arguments
message("You set change_raking_args to TRUE. Please set them now. What do you want to use for: ")
CtrlAgeGrpsTotals <- readline(prompt = "CtrlAgeGrpsTotals: (NULL or name of .xlsx or .csv file of initial 5 year age group totals.) ");
VarSex <- readline(prompt = "VarSex: (Name of Sex variable in database) ");
VarSexTotal <- readline(prompt = "VarSexTotal: (Value of Sex Total (e.g., 3)) ")
AgeGrpMax <- readline(prompt = "AgeGrpMax: (NULL or an age ending in 0 or 5; recommend 75) ")
allowNegatives <- readline(prompt = "allowNegatives: (TRUE or FALSE) ")
saveInterimFiles <- readline(prompt = "saveInterimFiles: (TRUE or FALSE) ")
writeOutputFile <- readline(prompt = "writeOutputFile: (TRUE or FALSE) ")
writeRakingLog <- readline(prompt = "writeRakingLog: (TRUE or FALSE) ")
readFiles <- readline(prompt = "readFiles: (TRUE or FALSE) ")
} else {
## set raking arguments (use dbRake() defaults)
CtrlAgeGrpsTotals <- NULL
VarSex <- "Sex"
VarSexTotal <- length(unique(InputData$Sex))
AgeGrpMax <- 75
allowNegatives <- FALSE
saveInterimFiles <- FALSE
writeOutputFile <- FALSE
writeRakingLog <- FALSE
readFiles <- FALSE
}
## 2. if any year(s) should not be raked (i.e., definitive census years), remove them
if(length(censusYears) == 1) {
if(censusYears == FALSE) {
years_rake <- years
} else {
years_rake <- years[!years %in% censusYears]
}
} else if(length(censusYears) > 1) {
years_rake <- years[!years %in% censusYears]
}
## 3. create lists to hold raked data and log for each year being iterated
raked_all <- vector(mode = "list", length = length(years_rake))
raking_log_all <- vector(mode = "list", length = length(years_rake))
## RAKING ----
## 4. iterate dbRake over years_rake
for(yr in seq_along(years_rake)) {
## get years_rake[yr]'s input data and population control totals
data_to_rake <- InputData %>% dplyr::filter(Year == years_rake[yr]) %>% dplyr::select(-Year)
control_totals <- CtrlPopTotals %>% dplyr::filter(Year == years_rake[yr]) %>% dplyr::select(-Year)
## get years_rake[yr]'s region control totals, as specified
if(!is.null(CtrlRegionTotals)) {
ctrl_reg_totals <- CtrlRegionTotals %>% dplyr::filter(Year == years_rake[yr]) %>% dplyr::select(-Year)
} else {
ctrl_reg_totals <- NULL
}
## print message of Year being raked, and rake that year
message(paste0("Year ", years_rake[yr]))
raked <- dbutils::dbRake(InputData = data_to_rake, CtrlPopTotals = control_totals,
CtrlRegionTotals = ctrl_reg_totals, CtrlAgeGrpsTotals,
VarRegion, VarSex, VarSexTotal, AgeGrpMax, allowNegatives,
saveInterimFiles, writeRakingLog, writeOutputFile, readFiles)
## add raked data to list
raked_all[[yr]] <- raked[["RakedData"]]
if(writeRakingLog == TRUE) { raking_log_all[[yr]] <- raked[["RakingLog"]] }
rm(data_to_rake, control_totals, ctrl_reg_totals, raked)
}
## FORMATTING ----
## 5a. add back Year, merge all Years of now-raked data
data_done <- purrr::map(.x = 1:length(years_rake), ~ dplyr::mutate(raked_all[[.]], Year = years_rake[.x]))
data_done <- purrr::map_dfr(.x = 1:length(years_rake), ~ dplyr::bind_rows(data_done[[.]])) %>%
dplyr::select(Year, tidyselect::everything())
## 5b. ensure that region variable is character
InputData <- InputData %>%
dplyr::rename(temp = {{VarRegion}}) %>%
dplyr::mutate(temp = as.character(temp)) %>%
dplyr::rename({{VarRegion}} := temp)
## 5c. if any year(s) were not raked (i.e., definitive census years), add them to raked data
if(length(censusYears) == 1) {
if(censusYears != FALSE) {
data_done <- dplyr::bind_rows(data_done, InputData %>% dplyr::filter(Year %in% censusYears))
}
} else if(length(censusYears) > 1) {
data_done <- dplyr::bind_rows(data_done, InputData %>% dplyr::filter(Year %in% censusYears))
}
## 6. sort final data
data_done <- data_done %>% dplyr::arrange(Year, Sex, TypeID)
## 7. combine and write raking log, if specified
if(writeRakingLog == TRUE) {
raking_log <- purrr::map(.x = 1:length(years_rake), ~ dplyr::mutate(raking_log_all[[.]], Year = years_rake[.x]))
raking_log <- purrr::map_dfr(.x = 1:length(years_rake), ~ dplyr::bind_rows(raking_log[[.]])) %>%
dplyr::select(Year, message)
## if any year(s) were not raked (i.e., definitive census years), add message(s) to log
if(length(censusYears) == 1) {
if(censusYears != FALSE) {
temp <- data.frame(Year = censusYears, message = "Data was not asked to be raked.",
stringsAsFactors = FALSE)
raking_log <- dplyr::bind_rows(raking_log, temp) %>% dplyr::arrange(Year); rm(temp)
}
} else if(length(censusYears) > 1) {
temp <- data.frame(Year = censusYears, message = "Data was not asked to be raked.",
stringsAsFactors = FALSE)
raking_log <- dplyr::bind_rows(raking_log, temp) %>% dplyr::arrange(Year); rm(temp)
}
## write log to outputs folder
readr::write_csv(raking_log, here::here("outputs", "raking_log.csv"))
}
## OUTPUT ----
if(writeRakingLog == TRUE) {
return(list(RakedData = data_done,
RakingLog = raking_log))
} else {
return(list(RakedData = data_done))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.