#' @title
#' Update the OFM tables in the POC SQL Server database.
#'
#' @description
#' Imports, cleans, and organizes the data needed for the OFM tables in the POC
#' SQL server database. It then drops and recreates the tables with the cleaned
#' data.
#'
#'
#' After downloading the data from the source, the data is imported as a list of
#' dataframes, one for each xlsx file. The columns of each dataframe are given
#' arbitrary names. We then filter out irrelevant columns, such as area_id.
#'
#' Each data frame needs to be filtered in a unique way:
#'
#' -The total population counts dataframe (total) must include all counts,
#' except the actuall totals. Race based totals are kept for the construction
#' of the dataframe for other ethnic backgrounds.
#'
#' -The hispanic population count dataframe (hispanic) must include only the
#' totals.
#'
#' -The non-hispanic population count dataframe (non_hispanic) must include only
#' black and white counts.
#'
#' -Construction of the other dataframe: The count for other ethnic background
#' (others) is necessary. To construct the dataframe, all but the
#' race based totals from total are filtered. The counts are derived by
#' observing that
#'
#' others count = total count - hispanic count - non_hipanic black
#' - non_hispanic white counts
#'
#' The dataframes are not tidy: column headers contain the variables race and
#' gender. Hence, the dataframes are melted with the colvars: county, age and
#' measurement_year (year). Then the new column is mutated into two seperate
#' columns: race and gender. An additional column is added for source_census.
#'
#' All data frames are filtered row-wise to remove irrelevant age groups and
#' totals.
#'
#' Data is now tidy at this point, but data formatting begins. The counts are
#' rounded and values are replaced by reference codes. Age groups are given in
#' an inconsistent manner, so the data is grouped to get the count for each
#' age group.
#'
#' Once formatting is finished, rows corresponding to 2015 are created by using
#' data from 2014.
#'
#' NOTE: Once 2015 data is available the code requires some upkeep.
#'
#'
#' @param poc_connection Need to provide an active RODBC connection to the
#' POC SQL server. NOTE: It is assumed that the object passed in is meaningful.
#' @param start_year Optional parameter where the user can specify the start
#' year. NOTE: It is assumed that the start year given is meaningful.
#' @param end_year Optional parameter where the user can specify the end
#' year. NOTE: It is assumed that the end year given is meaningful.
#' @return
#' The function updates the POC SQL database and returns TRUE on success.
#'
#' @import stringr
#' @import RODBC
#' @import reshape2
#' @import readxl
#' @import dplyr
#'
#' @export
update_ofm <- function(poc_connection,
start_year = 2000,
end_year = 2014) {
# test here for possible input problems
# we cannot catch all of the errors so it is assumed that users will input
# relevant and meaningful objects.
# Setting the default poc_connection to null allows us to check if something
# is given
poc_connection = validate_RODBC_input(poc_connection)
if(!poc_connection$test_result){
stop(poc_connection$test_message)
}
poc_connection = poc_connection$connection
# It is likely that the data is from the year 2000 and up
if(start_year < 2000 | end_year < 2000){
stop("Any year specified must be 2000 or greater")
}
# Allowing the start year to be greater than the end year violates the
# purpose of the variables
if(start_year > end_year ){
stop("Start year cannot be greater than end year")
}
# create year collection
years <- as.character(start_year:end_year)
# create a collection of files to acquire - this collection will be
# continuously updated and will eventually contain the desired clean
# data
ofm_file_names <- list("hispanic" = 'sade_county10_h_5y_s1.xlsx',
"non_hispanic" = 'sade_county10_n_5y_s1.xlsx')
# download the most current versions of the the OFM files
test <- lapply(ofm_file_names,
function(x) {
download.file(paste0("http://www.ofm.wa.gov/pop/asr/sade/",
x),
destfile = x,
mode = "wb")
}
)
# get the raw data from the xlsx files - this requires reading
# multiple sheets from each file; returns a list of dataframes
ofm_collection <- lapply(ofm_file_names, function(x) {
returned_list <- vector("list" , length(years))
names(returned_list) <- years
for(i in 1:length(years)){
returned_list[[ i ]] <- as.data.frame(
read_excel(x,
sheet = years[ i ],
col_names = FALSE,
skip = 4))
returned_list[[ i ]]$sheetName = years[ i ]
}
return(returned_list)
}
)
# clean out the downloaded files
lapply(ofm_file_names, unlink)
# we want to bind the dataframes in the list together
ofm_collection <- lapply(ofm_collection, function(x) do.call(rbind, x))
# All of the df's have the same sort of columns. We use the symbols below
# for column names. A_ID is Area_id. The column names that use only two
# letters refer to columns names that contain values for two variables. The
# character at the starting index of these names refers to race and the last
# character in the name refers to sex.
# So, for example, T_T refers to a column of counts that are asscoiated with
# rows with a race value of total and a gender value of total. They need to
# be split later. Here are race codes for this set of names
# (gender is obvious).
# A = Asian | B = Black | W = White | T = Total Race | N = Native |
# P = Pacific Islander/Hawian | M = Mixed Race
col_names <- c("county_desc", "A_ID", "age_group", "T_T", "T_M", "T_F",
"W_T" , "W_M" , "W_F" , "B_T" , "B_M" , "B_F",
"N_T" , "N_M" , "N_F" , "A_T" , "A_M" , "A_F",
"P_T" ,"P_M" , "P_F" ,"M_T" , "M_M" , "M_F","Year")
# set col names
ofm_collection <- lapply(ofm_collection, function(x){
colnames(x) <- col_names
return(x)
}
)
# We need to manipulate the df's individually, since we need different
# things from each.
# The only thing neeced from the Hispanic or Latino Population is Total for Male and Female
# we also need to aggregate 15, 16 and 17 together
ofm_collection[[1]] <- select(ofm_collection[[1]], Year, county_desc, age_group, T_M, T_F) %>%
filter(age_group %in% c('0-4', '5-9', '10-14', '15', '16', '17') & county_desc != 'Washington State') %>%
reshape2::melt(id = c("county_desc", "age_group", "Year"),
variable = "column",
value.name = "pop_cnt") %>%
mutate(cd_gndr = ifelse(substring(column, 3) == 'F', 1, 2),
cd_race = 5,
year = Year) %>%
select(year, county_desc, cd_gndr, cd_race, age_group, pop_cnt)
hisp_15_17 <- filter(ofm_collection[[1]], age_group %in% c(15, 16, 17)) %>%
group_by(year, county_desc, cd_gndr, cd_race) %>%
summarize(pop_cnt = sum(pop_cnt)) %>%
mutate(age_group = '15-17') %>%
select(year, county_desc, cd_gndr, cd_race, age_group, pop_cnt)
ofm_collection[[1]] <-filter(ofm_collection[[1]], age_group %in% c('0-4', '5-9', '10-14'))
# Getting totals for other groups
# we also need to aggregate 15, 16 and 17 together
ofm_collection[[2]] <- select(ofm_collection[[2]], -starts_with('T'), -ends_with('T'), -A_ID) %>%
filter(age_group %in% c('0-4', '5-9', '10-14', '15', '16', '17') & county_desc != 'Washington State') %>%
reshape2::melt(id = c("county_desc", "age_group", "Year"),
variable = "column",
value.name = "pop_cnt") %>%
mutate(cd_gndr = ifelse(substring(column, 3) == 'F', 1, 2),
cd_race = ifelse(substring(column, 1, 1) == "A", 2,
ifelse(substring(column, 1, 1) == "B", 3,
ifelse(substring(column, 1, 1) == "W", 6,
ifelse(substring(column, 1, 1) == "N", 1,
ifelse(substring(column, 1, 1) == "P", 4,
ifelse(substring(column, 1, 1) == "M", 7, 8)))))),
year = Year) %>%
select(year, county_desc, cd_gndr, cd_race, age_group, pop_cnt)
non_hisp_15_17 <- filter(ofm_collection[[2]], age_group %in% c(15, 16, 17)) %>%
group_by(year, county_desc, cd_gndr, cd_race) %>%
summarize(pop_cnt = sum(pop_cnt)) %>%
mutate(age_group = '15-17') %>%
select(year, county_desc, cd_gndr, cd_race, age_group, pop_cnt)
ofm_collection[[2]] <-filter(ofm_collection[[2]], age_group %in% c('0-4', '5-9', '10-14'))
# the 4 datasets now need to be put together
raceeth <- rbind(ofm_collection[[1]], ofm_collection[[2]], hisp_15_17, non_hisp_15_17)
# round date to whole numbers
raceeth[[6]] <- round(raceeth[[6]])
# change age grouping to correct poc codes
raceeth[[5]] <- ifelse(raceeth[[5]] == '0-4', 1,
ifelse(raceeth[[5]] == '5-9', 2,
ifelse(raceeth[[5]] == '10-14', 3, 4)))
# tries to drop the previous table if it exists
try(sqlDrop(channel = poc_connection,
sqtable = "public_data.mb_census_population"))
# sends the df to sql
sqlSave(channel = poc_connection, dat = raceeth,
tablename = "public_data.mb_census_population",
rownames = FALSE, colnames = FALSE)
# the end result
return("TRUE")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.