#' Load a census file
#'
#' Reads a census extract with \code{fread()}, cleans the name and age
#' fields, builds the match key, and drops records without usable name or age
#' information.
#'
#' @param census_file string indicating location of file.
#' @param cols_to_keep character vector of additional column names to keep.
#'   Default is \code{NULL}, which keeps just what is needed for the match:
#'   \code{HISTID}, \code{AGE}, \code{SEX}, \code{NAMELAST} and
#'   \code{NAMEFRST}. Names that repeat one of these columns are ignored.
#' @param males_only whether to just keep males (rows where \code{SEX == 1}).
#'   Default is \code{TRUE}.
#' @return a census data.table containing the columns read from the file
#'   (including the unique IPUMS identifier \code{HISTID}), the match key
#'   \code{clean_key}, and \code{n_clean_key}, the number of records sharing
#'   that key. Note that \code{n_clean_key} is counted before the
#'   \code{males_only} filter is applied.
#' @import data.table
#' @import tidyverse
#' @export
load_census_dmf_match <- function(census_file,
                                  cols_to_keep = NULL,
                                  males_only = TRUE) {
  all_cols_to_keep <- c("HISTID", "AGE", "SEX", "NAMELAST", "NAMEFRST")
  if (!is.null(cols_to_keep)) {
    # De-duplicate: a caller asking for a column that is already required
    # (e.g. "SEX") must not hand fread() a repeated entry in `select`.
    all_cols_to_keep <- unique(c(all_cols_to_keep, cols_to_keep))
  }

  # Read in census file, only keeping desired columns.
  census <- fread(census_file, select = all_cols_to_keep)

  # Clean variables: native encoding, upper case, first word of the first name.
  census[, "fname" := get_first_word(str_to_upper(enc2native(NAMEFRST)))]
  census[, "lname" := str_to_upper(enc2native(NAMELAST))]
  census[, "census_age" := as.numeric(AGE)]

  # Remove those with no usable name info: a question mark in either name, or
  # a blank last name. Rows where the condition evaluates to NA are dropped
  # as well, because data.table treats NA in a logical `i` as FALSE.
  census <- census[!(grepl("\\?", fname) | grepl("\\?", lname) | lname == "")]

  # Create key and count how many records share each key.
  census[, "tmp_key" := paste(lname, fname, census_age, sep = "_")]
  census[, "clean_key" := clean_key(tmp_key)]
  census[, "n_clean_key" := .N, by = "clean_key"]

  # Remove rows where fname, lname, or census age is missing.
  census <- na.omit(census, cols = c("fname", "lname", "census_age"))

  # Drop the working columns by reference (avoids copying the whole table).
  census[, c("tmp_key", "fname", "lname", "census_age") := NULL]

  if (males_only == TRUE) {
    census <- census[SEX == 1]
  }

  return(census)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.