R/hispCensus2000.R

Defines functions hispCensus2000

Documented in hispCensus2000

#' Hispanic identity imputation from last name.
#'
#' @description Hispanic origin imputation from the individual's last name
#'   using the US 2000 Census data. Names are upper-cased before being looked
#'   up in the `census00` table; no other normalisation is applied.
#' @param last.name The individual's last name(s). A character vector; other
#'   types trigger a warning and are coerced with `as.character()`.
#' @return A data frame with one row per element of `last.name`, sorted by
#'   `name.last`, with columns:
#'   \describe{
#'     \item{name.last}{the upper-cased last name used for the lookup}
#'     \item{percent.hispanic}{the `percent.hispanic` value recorded in
#'       `census00` for that name (`NA` if the name was not found)}
#'     \item{hispanic.impute}{1 if `percent.hispanic` is above 50, 0 otherwise
#'       (`NA` if the name was not found)}
#'     \item{count}{the `count` value recorded in `census00` for that name
#'       (`NA` if the name was not found)}
#'     \item{last.name.original}{the name exactly as it was supplied}
#'   }
#' @examples
#' hispCensus2000("ShemTov")
#' hispCensus2000("Shem Tov")
#' hispCensus2000(c("ShemTov","li","Londono","smith"))
#' @import dplyr
#' @export
hispCensus2000 <- function(last.name) {

  if (!is.character(last.name)) {
    warning("The last name is not a character variable and the function will return an NA")
  }

  # Keep the names as supplied, and an upper-cased copy to match census records.
  last.name.original <- as.character(last.name)
  last.name.upper <- toupper(last.name.original)

  # Position of each name in the census table (NA when there is no match).
  # NOTE(review): matching goes through `census00$name` while the reported
  # column is `name.last`; on a plain data.frame `$name` may be resolving to
  # `name.last` via partial matching -- confirm the column layout of census00.
  index.match <- match(last.name.upper, census00$name)
  matched <- !is.na(index.match)

  # Indexing with NA yields NA, so non-matched names get missing values
  # without a separate code path; the result stays in input order.
  name.last <- as.character(census00$name.last[index.match])
  name.last[!matched] <- last.name.upper[!matched]
  percent.hispanic <- census00$percent.hispanic[index.match]

  results <- data.frame(
    name.last = name.last,
    percent.hispanic = percent.hispanic,
    hispanic.impute = as.numeric(percent.hispanic > 50),
    count = census00$count[index.match],
    stringsAsFactors = FALSE
  )

  # Sort by name. The same permutation is applied to the rows and to the
  # original names: sorting the two independently (upper-case vs. original
  # case) can pair a row with the wrong original name when collation is
  # case-sensitive.
  row.order <- order(results$name.last)
  results <- results[row.order, , drop = FALSE]
  results$last.name.original <- last.name.original[row.order]

  results
}
yotamshemtov/NameSexRace documentation built on May 4, 2019, 5:33 p.m.