R/sexSSA.R

Defines functions sexSSA

Documented in sexSSA

#' Imputing sex from first names 
#' @return The function returns a data frame with 4 columns: 
#' \itemize{
#'    \item cohort
#'    \item min.cohort
#'    \item max.cohort
#'    \item name.first - the first name that was inserted to the function.
#'    \item percent.female - the percent of females with this first name.
#'    \item frequency - the count of times the name appeared in this cohort year.   
#' }
#' @examples
#' sexSSA("Jon",cohort=2000)
#' sexSSA(c("Jon","Angel","Alesandra","Chris"),cohort=2000)
#' sexSSA("Jon",min.cohort=2000,max.cohort=2002)
#' sexSSA(c("Jon","Angel","Alesandra","Chris"),min.cohort=2000,max.cohort=2002)
#' @description Sexual identity imputation from first name using Social Secuarity Administration (SSA) data.
#' @param first.name  The individual first name \cr \cr
#' @param cohort  Year of birth of the individual \cr \cr
#' @param max.cohort The latest cohort year to include in the sex imputation.
#' @param min.cohort The earliest cohort year to include in the sex imputation.
# @param additional 
# \itemize{
#    \item sdsf
#    \item sdsd
#}    
#' 
#' @import dplyr
#' @export 

# I need to corect the function...
sexSSA = function(first.name,cohort=NULL, min.cohort=NULL, max.cohort=NULL){  
  
  # The cohort year is specified:
  cohort.year = cohort
  
  if (is.null(cohort.year) & ( is.null(min.cohort) | is.null(max.cohort) ) ){
    stop("cohort is not specified and is required")
  }
  
  # A specific cohort year is specified
  if (is.null(cohort.year)==FALSE){
    
    if (cohort.year>2015 | cohort.year<1880){
      stop("cohort year must be between 1880 to 2015")
    }
    
    # restrict by cohort year:
    ssa.1880.2015 <- ssa.1880.2015 %>% filter(cohort==cohort.year)
  }
  
  # A range of cohort years is specified
  if (is.null(cohort)==TRUE){
    
    if (is.null(min.cohort) | is.null(max.cohort)){
      stop("when 'cohort' is set to NULL both 'min.cohort' and 'max.cohort' need to be specified to years between 1880 to 2015")
    }
    
    if (max.cohort>2015 | min.cohort<1880 | max.cohort<min.cohort) {
      stop("cohort years must be between 1880 to 2015 and 'max.cohort' needs to be strictly grater than 'min.cohort'")
    }
    
    # restrict by cohort years:
    ssa.1880.2015 <- ssa.1880.2015 %>% 
      filter(cohort>=min.cohort & cohort<=max.cohort) %>%
      mutate(
        female = frequency*percent.female
        ) %>%
      select(-percent.female) %>%
      group_by(name.first) %>%
      summarise(
        female = sum(female),
        total.freq = sum(frequency),
        percent.female = 100 * (female/total.freq)
      ) %>%
      select( -female ) %>%
      ungroup() %>%
      rename(frequency = total.freq) %>%
      data.frame()
  }
  
  # Move to upper case letters to match census records:
  first.name = tolower(first.name)
  substring(first.name, 1, 1) = toupper(substring(first.name, 1, 1))
  
  # Match to census names
  index.match = match(first.name,ssa.1880.2015$name.first)
  
  # generate missing values for the non-matched names
  if (sum(is.na(index.match))>0){
    results.na = data.frame(name.first=first.name[which(is.na(index.match))],
                            frequency =NA,
                            percent.female=NA,
                            cohort=ifelse(is.null(cohort.year),NA,cohort.year),
                            max.cohort=ifelse(is.null(max.cohort),NA,max.cohort),
                            min.cohort=ifelse(is.null(min.cohort),NA,min.cohort)
    )
  }
    
  # find the sex for the matched names
  results <- ssa.1880.2015 %>%
    slice(index.match) %>%
    mutate(
      cohort=ifelse(is.null(cohort.year),NA,cohort.year),
      max.cohort=ifelse(is.null(max.cohort),NA,max.cohort),
      min.cohort=ifelse(is.null(min.cohort),NA,min.cohort)
      )
    
  
  # Adding the non-matched names to the results data frame
  if (sum(is.na(index.match))>0){
    results = rbind(results,results.na)
  }
  results = results[order(results$name.first),]
  
  return(results)
}
yotamshemtov/NameSexRace documentation built on May 4, 2019, 5:33 p.m.