#' Imputing sex from first names
#' @return The function returns a data frame with 4 columns:
#' \itemize{
#' \item cohort
#' \item min.cohort
#' \item max.cohort
#' \item name.first - the first name that was inserted to the function.
#' \item percent.female - the percent of females with this first name.
#' \item frequency - the count of times the name appeared in this cohort year.
#' }
#' @examples
#' sexSSA("Jon",cohort=2000)
#' sexSSA(c("Jon","Angel","Alesandra","Chris"),cohort=2000)
#' sexSSA("Jon",min.cohort=2000,max.cohort=2002)
#' sexSSA(c("Jon","Angel","Alesandra","Chris"),min.cohort=2000,max.cohort=2002)
#' @description Sexual identity imputation from first name using Social Secuarity Administration (SSA) data.
#' @param first.name The individual first name \cr \cr
#' @param cohort Year of birth of the individual \cr \cr
#' @param max.cohort The latest cohort year to include in the sex imputation.
#' @param min.cohort The earliest cohort year to include in the sex imputation.
# @param additional
# \itemize{
# \item sdsf
# \item sdsd
#}
#'
#' @import dplyr
#' @export
# I need to corect the function...
sexSSA = function(first.name,cohort=NULL, min.cohort=NULL, max.cohort=NULL){
# The cohort year is specified:
cohort.year = cohort
if (is.null(cohort.year) & ( is.null(min.cohort) | is.null(max.cohort) ) ){
stop("cohort is not specified and is required")
}
# A specific cohort year is specified
if (is.null(cohort.year)==FALSE){
if (cohort.year>2015 | cohort.year<1880){
stop("cohort year must be between 1880 to 2015")
}
# restrict by cohort year:
ssa.1880.2015 <- ssa.1880.2015 %>% filter(cohort==cohort.year)
}
# A range of cohort years is specified
if (is.null(cohort)==TRUE){
if (is.null(min.cohort) | is.null(max.cohort)){
stop("when 'cohort' is set to NULL both 'min.cohort' and 'max.cohort' need to be specified to years between 1880 to 2015")
}
if (max.cohort>2015 | min.cohort<1880 | max.cohort<min.cohort) {
stop("cohort years must be between 1880 to 2015 and 'max.cohort' needs to be strictly grater than 'min.cohort'")
}
# restrict by cohort years:
ssa.1880.2015 <- ssa.1880.2015 %>%
filter(cohort>=min.cohort & cohort<=max.cohort) %>%
mutate(
female = frequency*percent.female
) %>%
select(-percent.female) %>%
group_by(name.first) %>%
summarise(
female = sum(female),
total.freq = sum(frequency),
percent.female = 100 * (female/total.freq)
) %>%
select( -female ) %>%
ungroup() %>%
rename(frequency = total.freq) %>%
data.frame()
}
# Move to upper case letters to match census records:
first.name = tolower(first.name)
substring(first.name, 1, 1) = toupper(substring(first.name, 1, 1))
# Match to census names
index.match = match(first.name,ssa.1880.2015$name.first)
# generate missing values for the non-matched names
if (sum(is.na(index.match))>0){
results.na = data.frame(name.first=first.name[which(is.na(index.match))],
frequency =NA,
percent.female=NA,
cohort=ifelse(is.null(cohort.year),NA,cohort.year),
max.cohort=ifelse(is.null(max.cohort),NA,max.cohort),
min.cohort=ifelse(is.null(min.cohort),NA,min.cohort)
)
}
# find the sex for the matched names
results <- ssa.1880.2015 %>%
slice(index.match) %>%
mutate(
cohort=ifelse(is.null(cohort.year),NA,cohort.year),
max.cohort=ifelse(is.null(max.cohort),NA,max.cohort),
min.cohort=ifelse(is.null(min.cohort),NA,min.cohort)
)
# Adding the non-matched names to the results data frame
if (sum(is.na(index.match))>0){
results = rbind(results,results.na)
}
results = results[order(results$name.first),]
return(results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.