#' Function to enrich a filtered corpus with Twitter users' most likely age and gender, based on baby names statistics
#'@description This function crossreferences the 'name' field in the corpus files with a large database of baby names statistics, drawn from two sources: United States Social Security (included in the R package 'babynames' by Hadley Wickham) and the Spanish Instituto Nacional de Estadisticas (INE). The function implements a cascade system, attempting first to find exact matches, after which it results to approximate string matching using Levenhstein distance.
#' @param filteredCorpus filtered corpus. Do not use on unfiltered data if you want to get results in this century.
#' @param maxDistance maximum Levenhstein distance to use for approximate string matching. Defaults to 2
#' @param nthreads number of threads to use in the C++ code for approximate string matching. Defaults to the number of CPU cores on your machine and it's probably a good idea to use that default.
#' @return a data.frame with the two added columns: gender (column 'sex') and most likely year of birth (column 'year')
#' @export
#' @import Rcpp
#' @import RcppProgress
#' @import readr
#' @import dplyr
#' @import parallel
#' @import stringi
addAgeGender<-function(filtered_corpus, language=c("English", "Spanish"), maxDistance=1, nthreads=parallel::detectCores()) {
language<-match.arg(language)
options(stringsAsFactors = F)
database<-switch(language, English=english_baby_names, Spanish=spanish_baby_names)
filtered_corpus <- filtered_corpus %>%
mutate(name=sapply(name, FUN=function(x) stri_split_boundaries(stri_trim(x))[[1]][1])) %>%
mutate(name=stri_replace_all_regex(name, "[^[:alpha:]]", ""))
exactMatches<-filtered_corpus %>%
left_join(database, by="name")
toBeCompleted<- exactMatches %>%
filter(is.na(year)) %>%
select(-year, -sex)
key<-data.frame(name=unique(toBeCompleted$name), output=fuzzyMatch(unique(toBeCompleted$name), database$name, maxDistance = maxDistance, nthreads = nthreads))
toBeCompleted<-toBeCompleted %>%
left_join(key, by="name") %>%
mutate(output=ifelse(output=="", name, output)) %>%
select(-name) %>%
rename(name=output) %>%
filter(name!="") %>%
left_join(database)
exactMatches<-exactMatches %>%
filter(!is.na(year))
done<- dplyr::bind_rows(toBeCompleted, exactMatches) %>%
addTokenId() %>%
filter(!duplicated(tokenId)) %>%
select(-tokenId)
return(done)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.