# Nothing
#' Predict gender from first names using historical data
#'
#' This function predicts the gender of a first name given a year or range of
#' years in which the person was born. The prediction can use one of several
#' data sets suitable for different time periods or geographical regions. See
#' the package vignette for suggestions on using this function with multiple
#' names and for a discussion of which data set is most suitable for your
#' research question. When using certain methods, the \code{genderdata} data
#' package is required; you will be prompted to install it if it is not already
#' available.
#'
#' @param names First names as a character vector. Names are case insensitive.
#' @param years The birth year of the name whose gender is to be predicted. This
#'   argument can be either a single year or a range of years in the form
#'   \code{c(1880, 1900)}; it must be numeric and must not contain \code{NA}.
#'   If no value is specified, then for the \code{"ssa"} method it will use the
#'   period 1932 to 2012; acceptable years for the SSA method range from 1880
#'   to 2012, but for years before 1930 the IPUMS method is probably more
#'   accurate. For the \code{"ipums"} method the default range is the period
#'   1789 to 1930, which is also the range of acceptable years. For the
#'   \code{"napp"} method the default range is the period 1758 to 1910, which
#'   is also the range of acceptable years. If a year or range of years is
#'   specified, then the names will be looked up for that period. Years outside
#'   the acceptable range for a method are trimmed to that range with a
#'   warning. The \code{"kantrowitz"} and \code{"genderize"} methods do not
#'   account for year, and specifying \code{years} with them is an error.
#' @param method This value determines the data set that is used to predict the
#'   gender of the name. The \code{"ssa"} method looks up names based on the
#'   U.S. Social Security Administration baby name data. (This method is based
#'   on an implementation by Cameron Blevins.) The \code{"ipums"} method looks
#'   up names from the U.S. Census data in the Integrated Public Use Microdata
#'   Series. (This method was contributed by Ben Schmidt.) The \code{"napp"}
#'   method uses census microdata from Canada, Great Britain, Denmark,
#'   Iceland, Norway, and Sweden from 1801 to 1910 created by the
#'   \href{https://www.nappdata.org/napp/}{North Atlantic Population Project}.
#'   The \code{"kantrowitz"} method uses the Kantrowitz corpus of male and
#'   female names. The \code{"genderize"} method uses the Genderize.io
#'   <\url{https://genderize.io/}> API, which is based on "user profiles across
#'   major social networks." The \code{"demo"} method uses the top 100 names
#'   in the SSA method; it is provided only for demonstration purposes when the
#'   \code{genderdata} package is not installed and it is not suitable for
#'   research purposes.
#' @param countries The countries for which datasets are being used. For the
#'   \code{"ssa"}, \code{"ipums"}, and \code{"demo"} methods, the only valid
#'   option is \code{"United States"}, which will be assumed if no argument is
#'   specified. For the \code{"napp"} method, you may specify a character
#'   vector with any of the following countries: \code{"Canada"},
#'   \code{"United Kingdom"}, \code{"Denmark"}, \code{"Iceland"},
#'   \code{"Norway"}, \code{"Sweden"}; all of them are used if no argument is
#'   specified. For the \code{"kantrowitz"} and \code{"genderize"} methods, no
#'   country should be specified.
#' @return Returns a data frame containing the results of predicting the gender.
#'   The exact components of the returned data frame will depend on the
#'   specific method used. They include the following: \item{name}{The name for
#'   which the gender has been predicted.} \item{proportion_male}{The
#'   proportion of male names for the given range of years.}
#'   \item{proportion_female}{The proportion of female names for the given
#'   range of years.} \item{gender}{The predicted gender based on the
#'   proportion of male and female names. Possible values are \code{"male"} and
#'   \code{"female"} for proportions above \code{0.5}, \code{"either"} for
#'   proportions that are exactly \code{0.5}, and \code{NA} for combinations of
#'   names and years for which a gender cannot be predicted using the given
#'   method.} \item{year_min}{The lower bound (inclusive) of the year range
#'   used for the prediction.} \item{year_max}{The upper bound (inclusive) of
#'   the year range used for the prediction.}
#' @export
#' @examples
#' gender("madison", method = "demo", years = 1985)
#' gender("madison", method = "demo", years = c(1900, 1985))
#' # SSA method
#' \dontrun{gender("madison", method = "ssa", years = c(1900, 1985))}
#' # IPUMS method
#' \dontrun{gender("madison", method = "ipums", years = 1860)}
#' # NAPP method
#' \dontrun{gender("madison", method = "napp", countries = c("Sweden", "Denmark"))}
gender <- function(names, years = c(1932, 2012),
                   method = c("ssa", "ipums", "napp", "kantrowitz",
                              "genderize", "demo"),
                   countries = c("United States", "Canada", "United Kingdom",
                                 "Denmark", "Iceland", "Norway", "Sweden")) {
  method <- match.arg(method)

  # Record which arguments the caller supplied before anything is reassigned:
  # missing() is only reliable while the argument has not been modified.
  years_missing <- missing(years)
  countries_missing <- missing(countries)

  # If we need the genderdata package, check that it is installed
  if (!method %in% c("demo", "genderize")) {
    check_genderdata_package()
  }

  # is.character() also accepts classed character vectors, for which
  # class(names) has length > 1 and cannot be used as an `if` condition.
  if (!is.character(names)) stop("Data must be a character vector.")

  if (years_missing) {
    # Method-specific default periods; other methods keep the formal default.
    years <- switch(method,
      ssa = c(1932, 2012),
      ipums = c(1789, 1930),
      napp = c(1758, 1910),
      years
    )
  } else {
    # Reject anything that cannot be compared as a year range up front, so the
    # caller gets this message instead of an obscure `if` failure.
    if (!is.numeric(years) || length(years) < 1 || length(years) > 2 ||
          anyNA(years)) {
      stop("Year should be a numeric vector with no more than two values.")
    }
    # A single year is treated as a one-year range.
    if (length(years) == 1) years <- c(years, years)
    if (years[1] > years[2]) {
      stop("The first value for years should be smaller than the second value.")
    }
  }

  # Clamp a year range to [lower, upper], warning when anything was cut off.
  trim_years <- function(range, lower, upper) {
    if (range[1] < lower || range[2] > upper) {
      warning("The year range provided has been trimmed to fit within ",
              lower, " to ", upper, ".", call. = FALSE)
      if (range[1] < lower) range[1] <- lower
      if (range[2] > upper) range[2] <- upper
    }
    range
  }

  # Stop unless the caller left `countries` unset or named only the United
  # States. %in% handles multi-element and NA input without a length error.
  require_united_states <- function(label) {
    if (!countries_missing && !all(countries %in% "United States")) {
      stop(label, " data is only available for the United States of America.",
           call. = FALSE)
    }
  }

  # Hand off the arguments to functions based on method, and do error checking
  switch(method,
    ssa = {
      years <- trim_years(years, 1880, 2012)
      require_united_states("SSA")
      gender_ssa(names = names, years = years)
    },
    demo = {
      years <- trim_years(years, 1880, 2012)
      require_united_states("Demo")
      gender_demo(names = names, years = years)
    },
    kantrowitz = {
      if (!years_missing) {
        stop("Kantrowitz method does not account for year.")
      }
      if (!countries_missing) {
        stop("Kantrowitz method does not account for country.")
      }
      gender_kantrowitz(names = names)
    },
    ipums = {
      years <- trim_years(years, 1789, 1930)
      require_united_states("IPUMS")
      gender_ipums_usa(names = names, years = years)
    },
    napp = {
      years <- trim_years(years, 1758, 1910)
      # With no explicit countries, use every NAPP country, i.e. the formal
      # default minus the United States.
      if (countries_missing) {
        countries <- countries[countries != "United States"]
      }
      # match.arg() must be called from this frame: it looks up the valid
      # choices in the formals of the calling function.
      countries <- match.arg(countries, several.ok = TRUE)
      if ("United States" %in% countries) {
        stop("NAPP data is only available for European countries. See ",
             "the documentation.")
      }
      gender_napp(names = names, years = years, countries = countries)
    },
    genderize = {
      if (!years_missing) {
        stop("Genderize method does not account for year.")
      }
      if (!countries_missing) {
        stop("Genderize method does not account for country.")
      }
      gender_genderize(names = names)
    }
  )
}
# Hide variables from R CMD check
#
# These names are used as bare column or data set names elsewhere in the
# package; registering them stops R CMD check from reporting "no visible
# binding for global variable". utils::globalVariables() only exists from
# R 2.15.1 onwards, hence the version guard. It is called directly rather than
# through a pipe so that this top-level code does not need magrittr attached
# when the file is evaluated.
if (getRversion() >= "2.15.1") {
  utils::globalVariables(
    c("year", "male", "female", "proportion_female", "proportion_male",
      "ssa_national", "kantrowitz", ".", "ipums_usa", "ratio_male",
      "ratio_female", "name", "year_min", "year_max", "country", "napp")
  )
}
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.