#' Match names to the Getty ULAN
#'
#' Queries the Getty ULAN to find matching entries for a given string. You
#' may filter the results by specifying an early or late date.
#'
#' @param names A character vector of names to match to a canonical ULAN id.
#' @param early_year Match only artists who died after this year. Like
#' \code{late_year}, this argument should be a numeric vector of length 1, or
#' of the same length as \code{names}. If length 1, the same date restrictions
#' will be used to match every value of \code{names}. Otherwise, each name
#' match can be restricted to its own pair of early_year and late_year. If no
#' \code{early_year} or \code{late_year} are specified, then artists from all
#' time periods will be eligible for matching. Any NA values in
#' \code{early_year} or \code{late_year} will be coerced to default maxima and
#' minima.
#' @param late_year Match only artists who were born before this year.
#' @param strictly_between Method for filtering search results using the
#' early_year/late_year parameters. TRUE will only include artists whose life
#' dates fall within the range [late_year, early_year]. FALSE (the default)
#' will include artists whose life dates intersect with [early_year,
#' late_year]
#' @param method This value determines which method will be used to match the
#' name to a canonical ULAN id. \code{sparql} will query the Getty's live
#' endpoint, relying on its Lucene index for finding close matches, while
#' \code{local} instead uses string cosine similarity based on a local table
#' of ULAN entries.
#' @param max_results The maximum number of results to return. Defaults to 5.
#' Depending on the query, the actual number of results returned may be lower.
#' When \code{method = "sparql"} is used, values over 50 will be ignored.
#' @param cutoff_score The minimum similarity score that must be returned by the
#' chosen method for a candidate to be included in results. \code{NULL} will
#' use default values for each method: 0.95 for the \code{local} method, and 3
#' for the \code{sparql} method.
#'
#' @note \code{cutoff_score} will be ingored for
#'
#' @return A named list of data.frames, one per submitted name, with 7 columns
#' and no more than \code{max_results} rows:
#' \describe{
#' \item{\code{id}}{integer. ULAN id}
#' \item{\code{pref_name}}{character. ULAN preferred name}
#' \item{\code{birth_year}}{integer. Artist birth year, if assigned.}
#' \item{\code{death_year}}{integer. Artist death year, if assigned}
#' \item{\code{gender}}{character. Artist gender, if assigned.}
#' \item{\code{nationality}}{character. Artist nationality, if assigned.}
#' \item{\code{score}}{numeric. The score of the result. When \code{method =
#' "sparql"}, this is the Lucene index score. When \code{method = "local"}, it
#' will instead be a cosine similarity score. Results with a score below
#' \code{cutoff_score} are dropped.}
#' }
#'
#' Unmatched names will return a data.frame with NAs for all values save
#' \code{name}.
#'
#' @note \code{method = "sparql"} requires an internet connection.
#'
#' @export
#' @examples
#' \dontrun{ulan_id("Rembrandt", early_year = 1600,
#' late_year = 1700, method = "sparql")}
#' \dontrun{ulan_id(c("Rembrandt", "Rothko"), early_year = c(1600, 1900),
#' late_year = c(1700, 2000), method = "sparql")}
ulan_match <- function(names, early_year = -9999, late_year = 2090, strictly_between = FALSE, method = c("sparql", "local"), max_results = 5, cutoff_score = NULL) {
method <- match.arg(method)
# Check names validity
stopifnot(is.character(names))
stopifnot(all(!is.na(names)))
stopifnot(all(!names == ""))
# early_year and late_year must be numeric
stopifnot(all(is.numeric(early_year), is.numeric(late_year)))
# Check if early_year and late_year are compatible
stopifnot(length(early_year) == length(late_year))
stopifnot(length(early_year) == 1 | length(early_year) == length(names))
# Check strictly_between validity
stopifnot(is.logical(strictly_between))
# Check that ulanrdata is installed
if(method == "local")
check_ulanrdata_package()
# Set cutoff_score if not specified
if(is.null(cutoff_score)) {
cutoff_score <- switch(method,
"local" = 0.95,
"sparql" = 3)
}
# Replace any NA values in early_year and late_year with default time range
if(any(is.na(early_year))) {
warning("NAs in early_year have been coerced to -9999")
early_year[is.na(early_year)] <- -9999
}
if(any(is.na(late_year))) {
warning("NAs in late_year have been coerced to 2090")
late_year[is.na(late_year)] <- 2090
}
# Dispatch name to query handler based on selected method
ulan_dispatcher <- switch(method,
"local" = ulan_stringdist_match_handler,
"sparql" = ulan_sparql_match_handler)
# For long queries or if explicitly set, create and increment txtProgressBar
if(all(interactive(), length(names) > 5)) {
pb <- txtProgressBar(min = 0, max = length(names), style = 3)
ids <- mapply(function(a, b, c) {
setTxtProgressBar(pb, (getTxtProgressBar(pb) + 1))
ulan_dispatcher(a, b, c, strictly_between, max_results, cutoff_score)},
names, early_year, late_year, SIMPLIFY = FALSE, USE.NAMES = TRUE)
close(pb)
} else {
ids <- mapply(function(a, b, c) ulan_dispatcher(a, b, c, strictly_between, max_results, cutoff_score),
names, early_year, late_year, SIMPLIFY = FALSE, USE.NAMES = TRUE)
}
return(ids)
}
#' @describeIn ulan_match Return a vector of matching ULAN IDs. This function is a wrapper around \link{ulan_match} that simply returns a
#' vector of the top match ID for each name given (as if \code{max_results} were
#' 1). \code{max_results} will be ignored.
#' @export
ulan_id <- function(names, early_year = -9999, late_year = 2090, strictly_between = FALSE, method = c("sparql", "local"), max_results = 1) {
matches <- ulan_match(names, early_year, late_year, strictly_between, method, max_results = 1, cutoff_score = NULL)
dplyr::bind_rows(matches)$id
}
#' @describeIn ulan_match Return a dataframe of matching ULAN IDs with attributes, with an additional column \code{names} containing the original input vector. This function is a wrapper around \link{ulan_match}.
#' @export
ulan_data <- function(names, early_year = -9999, late_year = 2090, strictly_between = FALSE, method = c("sparql", "local"), max_results = 1) {
matches <- ulan_match(names, early_year, late_year, strictly_between, method, max_results = 1, cutoff_score = NULL)
dplyr::bind_rows(matches, .id = "names")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.