R/tw_filter.R

Defines functions tw_filter_people tw_filter_first tw_filter

Documented in tw_filter tw_filter_first tw_filter_people

#' Filter search result and keep only items with matching property and Q identifier
#'
#' The connection to the cache is released when the function exits, including
#' when one of the underlying queries fails with an error.
#'
#' @param search A data frame generated by `tw_search()`, or a search query. If a data frame is given, language and limits are ignored.
#' @param p A character vector of length 1, a property. Must always start with the capital letter "P", e.g. "P31" for "instance of".
#' @param q A character vector of length 1, a wikidata id. Must always start with the capital letter "Q", e.g. "Q5" for "human being".
#' @param language Language to be used for the search. Can be set once per session with `tw_set_language()`. If not set, defaults to "en". For a full list, see https://www.wikidata.org/wiki/Help:Wikimedia_language_codes/lists/all
#' @param limit Maximum numbers of responses to be given.
#' @param include_search Logical, defaults to FALSE. If TRUE, the search is returned as an additional column.
#' @param wait In seconds, defaults to 0. Time to wait between queries to Wikidata. If data are cached locally, wait time is not applied. If you are running many queries systematically you may want to add some waiting time between queries.
#' @param cache Defaults to NULL. If given, it should be given either TRUE or FALSE. Typically set with `tw_enable_cache()` or `tw_disable_cache()`.
#' @param overwrite_cache Defaults to FALSE. If TRUE, overwrites cache.
#' @param cache_connection Defaults to NULL. If NULL, and caching is enabled, `tidywikidatar` will use a local sqlite database. A custom connection to other databases can be given (see vignette `caching` for details).
#' @param disconnect_db Defaults to TRUE. If FALSE, leaves the connection to cache open.
#'
#' @return A data frame with three columns, `id`, `label`, and `description`, filtered by the above criteria.
#' @export
#'
#' @examples
#' tw_search(search = "Margaret Mead", limit = 3) %>%
#'   tw_filter(p = "P31", q = "Q5")
tw_filter <- function(search,
                      p,
                      q,
                      language = tidywikidatar::tw_get_language(),
                      limit = 10,
                      include_search = FALSE,
                      wait = 0,
                      cache = NULL,
                      overwrite_cache = FALSE,
                      cache_connection = NULL,
                      disconnect_db = TRUE) {
  db <- tw_connect_to_cache(
    connection = cache_connection,
    language = language,
    cache = cache
  )

  # Register the cleanup once, so that every exit path (early return, normal
  # return, or an error raised by one of the queries below) releases the
  # connection according to `disconnect_db`.
  on.exit(
    expr = tw_disconnect_from_cache(
      cache = cache,
      cache_connection = db,
      disconnect_db = disconnect_db,
      language = language
    ),
    add = TRUE
  )

  search_result <- tw_check_search(
    search = search,
    language = language,
    limit = limit,
    include_search = include_search,
    wait = wait,
    cache = cache,
    overwrite_cache = overwrite_cache,
    cache_connection = db,
    disconnect_db = FALSE
  )

  # Nothing to filter: hand back the empty search result as is.
  if (nrow(search_result) == 0) {
    return(search_result)
  }

  # Property values of all search results, reduced to those matching `q`.
  p_df <- tw_get_property(
    id = search_result$id,
    p = p,
    language = language,
    cache = cache,
    overwrite_cache = overwrite_cache,
    cache_connection = db,
    disconnect_db = FALSE,
    wait = wait
  ) %>%
    dplyr::filter(.data$value %in% q)

  # Keep only the search results that have at least one matching value.
  search_result %>%
    dplyr::semi_join(
      y = p_df,
      by = "id"
    )
}

#' Filter search result and keep only the first match
#'
#' Same as `tw_filter()`, but returns at most one row: the first search result
#' that satisfies the criteria. Search results are checked one at a time, in
#' order, and checking stops as soon as a match is found.
#'
#' @param search A data frame generated by `tw_search()`, or a search query. If a data frame is given, language and limits are ignored.
#' @param p A character vector of length 1, a property. Must always start with the capital letter "P", e.g. "P31" for "instance of".
#' @param q A character vector of length 1, a wikidata id. Must always start with the capital letter "Q", e.g. "Q5" for "human being".
#' @param language Language to be used for the search.
#' @param limit Maximum numbers of responses to be given.
#' @param include_search Logical, defaults to FALSE. If TRUE, the search is returned as an additional column.
#' @param cache Defaults to NULL. If given, it should be given either TRUE or FALSE. Typically set with `tw_enable_cache()` or `tw_disable_cache()`
#' @param wait In seconds, defaults to 0. Time to wait between queries to Wikidata. If data are cached locally, wait time is not applied. If you are running many queries systematically you may want to add some waiting time between queries.
#' @param overwrite_cache Defaults to FALSE. If TRUE, overwrites cache.
#' @param cache_connection Defaults to NULL. If NULL, and caching is enabled, `tidywikidatar` will use a local sqlite database. A custom connection to other databases can be given (see vignette `caching` for details).
#' @param disconnect_db Defaults to TRUE. If FALSE, leaves the connection to cache open.
#'
#' @return A data frame with three columns, `id`, `label`, and `description`, and a single row with the first result matching the above criteria. If the search has no results, or none of them matches, a data frame with zero rows is returned.
#' @export
#'
#' @examples
#' tw_search("Margaret Mead") %>%
#'   tw_filter_first(p = "P31", q = "Q5")
tw_filter_first <- function(search,
                            p,
                            q,
                            language = tidywikidatar::tw_get_language(),
                            limit = 10,
                            include_search = FALSE,
                            wait = 0,
                            cache = NULL,
                            overwrite_cache = FALSE,
                            cache_connection = NULL,
                            disconnect_db = TRUE) {
  # Connect unconditionally, as `tw_filter()` does, so that `db` is always
  # defined before it is passed on to the helpers below.
  db <- tw_connect_to_cache(
    connection = cache_connection,
    language = language,
    cache = cache
  )

  # Release the connection on every exit path: early return on empty search,
  # normal return, and errors raised by any of the queries.
  on.exit(
    expr = tw_disconnect_from_cache(
      cache = cache,
      cache_connection = db,
      disconnect_db = disconnect_db,
      language = language
    ),
    add = TRUE
  )

  search_result <- tw_check_search(
    search = search,
    language = language,
    limit = limit,
    include_search = include_search,
    wait = wait,
    cache = cache,
    overwrite_cache = overwrite_cache,
    cache_connection = db,
    disconnect_db = FALSE
  )

  if (nrow(search_result) == 0) {
    return(search_result)
  }

  # Row number of the first search result that passes the filter, or NULL if
  # none does. `purrr::detect()` stops at the first match, so later results
  # are not queried.
  first_match_id <- purrr::detect(
    .x = seq_along(search_result$id),
    .f = function(current_row_number) {
      current_match <- search_result %>%
        dplyr::slice(current_row_number) %>%
        tw_filter(
          p = p,
          q = q,
          language = language,
          limit = limit,
          include_search = include_search,
          wait = wait,
          cache = cache,
          overwrite_cache = overwrite_cache,
          # Reuse the connection opened above and keep it open between rows;
          # it is closed once, when this function exits.
          cache_connection = db,
          disconnect_db = FALSE
        )

      nrow(current_match) > 0
    }
  )

  if (is.null(first_match_id)) {
    # No match: return an empty data frame with the same columns.
    search_result %>%
      dplyr::slice(0)
  } else {
    search_result %>%
      dplyr::slice(first_match_id)
  }
}

#' Filter search result and keep only people
#'
#' A wrapper of `tw_filter()` and `tw_filter_first()` that keeps only items that are "instance of" (P31) "human being" (Q5).
#'
#' @param search A data frame generated by `tw_search()`, or a search query. If a data frame is given, language and limits are ignored.
#' @param language Language to be used for the search.
#' @param limit Maximum numbers of responses to be given.
#' @param include_search Logical, defaults to FALSE. If TRUE, the search is returned as an additional column.
#' @param stop_at_first Logical, defaults to TRUE. If TRUE, returns only the first match from the search that satisfies the criteria (via `tw_filter_first()`); otherwise all matches are returned (via `tw_filter()`).
#' @param wait In seconds, defaults to 0. Time to wait between queries to Wikidata. If data are cached locally, wait time is not applied. If you are running many queries systematically you may want to add some waiting time between queries.
#' @param overwrite_cache Defaults to FALSE. If TRUE, overwrites cache.
#' @param cache_connection Defaults to NULL. If NULL, and caching is enabled, `tidywikidatar` will use a local sqlite database. A custom connection to other databases can be given (see vignette `caching` for details).
#' @param disconnect_db Defaults to TRUE. If FALSE, leaves the connection to cache open.
#' @param cache Defaults to NULL. If given, it should be given either TRUE or FALSE. Typically set with `tw_enable_cache()` or `tw_disable_cache()`. Passed on unchanged to `tw_filter()` or `tw_filter_first()`.
#'
#' @return A data frame with three columns, `id`, `label`, and `description`; all rows refer to a human being.
#' @export
#'
#' @examples
#' tw_search("Ruth Benedict")
#'
#' tw_search("Ruth Benedict") %>%
#'   tw_filter_people()
tw_filter_people <- function(search,
                             language = tidywikidatar::tw_get_language(),
                             limit = 10,
                             include_search = FALSE,
                             stop_at_first = TRUE,
                             wait = 0,
                             overwrite_cache = FALSE,
                             cache_connection = NULL,
                             disconnect_db = TRUE,
                             cache = NULL) {
  # Both filters share the same arguments, so pick the function first and
  # issue a single call. `cache` is appended last in the signature to keep
  # existing positional calls working.
  filter_function <- if (stop_at_first == TRUE) {
    tidywikidatar::tw_filter_first
  } else {
    tidywikidatar::tw_filter
  }

  filter_function(
    search = search,
    p = "P31",
    q = "Q5",
    language = language,
    limit = limit,
    include_search = include_search,
    wait = wait,
    cache = cache,
    overwrite_cache = overwrite_cache,
    cache_connection = cache_connection,
    disconnect_db = disconnect_db
  )
}
giocomai/tidywikidatar documentation built on Aug. 2, 2024, 5:33 p.m.