R/googler.R

Defines functions googler googler_ python_path is_unix win_py_version sys_which check_count check_query `%|||%` googler_path

Documented in googler googler_

#' googler
#'
#' Searches Google and returns information about the results.
#'
#' This is a thin wrapper: every argument is forwarded by name to the internal
#' function `googler_()`, whose output is parsed as JSON and converted to a
#' tibble.
#'
#' @param query Search query.
#' @param start Start at the Nth result.
#' @param count Show N results; defaults to 10.
#' @param news Set to `TRUE` to show results from the news section.
#' @param tld Country-specific search with top-level domain TLD, e.g., 'in' for
#'   India.
#' @param lang Search for the language, e.g., 'fi' for Finnish.
#' @param exact Set to `TRUE` to disable automatic spelling correction and
#'   search exact keywords.
#' @param time Time limit search, e.g., 'h5' (5 hrs), 'd5' (5 days), 'w5' (5
#'   weeks), 'm5' (5 months), 'y5' (5 years)
#' @param site Search a site using Google.
#' @param unfilter Set to `TRUE` to not omit similar results.
#' @return A tibble data frame where each row represents one search result and
#'   columns contain information about each result, i.e., abstract, phrase
#'   matching, title, and URL.
#' @examples
#' \dontrun{
#' ## search google within past month
#' googler("rstats", time = "m1")
#'
#' ## search google news
#' googler_news("rstats")
#' }
#' @export
googler <- function(query,
                    start = NULL,
                    count = NULL,
                    news = NULL,
                    tld = NULL,
                    lang = NULL,
                    exact = NULL,
                    time = NULL,
                    site = NULL,
                    unfilter = NULL) {
  ## as.list(environment()) collects every formal argument by name (no local
  ## variables exist yet), so do.call() forwards them all to googler_();
  ## the returned text is parsed as JSON and converted to a tibble.
  ## NOTE(review): try_tbl() is defined outside this file -- presumably it
  ## guards the conversion against errors; confirm against its definition.
  try_tbl(tibble::as_tibble(jsonlite::fromJSON(
    do.call("googler_", as.list(environment()))
  )))
}

#' googler_
#'
#' googler_(): Internal version of googler that returns stdout
#'
#' Builds the command line for the googler script, runs it with the python
#' interpreter and returns whatever the script printed. `--json` is always
#' passed, so the output is JSON text.
#'
#' @inheritParams googler
#' @return Character vector with the lines written to standard output by the
#'   googler script.
#' @keywords internal
googler_ <- function(query,
                     start = NULL,
                     count = NULL,
                     news = NULL,
                     tld = NULL,
                     lang = NULL,
                     exact = NULL,
                     time = NULL,
                     site = NULL,
                     unfilter = NULL) {

  ## check query first so invalid input fails before any lookup/download
  check_query(query)

  ## issue automation warning if count > 100
  check_count(count > 100)

  ## path to python3
  cmd <- py_path()

  ## path to googler
  ggl <- googler_path()

  ## set/reset encoding
  op <- getOption("encoding")
  on.exit(options(encoding = op), add = TRUE)
  options(encoding = "UTF-8")

  ## "--flag value" option: omitted when the value is NULL. The value is
  ## shell-quoted because system2() pastes all args into one command line, so
  ## unquoted spaces or metacharacters would split or alter the command.
  valued <- function(flag, value) {
    if (is.null(value)) {
      return(NULL)
    }
    paste(flag, shQuote(as.character(value)))
  }

  ## boolean switch: omitted when the value is NULL or FALSE (previously
  ## `news = FALSE` still enabled the switch)
  switched <- function(flag, value) {
    if (is.null(value) || identical(value, FALSE)) {
      return(NULL)
    }
    flag
  }

  ## compile args; the script path is expanded and quoted so that paths
  ## containing "~" or spaces survive the trip through the shell
  args <- c(
    shQuote(path.expand(ggl)),
    shQuote(query),
    valued("--start", start),
    valued("--count", count),
    switched("--news", news),
    valued("--tld", tld),
    valued("--lang", lang),
    switched("--exact", exact),
    valued("--time", time),
    valued("--site", site),
    switched("--unfilter", unfilter),
    "-C", "--json"
  )

  ## execute command and return stdout
  system2(cmd, args = args, stdout = TRUE)
}

## Find a python executable, preferring one whose path looks like python 3.
## Falls back to any path containing "python"; errors when nothing is found.
python_path <- function() {
  suffixes <- c("", "4", "3.9", "3.8", "3.7", "3.6", "3.5", "3.4", "3")
  found <- sys_which(paste0("python", suffixes))

  ## does any located executable look like a python 3 interpreter?
  has_py3 <- any(grepl(
    "python[[:punct:] ]{0,}3|3[[:punct:] ]{0,}python",
    found,
    ignore.case = TRUE
  ))

  ## choose the pattern used to pick the first matching candidate
  pattern <- if (has_py3) {
    "3[[:punct:] ]{0,}python|python[[:punct:] ]{0,}3"
  } else {
    "python"
  }
  path <- grep(pattern, found, value = TRUE, ignore.case = TRUE)[1]

  if (is.na(path) || !nzchar(path)) {
    stop("'googler' requires python3, which does not appear to be installed.")
  }
  path
}

## TRUE when R reports a unix-type platform.
is_unix <- function() {
  os_type <- .Platform$OS.type
  grepl("unix", os_type, ignore.case = TRUE)
}

## Run `python --version` and return the captured output.
win_py_version <- function() {
  version_cmd <- "python --version"
  system(version_cmd, intern = TRUE, invisible = TRUE)
}

## Locate executables by name using the system's `which` (unix) or `where`
## (otherwise) command.
##
## names: character vector of program names; NA entries are dropped.
## Returns a character vector named by `names`; each element is the first
## path reported by the lookup command, or "" when nothing was reported or the
## reported path does not exist.
sys_which <- function(names) {
  names <- names[!is.na(names)]
  res <- character(length(names))
  names(res) <- names

  ## nothing to look up
  if (length(res) == 0L) {
    return(res)
  }

  if (is_unix()) {
    lookup <- "/usr/bin/which"
    ## not every system ships `which` at this location; use the one on PATH
    if (!file.exists(lookup)) {
      lookup <- "which"
    }
  } else {
    lookup <- "where"
  }

  on_solaris <- grepl("solaris", R.version$os)

  for (i in seq_along(names)) {
    ans <- suppressWarnings(
      tryCatch(
        system(paste(lookup, shQuote(names[i])),
          intern = TRUE, ignore.stderr = TRUE),
        error = function(e) ""
      )
    )
    if (on_solaris) {
      ## Solaris `which` prints "no <name> in <path>" instead of failing;
      ## compare against the program name (the original compared against the
      ## numeric loop index, which could never match)
      tmp <- strsplit(ans[1], " ", fixed = TRUE)[[1]]
      if (identical(tmp[1:3], c("no", unname(names[i]), "in"))) {
        ans <- ""
      }
    }
    hit <- if (length(ans) > 0) ans[1] else ""
    ## discard missing results and anything that is not an existing file
    if (is.na(hit) || !file.exists(hit)) {
      hit <- ""
    }
    res[i] <- hit
  }
  res
}

## Warn about bulk scraping when `x` is TRUE; zero-length input (count not
## supplied) is ignored. Always returns NULL invisibly.
check_count <- function(x) {
  if (length(x) > 0) {
    if (x) {
      warning(
        "Automated bulk web scraping is not the intended use of {googler}",
        immediate. = TRUE
      )
    }
  }
  invisible()
}

## Validate the search query: it must be a single, non-missing, non-empty
## character string. Errors (via stopifnot) on the first failed condition.
check_query <- function(query) {
  stopifnot(
    length(query) == 1,
    is.character(query),
    ## nzchar(NA) is TRUE, so a missing query must be rejected explicitly;
    ## otherwise it would be searched as the literal string "NA"
    !is.na(query),
    nzchar(query)
  )
}

## Return `b` when `a` is not NULL; otherwise return `a` (i.e. NULL) without
## evaluating `b`.
`%|||%` <- function(a, b) {
  if (is.null(a)) {
    return(a)
  }
  b
}

## Resolve the path to the googler script.
##
## Lookup order: the `googler.path` option, then a `googler` executable found
## by sys_which(), then a fresh download into tempdir(). A path found or
## downloaded here is stored in the `googler.path` option for later calls.
## Errors when the script has to be downloaded and the download fails.
googler_path <- function() {
  ## if googler.path in options
  if (!is.null(path <- getOption("googler.path"))) {
    return(path)
  }
  ## if googler is installed
  if (grepl("googler", path <- sys_which("googler"))) {
    options(googler.path = path)
    return(path)
  }

  ## otherwise download and store script
  path <- file.path(tempdir(), "googler")
  if (file.exists(path)) {
    unlink(path)
  }
  status <- tryCatch(
    utils::download.file(
      "https://raw.githubusercontent.com/jarun/googler/v3.9/googler",
      path,
      quiet = TRUE
    ),
    error = function(e) {
      stop("Failed to download the 'googler' script: ", conditionMessage(e),
        call. = FALSE)
    }
  )
  ## download.file() signals some failures only through a non-zero status;
  ## never cache a path that does not hold a downloaded script
  if (!identical(as.integer(status), 0L) || !file.exists(path)) {
    unlink(path)
    stop("Failed to download the 'googler' script.", call. = FALSE)
  }
  options(googler.path = path)
  path
}

Try the googler package in your browser

Any scripts or data that you put into this service are public.

googler documentation built on Sept. 4, 2019, 5:03 p.m.