R/search-pv.R

#' @noRd
get_base <- function(endpoint)
  sprintf("http://www.patentsview.org/api/%s/query", endpoint)

#' @noRd
tojson_2 <- function(x, ...) {
  json <- jsonlite::toJSON(x, ...)
  if (!grepl("[[:alnum:]]", json, ignore.case = TRUE)) json <- ""
  json
}

#' @noRd
to_arglist <- function(fields, subent_cnts, mtchd_subent_only,
                       page, per_page, sort) {
  list(
    fields = fields,
    sort = list(as.list(sort)),
    opts = list(
      include_subentity_total_counts = subent_cnts,
      matched_subentities_only = mtchd_subent_only,
      page = page,
      per_page = per_page
    )
  )
}

#' @noRd
get_get_url <- function(query, base_url, arg_list) {
  j <- paste0(
    base_url,
    "?q=", query,
    "&f=", tojson_2(arg_list$fields),
    "&o=", tojson_2(arg_list$opts, auto_unbox = TRUE),
    "&s=", tojson_2(arg_list$sort, auto_unbox = TRUE)
  )
  utils::URLencode(j)
}

#' @noRd
get_post_body <- function(query, arg_list) {
  body <- paste0(
    '{',
    '"q":', query, ",",
    '"f":', tojson_2(arg_list$fields), ",",
    '"o":', tojson_2(arg_list$opts, auto_unbox = TRUE), ",",
    '"s":', tojson_2(arg_list$sort, auto_unbox = TRUE),
    '}'
  )
  gsub('(,"[fs]":)([,}])', paste0('\\1', "{}", '\\2'), body)
}

#' @noRd
one_request <- function(method, query, base_url, arg_list, error_browser, ...) {
  ua <- httr::user_agent("https://github.com/ropensci/patentsview")

  if (method == "GET") {
    get_url <- get_get_url(
      query = query, base_url = base_url, arg_list = arg_list
    )
    resp <- httr::GET(url = get_url, ua, ...)
  } else {
    body <- get_post_body(query = query, arg_list = arg_list)
    resp <- httr::POST(url = base_url, body = body, ua, ...)
  }

  if (httr::http_error(resp))
    throw_er(resp = resp, error_browser = error_browser)

  process_resp(resp = resp)
}

#' @noRd
request_apply <- function(ex_res, method, query, base_url, arg_list,
                          error_browser, ...) {
  req_pages <- ceiling(ex_res$query_results[[1]]/10000)
  if (req_pages < 1)
    stop("No records matched your query...Can't download multiple pages",
         .call = FALSE)

  tmp <- sapply(1:req_pages, function(i) {
    arg_list$opts$per_page <- 10000
    arg_list$opts$page <- i
    x <- one_request(
      method = method, query = query, base_url = base_url, arg_list = arg_list,
      error_browser = error_browser, ...
    )
    x$data[[1]]
  }, simplify = FALSE)

  do.call("rbind", c(tmp, make.row.names = FALSE))
}

#'Search PatentsView
#'
#'This function makes an HTTP request to the PatentsView API for data matching
#'the user's query.
#'
#'@param query The query that the API will use to filter records. \code{query}
#'  can come in any one of the following forms:
#'  \itemize{
#'    \item A character string with valid JSON. \cr
#'    E.g., \code{'{"_gte":{"patent_date":"2007-01-04"}}'}
#'
#'    \item A list which will be converted to JSON by \code{search_pv}. \cr
#'    E.g., \code{list("_gte" = list("patent_date" = "2007-01-04"))}
#'
#'    \item An object of class \code{pv_query}, which you create by calling one
#'    of the functions found in the \code{\link{qry_funs}} list...See the
#'    \href{http://ropensci.github.io/patentsview/articles/writing-queries.html}{writing
#'    queries vignette} for details.\cr
#'    E.g., \code{qry_funs$gte(patent_date = "2007-01-04")}
#'  }
#'@param fields A character vector of the fields that you want returned to you.
#'  A value of \code{NULL} indicates that the default fields should be
#'  returned. Acceptable fields for a given endpoint can be found at the API's
#'  online documentation (e.g., check out the field list for the
#'  \href{http://www.patentsview.org/api/patent.html#field_list}{patents
#'  endpoint}) or by viewing the \code{fieldsdf} data frame
#'  (\code{View(fieldsdf)}). You can also use \code{\link{get_fields}} to list
#'  out the fields available for a given endpoint.
#'@param endpoint The web service resource you wish to search. \code{endpoint}
#'  must be one of the following: "patents", "inventors", "assignees",
#'  "locations", "cpc_subsections", "uspc_mainclasses", or "nber_subcategories".
#'@param subent_cnts Do you want the total counts of unique subentities to be
#'  returned? This is equivalent to the \code{include_subentity_total_counts}
#'  parameter found \href{http://www.patentsview.org/api/query-language.html#options_parameter}{here}.
#'@param mtchd_subent_only Do you want only the subentities that match your
#'  query to be returned? A value of \code{TRUE} indicates that the subentity
#'  has to meet your query's requirements in order for it to be returned, while
#'  a value of \code{FALSE} indicates that all subentity data will be returned,
#'  even those records that don't meet your query's requirements. This is
#'  equivalent to the \code{matched_subentities_only} parameter found
#'  \href{http://www.patentsview.org/api/query-language.html#options_parameter}{here}.
#'@param page The page number of the results that should be returned.
#'@param per_page The number of records that should be returned per page. This
#'  value can be as high as 10,000 (e.g., \code{per_page = 10000}).
#'@param all_pages Do you want to download all possible pages of output? If
#'  \code{all_pages = TRUE}, the values of \code{page} and \code{per_page} are
#'  ignored.
#'@param sort A named character vector where the name indicates the field to
#'  sort by and the value indicates the direction of sorting (direction should
#'  be either "asc" or "desc"). For example, \code{sort = c("patent_number" =
#'  "asc")} or \cr\code{sort = c("patent_number" = "asc", "patent_date" =
#'  "desc")}. \code{sort = NULL} (the default) means do not sort the results.
#'  You must include any fields that you wish to sort by in \code{fields}.
#'@param method The HTTP method that you want to use to send the request.
#'  Possible values include "GET" or "POST". Use the POST method when
#'  your query is very long (say, over 2,000 characters in length).
#'@param error_browser The program used to view the HTML error
#'  messages sent by the API. This should be one of the following:
#'  \itemize{
#'    \item The string "false" (the default), which turns error browsing off.
#'    Instead of viewing the HTML in this case, you will get a portion of the
#'    parsed HTML text returned to you as an R error.
#'
#'    \item The name of the program to use to view the HTML error message. If
#'    the name of the program is on your PATH, just specify the program name
#'    (e.g., \code{error_browser = "chrome"}). Otherwise, include the full
#'    path to the program.
#'
#'    \item An R function to be called to invoke the browser (e.g., \cr
#'    \code{error_browser = rstudioapi::viewer})
#'
#'    \item Under Windows, a value of \code{NULL} is allowed and implies that
#'    the file association mechanism will be used to determine which browser is
#'    used.
#'  }
#'@param ... Arguments passed along to httr's \code{\link[httr]{GET}} or
#'  \code{\link[httr]{POST}} function.
#'
#'@return A list with the following three elements:
#'  \describe{
#'    \item{data}{A list with one element - a named data frame containing the
#'    data returned by the server. Each row in the data frame corresponds to a
#'    single value for the primary entity. For example, if you search the
#'    assignees endpoint, then the data frame will be on the assignee-level,
#'    where each row corresponds to a single assignee. Fields that are not on
#'    the assignee-level would be returned in list columns.}
#'
#'    \item{query_results}{Entity counts across all pages of output (not just
#'    the page returned to you). If you set \code{subent_cnts = TRUE}, you will
#'    be returned both the counts of the primary entities and the subentities.}
#'
#'    \item{request}{Details of the HTTP request that was sent to the server.
#'    When you set \code{all_pages = TRUE}, you will only get a sample request.
#'    In other words, you will not be given multiple requests for the multiple
#'    calls that were made to the server (one for each page of results).}
#'  }
#'
#' @examples
#' search_pv(query = '{"_gt":{"patent_year":2010}}')
#'
#' search_pv(
#'   query = qry_funs$gt(patent_year = 2010),
#'   fields = get_fields("patents", c("patents", "assignees"))
#' )
#'
#' search_pv(
#'   query = qry_funs$gt(patent_year = 2010),
#'   method = "POST",
#'   fields = "patent_number",
#'   sort = c("patent_number" = "asc")
#' )
#'
#' search_pv(
#'   query = qry_funs$eq(inventor_last_name = "crew"),
#'   all_pages = TRUE
#' )
#'
#' search_pv(
#'   query = qry_funs$contains(inventor_last_name = "smith"),
#'   endpoint = "assignees"
#' )
#'
#' search_pv(
#'   query = qry_funs$contains(inventor_last_name = "smith"),
#'   config = httr::timeout(40)
#' )
#'
#'\dontrun{
#'
#' # Will view error message in RStudio's viewer pane:
#' search_pv(
#'    query = with_qfuns(not(text_any(patent_title = "hi"))),
#'    error_browser = rstudioapi::viewer
#' )
#'}
#'@export
search_pv <- function(query,
                      fields = NULL,
                      endpoint = "patents",
                      subent_cnts = FALSE,
                      mtchd_subent_only = TRUE,
                      page = 1,
                      per_page = 25,
                      all_pages = FALSE,
                      sort = NULL,
                      method = "GET",
                      error_browser = getOption("pv_browser"),
                      ...) {

  validate_endpoint(endpoint = endpoint)

  if (is.list(query)) {
    check_query(query = query, endpoint = endpoint)
    query <- jsonlite::toJSON(query, auto_unbox = TRUE)
  }

  validate_misc_args(
    query = query, fields = fields, endpoint = endpoint, method = method,
    subent_cnts = subent_cnts, mtchd_subent_only = mtchd_subent_only,
    page = page, per_page = per_page, sort = sort
  )

  arg_list <- to_arglist(
    fields = fields, subent_cnts = subent_cnts,
    mtchd_subent_only = mtchd_subent_only, page = page, per_page = per_page,
    sort = sort
  )

  base_url <- get_base(endpoint = endpoint)

  res <- one_request(
    method = method, query = query, base_url = base_url,  arg_list = arg_list,
    error_browser = error_browser, ...
  )

  if (!all_pages) return(res)

  full_data <- request_apply(
    ex_res = res, method = method, query = query, base_url = base_url,
    arg_list = arg_list, ...
  )
  res$data[[1]] <- full_data

  res
}
crew102/patentsview documentation built on May 14, 2019, 11:33 a.m.