# R/scraper_core.R
#
# Defines set_param(), params(), query(), query_ls(), json_tree_handler(),
# and get_data().

#' @importFrom stats runif
#' @importFrom rjson fromJSON
#' @importFrom httr GET user_agent verbose
#' @importFrom xml2 read_html
#' @importFrom rvest html_text
#' @importFrom tibble tibble
#' @importFrom dplyr if_else bind_rows transmute %>%
#' @importFrom tidyr expand_grid
#' @importFrom purrr transpose insistently rate_backoff


### this function generates the full query URL for one request ###
set_param <- function(param_list, for_url = "") {
  db <- param_list[["db"]]
  categ <- param_list[["categ"]]
  # Drop the two bookkeeping entries; everything left becomes a query parameter.
  params <- param_list[setdiff(names(param_list), c("db", "categ"))]
  # The callback entry holds a generator function; call it to obtain a fresh
  # jsonp callback name for this request.
  params[["callback"]] <- params[["callback"]]()
  paste0(for_url, "/", db, "?", paste(names(params), params, sep = "=", collapse = "&"))
}
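
### A quick illustration of set_param(); the values below are made up for
### demonstration and are not real request parameters:
# example_params <- list(
#   db = "historycurve.jsonp",
#   dt = "province",
#   id = "110000",
#   type = "move_in",
#   date = "",
#   callback = function() "jsonp_1600000000000_1234567",
#   categ = "inflow"
# )
# set_param(example_params, "https://huiyan.baidu.com/migration")
# #> "https://huiyan.baidu.com/migration/historycurve.jsonp?dt=province&id=110000&type=move_in&date=&callback=jsonp_1600000000000_1234567"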


### this function generates a list of query parameters ###
params <- function(admin_codes, category) {
  ### generates a unique jsonp callback name: a millisecond unix timestamp
  ### plus a random seven-digit suffix ###
  gen_callback <- function() {
    paste(
      "jsonp",
      format(as.numeric(Sys.time()) * 1000, digits = 13L),
      as.integer(runif(1L) * 8999999 + 1e6),
      sep = "_"
    )
  }

  # Admin codes ending in four zeros denote province-level units;
  # internal-flow queries always use the city endpoint.
  gen_admin_level <- function(admin_code, categ) {
    if_else(
      as.integer(admin_code) %% 10000L == 0L &
        categ != "internal_flow",
      "province", "city"
    )
  }

  # One row per (category, admin code) combination; transpose() then turns
  # the tibble into a list of per-query parameter lists.
  url_dta <- expand_grid(categ = category, id = admin_codes)
  transpose(transmute(
    url_dta,
    db = unname(c(
      "inflow" = "historycurve.jsonp",
      "outflow" = "historycurve.jsonp",
      "internal_flow" = "internalflowhistory.jsonp"
    )[categ]),
    dt = gen_admin_level(id, categ),
    id = as.character(id),
    type = unname(c(
      "inflow" = "move_in",
      "outflow" = "move_out",
      "internal_flow" = ""
    )[categ]),
    date = unname(c(
      "inflow" = "",
      "outflow" = "",
      "internal_flow" = format(Sys.Date(), "%Y%m%d")
    )[categ]),
    callback = list(gen_callback),
    categ = categ
  ))
}
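
### For reference, each element of the list params() returns looks like the
### list in the set_param() example above; illustrative output:
# str(params("110000", "inflow")[[1L]])
# #> List of 7
# #>  $ db      : chr "historycurve.jsonp"
# #>  $ dt      : chr "province"
# #>  $ id      : chr "110000"
# #>  $ type    : chr "move_in"
# #>  $ date    : chr ""
# #>  $ callback:function ()
# #>  $ categ   : chr "inflow"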


### this is the main function: it issues a single query call ###
query <- function(param_list, master_url, nap, verbose) {
  # Optional pause before the request, to space out consecutive calls.
  if (!is.null(nap)) Sys.sleep(nap())
  q_str <- set_param(param_list, master_url)
  trial <- paste0("- Querying: ", basename(q_str), "...")
  message(trial)
  trial <-
    q_str %>%
    # Retry the GET request with exponential backoff if it fails.
    insistently(GET, rate_backoff(5), quiet = FALSE)(
      # Rotate the user agent on every request.
      user_agent(sample(ua_list, 1L)),
      # `verbose` is the logical argument; `verbose()` still resolves to
      # httr::verbose() because R skips non-function bindings when a name
      # is used as a call.
      if (verbose) verbose() else NULL
    ) %>%
    read_html() %>%
    html_text() %>%
    # Strip the jsonp wrapper, keeping only the JSON payload.
    gsub("jsonp_\\d{13}_\\d{7}\\((.*)\\)", "\\1", .)
  trial <- fromJSON(trial)
  # Record provenance for json_tree_handler().
  attr(trial, "src") <- q_str
  attr(trial, "place") <- param_list[["id"]]
  attr(trial, "categ") <- param_list[["categ"]]
  trial
}
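
### The gsub() pattern above unwraps a jsonp response; for instance:
# gsub(
#   "jsonp_\\d{13}_\\d{7}\\((.*)\\)", "\\1",
#   'jsonp_1600000000000_1234567({"errno":0,"data":{"list":{}}})'
# )
# #> "{\"errno\":0,\"data\":{\"list\":{}}}"
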
### runs the queries one by one; if one errors, the partial cache is still
### returned, thanks to the return() inside on.exit() ###
query_ls <- function(master_url, params, f_nap, verbose) {
  on.exit(return(cache))
  cache <- list()
  while (length(params) > 0L) {
    cache <- c(cache, list(query(params[[1L]], master_url, f_nap, verbose)))
    params <- params[-1L]
  }
}
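
### Usage sketch (requires network access, so shown commented out; the
### admin codes are examples from baidu_names):
# pl <- params(c("110000", "120000"), "inflow")
# raw <- query_ls("https://huiyan.baidu.com/migration", pl,
#                 function() runif(1L, .5, 1), verbose = FALSE)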


### this function processes the json data received ###
json_tree_handler <- function(json_tree) {
  node_process <- function(x) {
    dta <- unlist(x[["data"]][["list"]])
    src <- attr(x, "src", TRUE)
    place <- attr(x, "place", TRUE)
    categ <- attr(x, "categ", TRUE)
    # An empty payload (e.g. no data for that place/category) is dropped.
    if (is.null(dta)) return(NULL)
    tibble(
      data = list(tibble(admin_code = place, date = names(dta), category = categ, value = unname(dta))),
      meta = list(tibble(admin_code = place, category = categ, source = src))
    )
  }
  tryCatch(
    bind_rows(lapply(json_tree, node_process)),
    # On failure, warn and stash the raw tree in the global environment so
    # the scraped data can be inspected and recovered by hand.
    error = function(e) {warning(e); assign("json_tree", json_tree, .GlobalEnv)}
  )
}
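
### Schematically, the handler returns a two-column nested tibble, one row
### per successful query ("n" and "m" below are placeholders):
# #> # A tibble: n x 2
# #>   data             meta
# #>   <list>           <list>
# #> 1 <tibble [m x 4]> <tibble [1 x 3]>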


#' Get the data you need from qianxi.baidu.
#' @description This function invokes query calls based on the parameters you
#'   provide.
#' @param admin_codes a vector of administrative codes for each city that you
#'   want to query. See \code{\link{baidu_names}} for a full list of administrative codes.
#' @param category one or more of the following values: "inflow", "outflow", and
#'   "internal_flow". Each of these values corresponds to one type of data on
#'   Baidu's website.
#' @param url the base URL of the website to be scraped. Defaults to
#'   \href{https://huiyan.baidu.com/migration}{Baidu Huiyan}.
#' @param nap_control a function returning the length (in seconds) of the
#'   pause between two consecutive query calls. Use it to avoid being caught
#'   by any anti-bot mechanism.
#' @param verbose whether or not to show additional information for a query
#'   call. Defaults to \code{FALSE}.
#' @param storage an existing data frame to which the data received are
#'   appended. If you provide this argument, it also becomes the default
#'   value that the function returns. Thus, even if a query call fails, your
#'   existing data will not be overwritten by a \code{NULL} value.
#' @return A nested tibble with two columns \code{data} and \code{meta}. Each
#'   row stores the data received from one query call.
#' @examples
#' # Query the inflow and outflow data for both Beijing ("110000") and Tianjin ("120000").
#' # The returned tibble has a total of four rows.
#' df1 <- get_data(c("110000", "120000"), c("inflow", "outflow"))
#'
#' \dontrun{
#' # Use an existing object/tibble to store your returned data.
#' storage_space <- df1
#' storage_space <- get_data("110000", "internal_flow", storage = storage_space)
#' }
#' @export
get_data <- function(admin_codes,
                     category = c("inflow", "outflow", "internal_flow"),
                     url = "https://huiyan.baidu.com/migration",
                     nap_control = function() runif(1L, .5, 1),
                     verbose = FALSE,
                     storage = tibble()) {
  # query_ls() returns whatever it has collected so far even when a call
  # fails, so the result is appended to `storage` rather than replacing it.
  bind_rows(storage, json_tree_handler(query_ls(
    url,
    params(admin_codes, category),
    nap_control,
    verbose
  )))
}