R/instruction.R

Defines functions instruction_set_result execute_instruction as_instruction.list as_instruction print.chewie_instruction is.chewie_instruction instruction

Documented in as_instruction as_instruction.list execute_instruction instruction instruction_set_result

#' Base instruction class
#'
#' `instruction` tells `chewie` what to scrape: it bundles the parameters used
#'   to locate an object on a page and to parse the value that is returned.
#'
#' @param title an arbitrary name for the scraped object
#' @param selector whether `path`/`alternative_path` is a css or xpath selector,
#'   defaults to `"css"`
#' @param path a css or xpath path to the object to be scraped
#' @param alternative_path an alternative css or xpath path to the object to be
#'   scraped
#' @param parse_as indicates if an extractor should be applied to the resulting
#'   scraped item. Currently available extractors are:
#'   * `extract_text`
#'   * `extract_numeric`
#'   * `extract_table`
#'   * `extract_date`
#'   * `extract_datetime`
#'   * `extract_timedelta`
#'   * `extract_price`.
#' @param pattern a RegEx pattern to be applied before parsing
#'
#' @return a `chewie_instruction` object
#'
#' @examples
#' sample_instruction <- instruction(
#'   title = "price_header",
#'   selector = "css",
#'   path = "h1:nth-of-type(1)",
#'   alternative_path = "h2:nth-of-type(2)",
#'   parse_as = "text"
#' )
#'
#' @export
instruction <- function(
  title, path, selector = "css", alternative_path = NULL,
  parse_as = NULL, pattern = NULL
) {
  # Collect every field in a plain list first. Building it with `list()`
  # (rather than assigning fields one by one) keeps the `NULL`-valued slots,
  # including the initially empty `result`, as real elements of the object.
  fields <- list(
    title            = title,
    path             = path,
    alternative_path = alternative_path,
    selector         = selector,
    pattern          = pattern,
    parse_as         = parse_as,
    result           = NULL
  )

  # Tag the list with the S3 class used by the rest of the file.
  class(fields) <- "chewie_instruction"
  fields
}

#' @export
is.chewie_instruction <- function(x) {
  # TRUE when `x` carries the "chewie_instruction" S3 class, FALSE otherwise.
  inherits(x, what = "chewie_instruction")
}

#' @export
print.chewie_instruction <- function(x, ...) {
  # Prints a short, human-readable summary of a `chewie_instruction`.
  #
  # x:   the `chewie_instruction` to display.
  # ...: accepted and ignored, so the signature stays compatible with the
  #      `print(x, ...)` generic, which may forward extra arguments to its
  #      methods.
  #
  # Returns `x` invisibly, so the call can be used inside a pipeline.
  cat("<chewie_instruction>", "\n", sep = "")
  cat("    * title:    ", x$title, "\n", sep = "")
  cat("    * path:     ", x$path, "\n", sep = "")
  cat("    * selector: ", x$selector, "\n", sep = "")
  cat("    * parse as: ", x$parse_as, "\n", sep = "")
  cat("    * pattern:  ", x$pattern, "\n", sep = "")

  if (inherits(x$result, "data.frame")) {
    # Tabular results are summarised by their dimensions instead of dumped.
    cat(
      "    * result:   a ", nrow(x$result), "x", length(x$result),
      " `data.frame`", "\n",
      sep = ""
    )
  } else {
    cat("    * result:   ", x$result, "\n", sep = "")
  }

  invisible(x)
}

#' Converts objects into instruction
#'
#' \code{as_instruction} parses a named list or vector as a `chewie_instruction`
#'   object
#'
#' The function helps users and internal functions to deal with object
#' conversions into `chewie_instruction` class. As the package currently views
#' `data.frame` type of objects (later transformed into `chewie_scheme` objects)
#' as the main target of user experience, rows should be parsed as instructions
#' above all else.
#'
#' @param x an object generated by `list()`
#'
#' @export
#' @rdname as_instruction
as_instruction <- function(x) {
  # S3 generic: the method is chosen from the class of `x`.
  UseMethod("as_instruction", x)
}

#' @export
#' @rdname as_instruction
as_instruction.list <- function(x) {
  # Builds a `chewie_instruction` from the named elements of a list.
  #
  # x: a list that may carry `title`, `path`, `selector`, `alternative_path`,
  #    `parse_as` and `pattern`; absent elements are forwarded as `NULL`.
  #
  # Returns whatever `instruction()` returns for those fields.

  # Fall back to "css" only when no selector was supplied. A plain `if` is used
  # instead of `ifelse()`, which strips attributes from its result (e.g. it
  # would turn a factor selector into its integer code).
  selector <- x$selector
  if (is.null(selector)) {
    selector <- "css"
  }

  instruction(
    title = x$title,
    path = x$path,
    selector = selector,
    # Forwarded so a fallback path given in the list is not silently dropped.
    alternative_path = x$alternative_path,
    parse_as = x$parse_as,
    pattern = x$pattern
  )
}

#' Runs an instruction on a HTML page
#'
#' \code{execute_instruction} executes a `chewie_instruction` on a parsed HTML
#'   page and delivers a polished scraped item to the user.
#'
#' @param page an `xml_document` object
#' @param instruction a `chewie_instruction` object
#'
#' @return a `chewie_instruction` object containing the extracted element from
#'   the `path`/`alternative_path` provided.
#'
#' @export
execute_instruction <- function(page, instruction) {
  # Runs before any scraping work; presumably signals an error for a
  # malformed instruction (helper is defined elsewhere, confirm there).
  check_instruction(instruction)

  # Locate the target element(s) on the page, then hand them to the parser
  # together with the instruction's `parse_as` and `pattern` settings.
  scraped <- parse_field(
    find_elements(page, instruction),
    as = instruction$parse_as,
    pattern = instruction$pattern
  )

  # Hand back the instruction with the scraped value attached.
  instruction_set_result(instruction, scraped)
}

#' @export
#' @rdname instruction
instruction_set_result <- function(x, result) {
  # Store `result` in the `result` slot of `x` and return the updated copy;
  # the object passed in by the caller is left unmodified.
  x[["result"]] <- result
  x
}
leonardodiegues/chewie documentation built on Dec. 21, 2021, 10:41 a.m.