#' Base instruction class
#'
#' `instruction()` bundles everything `chewie` needs in order to scrape one
#' item: where to search for it and how the value found should be returned.
#'
#' @param title an arbitrary name for the scraped object
#' @param path a css or xpath path to the object to be scraped
#' @param selector whether `path`/`alternative_path` is a css or xpath selector,
#' defaults to `"css"`
#' @param alternative_path an alternative css or xpath path to the object to be
#' scraped, defaults to `NULL`
#' @param parse_as indicates if an extractor should be applied to the resulting
#' scraped item, defaults to `NULL`. Currently available extractors are:
#' * `extract_text`
#' * `extract_numeric`
#' * `extract_table`
#' * `extract_date`
#' * `extract_datetime`
#' * `extract_timedelta`
#' * `extract_price`.
#' @param pattern a RegEx pattern to be applied before parsing, defaults to
#' `NULL`
#'
#' @return a `chewie_instruction` object: a list holding the supplied arguments
#' plus a `result` field that starts out as `NULL`
#'
#' @examples
#' sample_instruction <- instruction(
#'   title = "price_header",
#'   selector = "css",
#'   path = "h1:nth-of-type(1)",
#'   alternative_path = "h2:nth-of-type(2)",
#'   parse_as = "text"
#' )
#'
#' @export
instruction <- function(
    title, path, selector = "css", alternative_path = NULL,
    parse_as = NULL, pattern = NULL
) {
  # `list()` keeps NULL-valued entries, so every field is always present
  fields <- list(
    title = title,
    path = path,
    alternative_path = alternative_path,
    selector = selector,
    pattern = pattern,
    parse_as = parse_as,
    result = NULL
  )
  class(fields) <- "chewie_instruction"
  fields
}
#' Test for `chewie_instruction` objects
#'
#' @param x any R object
#'
#' @return `TRUE` if `x` inherits from `chewie_instruction`, `FALSE` otherwise
#'
#' @export
is.chewie_instruction <- function(x) {
  inherits(x, what = "chewie_instruction")
}
#' Print a `chewie_instruction`
#'
#' Writes a short, human-readable summary of the instruction to the console.
#' Fields holding several values are shown comma-separated, a `data.frame`
#' result is summarised by its dimensions, and any other non-atomic result is
#' summarised by its class and length.
#'
#' @param x a `chewie_instruction` object
#' @param ... ignored; accepted so the method matches the `print()` generic
#'
#' @return `x`, invisibly
#'
#' @export
print.chewie_instruction <- function(x, ...) {
  # Collapse one field into a single display string
  as_label <- function(value) {
    if (inherits(value, "data.frame")) {
      return(paste0("a ", nrow(value), "x", ncol(value), " `data.frame`"))
    }
    # NULL and other empty values print as nothing
    if (length(value) == 0L) {
      return("")
    }
    # `cat()` cannot handle arbitrary lists/objects, so describe them instead
    if (!is.atomic(value)) {
      return(paste0("<", class(value)[[1L]], "> of length ", length(value)))
    }
    # `format()` pads to a common width; trim so values read naturally
    pieces <- if (is.character(value)) value else trimws(format(value))
    paste(pieces, collapse = ", ")
  }

  shown <- list(
    title = x$title,
    path = x$path,
    selector = x$selector,
    `parse as` = x$parse_as,
    pattern = x$pattern,
    result = x$result
  )

  cat("<chewie_instruction>", "\n", sep = "")
  for (label in names(shown)) {
    cat(" * ", label, ": ", as_label(shown[[label]]), "\n", sep = "")
  }
  invisible(x)
}
#' Converts objects into instruction
#'
#' \code{as_instruction} turns a named list or vector into a
#' `chewie_instruction` object
#'
#' This generic lets users and internal functions convert other objects into
#' the `chewie_instruction` class. Because the package currently treats
#' `data.frame`-like objects (later transformed into `chewie_scheme` objects) as
#' the main target of user experience, rows should be parsed as instructions
#' above all else.
#'
#' @param x an object generated by `list()`
#'
#' @export
#' @rdname as_instruction
as_instruction <- function(x) {
  # S3 dispatch on the class of `x`
  UseMethod("as_instruction", x)
}
#' @export
#' @rdname as_instruction
as_instruction.list <- function(x) {
  # `[[` matches names exactly (no partial matching) and yields NULL for
  # entries the list does not have.
  selector <- x[["selector"]]
  # Plain `if` rather than `ifelse()`: the latter strips attributes and
  # truncates to the length of the test.
  if (is.null(selector)) {
    selector <- "css"
  }

  instruction(
    title = x[["title"]],
    path = x[["path"]],
    selector = selector,
    # Forward the fallback path too, so it is not lost in the conversion
    alternative_path = x[["alternative_path"]],
    pattern = x[["pattern"]],
    parse_as = x[["parse_as"]]
  )
}
#' Runs an instruction on a HTML page
#'
#' \code{execute_instruction} executes a `chewie_instruction` on a parsed HTML
#' page and delivers a polished scraped item to the user.
#'
#' The instruction is first passed to `check_instruction()`. The elements found
#' on `page` are then handed to `parse_field()` together with the instruction's
#' `parse_as` and `pattern`, and the outcome is stored in the instruction's
#' `result` field.
#'
#' @param page a `xml_document` object
#' @param instruction a `chewie_instruction` object
#'
#' @return a `chewie_instruction` object containing the extracted element from
#' the `path`/`alternative_path` provided.
#'
#' @export
execute_instruction <- function(page, instruction) {
  check_instruction(instruction)
  result <- parse_field(
    find_elements(page, instruction),
    as = instruction$parse_as,
    pattern = instruction$pattern
  )
  instruction_set_result(instruction, result)
}
#' @param x a `chewie_instruction` object whose `result` field should be set
#' @param result the value to store in the `result` field of `x`; may be `NULL`
#'
#' @export
#' @rdname instruction
instruction_set_result <- function(x, result) {
  # `x$result <- NULL` would delete the field altogether; wrapping the value in
  # a list and assigning with `[` keeps the `result` slot even when it is NULL.
  x["result"] <- list(result)
  x
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.