R/extractors.R

Defines functions first_row_is_header detect_url_cols extract_table extract_datetime extract_price extract_numeric extract_text parse_field find_elements

Documented in extract_datetime extract_numeric extract_price extract_table extract_text find_elements parse_field

#' Find elements in a parsed HTML page
#'
#' \code{find_elements} scrapes a HTML page using `rvest::html_elements` and
#'   paths described in a `chewie_instruction` object. If `path` argument in
#'   instruction doesn't find any results it will automatically switch to an
#'   `alternative_path` if it is described in instruction as well.
#'
#' @param page a `xml_document` object
#' @param instruction a `chewie_instruction` object
#'
#' @return correspondent `xml_node` or `xml_nodeset` of the described `path`/
#'   `alternative_path`
#'
#' @examples
#' \dontrun{
#' sample_instruction <- instruction(
#'   title = "price_header",
#'   path = "h1:nth-of-type(1)",
#'   selector = "css",
#'   alternative_path = "h2:nth-of-type(2)",
#'   parse_as = "text"
#' )
#'
#' sample_page <- rvest::read_html(response)
#'
#' results <- find_elements(sample_page, sample_instruction)
#' }
#'
#' @export
find_elements <- function(page, instruction) {
  selector <- instruction$selector
  path <- instruction$path

  path_args <- list()
  path_args$x <- page
  path_args[selector[1]] <- path

  results <- do.call(rvest::html_elements, path_args)

  if (is.null(results) | length(results) == 0) {
    path_args[selector[1]] <- instruction$alternative_path

    results <- do.call(rvest::html_elements, path_args)
  }

  results
}

#' Extract texts from `xml_document` or `xml_node` element.
#'
#' Text parser using methods from both `stringr` and `rvest`. Automatically
#'   vectorised by both packages properties, can return a list of text or a
#'   single text
#'
#' @param field a `xml_node` or `xml_nodeset` element
#' @param output_type the type of extractor to be utilized
#' @param pattern a valid pattern pattern
#'
#' @keywords parser
#'
#' @export
parse_field <- function(field, as = NULL, pattern = NULL) {
  check_html_nodes(field)

  if (missing(as)) {
    return(field)
  }

  valid_outputs <- c(
    "text",
    "numeric",
    "table",
    "date",
    "datetime",
    "timedelta",
    "price"
  )

  switch(as,
    text      = extract_text(field, pattern),
    numeric   = extract_numeric(field, pattern),
    table     = extract_table(field),
    date      = extract_date(field, pattern),
    datetime  = extract_datetime(field, pattern),
    timedelta = extract_timedelta(field, pattern),
    price     = extract_price(field, pattern),
    rlang::abort(sprintf("Invalid `as` value: '%s'", as))
  )
}

#' Extract texts from `xml_document` or `xml_node` element.
#'
#' Text parser using methods from both `stringr` and `rvest`. Automatically
#'   vectorised by both packages properties, can return a list of text or a
#'   single text
#'
#' @param field a `xml_node` or `xml_nodeset` element
#' @param pattern a valid RegEx pattern
#'
#' @family extractors
#'
#' @keywords extractor
#'
#' @export
extract_text <- function(field, pattern = NULL) {
  txt <- rvest::html_text2(field)

  if (!is.null(pattern)) {
    txt <- stringr::str_extract(txt, pattern)
  }

  txt
}

#' Extract texts from `xml_node` object.
#'
#' Wrapper around `extract_text`. Converts extracted texts to numeric.
#'
#' @param field a `xml_node` or `xml_nodeset` element
#' @param pattern a valid RegEx pattern
#'
#' @return
#'
#' @family extractors
#'
#' @keywords extractor
#'
#' @export
extract_numeric <- function(field, pattern = NULL) {
  txt <- extract_text(field, pattern)
  as.numeric(txt)
}

#' Extract prices from `xml_node` elements.
#'
#' Wrapper around `extract_text`. Extracts digits from a string, convert them
#'   into a single integer and divide it by 100, considering that prices won't have
#'   more than 2 decimal digits.
#'
#' @param field a `xml_node` or `xml_nodeset` element
#' @param pattern a valid RegEx pattern
#'
#' @family extractors
#'
#' @keywords extractor
#'
#' @export
extract_price <- function(field, pattern = NULL) {
  txt <- extract_text(field, pattern)

  price <- txt %>%
    stringr::str_replace_all("\\D", "") %>%
    as.numeric() %>%
    magrittr::divide_by(100)

  price
}

#' Extract datetime from `xml_node` elements.
#'
#' @param field a `xml_node` or `xml_nodeset` element
#' @param pattern a valid RegEx pattern
#'
#' @export
extract_datetime <- function(field, pattern = NULL, ...) {
  txt <- extract_text(field, pattern)
  lubridate::as_datetime(txt, ...)
}

#' Extract table from `xml_node` elements.
#'
#' `extract_table` retrieves a table as `data.frame` from a parsed HTML page.
#'
#' Users should expect unstable results when `parse_url_cols` is TRUE due to
#'   uncovered cases. This function is an ongoing process and shall be modified
#'   in the future.
#'
#' @param field a `xml_node` or `xml_nodeset` element
#' @param parse_url_cols whether table should include `href` attributes from
#'   columns that have `<a>` tags attached to them, defaults to FALSE.
#'
#' @return a `data.frame` containing the extracted table.

#' @export
extract_table <- function(field, parse_url_cols = TRUE, ...) {
  tbl <- rvest::html_table(field, ...)

  if (!inherits(tbl, "data.frame")) {
    if (length(tbl) > 1) {
      rlang::abort("`field` parameter should aim one table at a time.")
    } else {
      tbl <- tbl[[1]]
    }
  }

  if (parse_url_cols) {
    url_cols <- detect_url_cols(field)

    rows_ <- rvest::html_elements(field, "tr")

    if (first_row_is_header(field)) {
      rows_ <- rows_[2:length(rows_)]
    }

    new_cols <- list()
    new_cols <- lapply(seq_along(url_cols), \(i) new_cols[[i]] <- list())
    names(new_cols) <- lapply(url_cols, \(i) paste0(colnames(tbl)[i], "_url"))

    for (row_ in rows_) {
      cols_ <- rvest::html_elements(row_, "td")

      for (i in seq_along(url_cols)) {
        col_ <- cols_[url_cols[i]]

        url_ <- col_ |>
          rvest::html_element("a") |>
          rvest::html_attr("href")

        new_cols[[i]] <- append(new_cols[[i]], url_)
      }
    }

    new_cols <- data.frame(lapply(new_cols, function(x) Reduce(c, x)))

    tbl <- dplyr::bind_cols(tbl, new_cols)
  }

  tbl
}

#' @export
detect_url_cols <- function(field) {
  cols_ <- c()

  all_rows <- rvest::html_elements(field, "tr")

  if (first_row_is_header(field)) {
    all_rows <- all_rows[2:length(all_rows)]
  }

  for (row_ in all_rows) {
    row_cols <- rvest::html_elements(row_, "td")

    for (i in seq_along(row_cols)) {
      col_ <- row_cols[i]

      if (i %in% cols_) next

      check_for_link <- rvest::html_element(col_, "a")

      if (!is.na(check_for_link)) {
        cols_ <- append(cols_, i)
      }
    }
  }

  sort(cols_)
}

first_row_is_header <- function(field) {
  !is.na(rvest::html_element(field, "tr > th"))
}
leonardodiegues/chewie documentation built on Dec. 21, 2021, 10:41 a.m.