R/cas_extract.R

Defines functions cas_extract_script cas_extract

Documented in cas_extract cas_extract_script

#' Extract fields and contents from downloaded files
#'
#' @param extractors A named list of functions. See examples for details.
#' @param post_processing Defaults to NULL. If given, it must be a function that
#'   takes a data frame as input (logically, a row of the dataset) and returns
#'   it with additional or modified columns.
#' @param id Defaults to NULL, identifiers to process when extracting. If given,
#'   must be a numeric vector, logically corresponding to the identifiers in the
#'   `id` column, e.g. as returned by ` cas_read_db_contents_id()`
#' @param ignore_id Defaults to TRUE. If TRUE, it checks if identifiers have
#'   been added to the local ignore list, typically with `cas_ignore_id()`, and
#'   as retrieved with `cas_read_db_ignore_id()`. It can also be a numeric
#'   vector of identifiers: the given identifiers will not be processed. If
#'   FALSE, items will be processed normally.
#' @param store_as_character Logical, defaults to TRUE. If TRUE, it converts to
#'   character all extracted contents before writing them to database. This
#'   reduces issues of type conversions with the default database backend (for
#'   example, SQLite automatically converts dates to numeric) or using different
#'   backends. This implies you will need to set data types when you read the
#'   database, but it also means that you can consistently expect all columns to
#'   be character vectors, which in one form or another are consistently
#'   implemented across database backends. Set to FALSE if you want to remain in
#'   control of column types.
#' @param check_previous Logical, defaults to TRUE. If FALSE, no check will be
#'   conducted to verify if the same content had been previously extracted. If
#'   FALSE, `write_to_db` must be set (or will be set) to FALSE, to prevent
#'   duplication of data.
#' @param keep_if_status Defaults to 200. Keep only if recorded download status
#'   matches the given status.
#' @inheritParams cas_download
#'
#' @return
#' @export
#'
#' @examples
#' \dontrun{
#' if (interactive) {
#'   ### Post-processing example ####
#'   # For example, in order to add a column called `internal_id`
#'   # that takes the ending digits of the url (assuming the url ends with digits)
#'   # a function such as the following would be passed to cas_extract
#'   pp <- function(df) {
#'     df |>
#'       dplyr::mutate(internal_id = stringr::str_extract(url, "[[:digit:]]+$"))
#'   }
#' }
#'
#' cas_extract(
#'   extractors = extractors_l, # assuming it has already been set
#'   post_processing = pp
#' )
#' }
cas_extract <- function(extractors,
                        post_processing = NULL,
                        id = NULL,
                        ignore_id = TRUE,
                        custom_path = NULL,
                        index = FALSE,
                        store_as_character = TRUE,
                        check_previous = TRUE,
                        db_connection = NULL,
                        file_format = "html",
                        sample = FALSE,
                        write_to_db = FALSE,
                        keep_if_status = 200,
                        encoding = "UTF-8",
                        ...) {
  ellipsis::check_dots_unnamed()

  db <- cas_connect_to_db(
    db_connection = db_connection,
    ...
  )

  type <- dplyr::if_else(condition = index,
    true = "index",
    false = "contents"
  )

  if (is.null(custom_path)) {
    path <- cas_get_base_path(...)
  } else {
    path_ending <- stringr::str_c(file_format, type, sep = "_")

    custom_path_ending <- fs::path_file(custom_path)

    if (path_ending == custom_path_ending) {
      path <- fs::path(custom_path)
    } else {
      path <- fs::path(
        custom_path,
        path_ending
      )
    }
  }

  previous_download_df <- cas_read_db_download(
    index = index,
    db_connection = db,
    disconnect_db = FALSE,
    ...
  ) %>%
    dplyr::arrange(dplyr::desc(datetime)) %>%
    dplyr::distinct(id, .keep_all = TRUE) %>%
    dplyr::arrange(id, batch, datetime) %>%
    dplyr::filter(status %in% keep_if_status)

  stored_files_df <- previous_download_df %>%
    dplyr::select("id", "batch") %>%
    dplyr::mutate(path = fs::path(
      path,
      batch,
      stringr::str_c(id, "_", batch, ".", file_format)
    )) %>%
    dplyr::select("id", "path")

  if (store_as_character == TRUE) {
    stored_files_df <- stored_files_df %>%
      dplyr::mutate(id = as.character(id))
  }


  if (check_previous == FALSE) {
    files_to_extract_pre_df <- stored_files_df
    write_to_db <- FALSE
  } else {
    # Do not process previously extracted
    previously_extracted_df <- cas_read_db_contents_data(
      db_connection = db,
      disconnect_db = FALSE,
      ...
    )

    if (is.null(previously_extracted_df) == FALSE) {
      previously_extracted_df <- previously_extracted_df %>%
        dplyr::select(id) %>%
        dplyr::collect()

      if (store_as_character == TRUE) {
        previously_extracted_df <- previously_extracted_df %>%
          dplyr::mutate(id = as.character(id))
      }
    }

    if (is.null(previously_extracted_df) == FALSE) {
      files_to_extract_pre_df <- dplyr::anti_join(
        x = stored_files_df,
        y = previously_extracted_df,
        by = "id"
      )
    } else {
      files_to_extract_pre_df <- stored_files_df
    }
  }

  if (nrow(files_to_extract_pre_df) == 0) {
    # TODO return consistently data frame or S3 object
    return(invisible(NULL))
  }

  contents_id_df <- cas_read_db_contents_id(
    db_connection = db,
    disconnect_db = FALSE,
    ...
  ) %>%
    dplyr::collect()

  if (store_as_character == TRUE) {
    contents_id_df <- contents_id_df %>%
      dplyr::mutate(id = as.character(id))
  }

  files_to_extract_df <- files_to_extract_pre_df %>%
    dplyr::inner_join(
      y = contents_id_df,
      by = "id"
    )

  if (is.null(id) == FALSE) {
    id_to_keep <- id
    files_to_extract_df <- files_to_extract_df %>%
      dplyr::filter(id %in% id_to_keep)
  }

  if (isTRUE(ignore_id)) {
    ignore_id <- cas_read_db_ignore_id(
      db_connection = db,
      disconnect_db = FALSE
    ) |>
      dplyr::pull(id)

    files_to_extract_df <- files_to_extract_df %>%
      dplyr::filter(!(id %in% ignore_id))
  } else if (is.numeric(ignore_id)) {
    files_to_extract_df <- files_to_extract_df %>%
      dplyr::filter(!(id %in% ignore_id))
  }

  if (is.numeric(sample) == TRUE) {
    if (sample > nrow(files_to_extract_df)) {
      sample <- nrow(files_to_extract_df)
    }

    files_to_extract_df <- files_to_extract_df %>%
      dplyr::slice_sample(n = sample)
  } else if (isTRUE(sample)) {
    files_to_extract_df <- files_to_extract_df %>%
      dplyr::slice_sample(p = 1)
  }

  if (write_to_db == FALSE) {
    cas_disconnect_from_db(
      db_connection = db,
      disconnect_db = TRUE
    )

    db <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")
  }

  available_files_to_extract_df <- files_to_extract_df %>%
    dplyr::mutate(available = fs::file_exists(path)) %>%
    dplyr::filter(available)

  if (nrow(available_files_to_extract_df) != nrow(files_to_extract_df)) {
    cli::cli_warn(c(
      `x` = glue::glue("Not all downloaded files are currently available in their expected location."),
      `*` = glue::glue("Total files expected:  {scales::number(nrow(files_to_extract_df))}"),
      `*` = glue::glue("Total files available: {scales::number(nrow(available_files_to_extract_df))}"),
      `i` = glue::glue("Only available files will be processed. Consider running `cas_restore()` or otherwise deal with missing files as needed.")
    ))
  }

  purrr::walk(
    .progress = "Extracting",
    .x = purrr::transpose(available_files_to_extract_df),
    function(x) {
      current_html_document <- xml2::read_html(
        x = x$path,
        options = c("RECOVER", "NOERROR", "NOBLANKS", "HUGE"),
        encoding = encoding
      )

      if (inherits(x = current_html_document, what = "xml_node") == FALSE) {
        return(NULL)
      }

      current_df <- names(extractors) %>%
        purrr::set_names() %>%
        purrr::map(.f = function(current_function) {
          current_function <- extractors[[current_function]](current_html_document)
        }) %>%
        tibble::as_tibble() %>%
        dplyr::mutate(
          id = as.numeric(x[["id"]]),
          url = as.character(x[["url"]])
        ) %>%
        dplyr::select("id", "url", dplyr::everything())

      if (is.null(post_processing) == FALSE) {
        if (is.function(post_processing) == FALSE) {
          cli::cli_abort("When given, {.val post_processing} must be a function.")
        }
        current_df <- post_processing(current_df)
      }

      if (store_as_character == TRUE) {
        current_df <- current_df %>%
          dplyr::mutate(dplyr::across(
            .cols = dplyr::everything(),
            .fns = as.character
          ))
      }

      cas_write_to_db(
        df = current_df,
        table = "contents_data",
        db_connection = db,
        disconnect_db = FALSE,
        ...
      )
    }
  )

  if (write_to_db == FALSE) {
    output_df <- cas_read_db_contents_data(
      db_connection = db,
      ...
    )
    return(output_df)
  }
}


#' Extracts scripts from an html page
#'
#' @param script_type Defaults to NULL. Type of script. Common script types
#'   include `application/ld+json`, `text/template`, etc.
#' @param match Default to NULL. If given, used to filter extracted scripts.
#'   Must be a named vector in the format `c(`@type` = "NewsArticle")` for a
#'   script of type "NewsArticle".
#' @param accessors Defaults to NULL. If given, a vector of accessors passed to
#'   `purrr::pluck` in order to extract sub-components of the list resulting
#'   from reading the with `jsonlite` the result of the previous steps and
#'   filter.
#' @param remove_from_script Defaults to NULL. If given, removed after the
#'   script has been extracted but before processing the json.
#'
#' @inheritParams cas_extract_html
#'
#' @return May return a list or a character vector. If no match is found, returns `NA_character_`
#' @export
#'
#' @examples
#' \dontrun{
#' if (interactive()) {
#'   url <- "https://www.digi24.ro/stiri/externe/casa-alba-pune-capat-isteriei-globale-nu-exista-indicii-ca-obiectele-zburatoare-doborate-de-rachetele-sua-ar-fi-extraterestre-2250863"
#'
#'   html_document <- rvest::read_html(x = url)
#'
#'   cas_extract_script(
#'     html_document = html_document,
#'     script_type = "application/ld+json"
#'   )
#'
#'   # get date published
#'   cas_extract_script(
#'     html_document = html_document,
#'     script_type = "application/ld+json",
#'     match = c(`@type` = "NewsArticle"),
#'     accessors = "datePublished"
#'   )
#'
#'   # get title
#'   cas_extract_script(
#'     html_document = html_document,
#'     script_type = "application/ld+json",
#'     match = c(`@type` = "NewsArticle"),
#'     accessors = "headline"
#'   )
#'
#'   # get nested element, e.g. url of the logo of the publisher
#'
#'   cas_extract_script(
#'     html_document = html_document,
#'     script_type = "application/ld+json",
#'     match = c(`@type` = "NewsArticle"),
#'     accessors = c("publisher", "logo", "url")
#'   )
#' }
#' }
cas_extract_script <- function(html_document,
                               script_type = NULL,
                               match = NULL,
                               accessors = NULL,
                               remove_from_script = NULL) {
  if (is.null(script_type) == TRUE) {
    script_pre <- html_document %>%
      rvest::html_elements("script")
  } else {
    script_pre <- html_document %>%
      rvest::html_elements(stringr::str_c("script[type='", script_type, "']"))
  }

  script_l <- purrr::map(
    .x = script_pre,
    .f = function(x) {
      if (is.null(remove_from_script)) {
        x %>%
          rvest::html_text2() %>%
          jsonlite::parse_json()
      } else {
        x %>%
          rvest::html_text2() %>%
          stringr::str_remove_all(pattern = remove_from_script) %>%
          jsonlite::parse_json()
      }
    }
  )

  if (is.null(match) == FALSE) {
    matched_pre <- purrr::map_chr(
      .x = script_l,
      .f = function(x) {
        x %>%
          purrr::pluck(names(match))
      }
    )

    match_index_v <- which(matched_pre == match)

    if (length(match_index_v) == 0) {
      return(NA_character_)
    } else {
      script_l <- script_l[[match_index_v]]
    }
  }

  if (is.null(accessors) == FALSE) {
    script_l %>%
      purrr::pluck(!!!accessors)
  } else {
    script_l
  }
}
giocomai/castarter documentation built on May 4, 2024, 1:14 a.m.