R/crux.R

Defines functions tbl_crux crux_urls .parse_crux_url .remove_na

Documented in crux_urls tbl_crux

.remove_na <-
  function(data) {
    janitor::remove_empty(data, which = "cols")
  }

.parse_crux_url <-
  function(url = "http://www.innovatorhealth.com",
           return_message = T,
           ...) {
    df_summary <- crux::classify_url(x = url, ...)
    df_info  <- as_tibble(crux::summarise_url(x = url))
    data <-
      df_info %>%
      left_join(df_summary, by = "url") %>%
      .remove_na()

    if (return_message) {
      glue::glue("Parsed {url}") %>% message()
    }
    data
  }


#' CRUX vector of urls
#'
#' @param urls vector of company urls
#' @param return_message if \code{TRUE} returns a message
#' @param url_column name of column output
#' @param case if not \code{NULL} `upper` coverts to upper or `lower` converts to lower
#' @param snake_names if \code{TRUE} returns snake names
#' @param unknown_icon_url if not `NULL` a link to override missing icon's
#' @param ... other parameters
#'
#' @return \code{tibble()}
#' @export
#'
#' @examples
#' crux_urls("http://www.pwcommunications.com/")
crux_urls <-
  function(urls = c(
    "http://www.fgcplasma.com",
    "http://www.dynepic.com",
    "http://re3d.org",
    "http://www.nearspacelaunch.com",
    "http://www.botfactory.co"
  )
  ,
  url_column = "urlCompany",
  case = NULL,
  snake_names = F,
  unknown_icon_url = "https://static.thenounproject.com/png/739239-200.png",
  return_message = T,
  ...) {
    if (length(urls) == 0) {
      stop("Enter URLs")
    }
    .parse_crux_url_safe <-
      purrr::possibly(.parse_crux_url, tibble())

    all_data <-
      urls %>%
      map_dfr(function(url) {
        .parse_crux_url_safe(url = url, return_message)
      })

    text_cols <-
      all_data %>% select_if(is.character) %>%
      select(-matches("url|theme|site_name")) %>% names()

    all_data <-
      all_data %>%
      mutate_at(text_cols,
                list(function(x) {
                  x %>%  str_squish() %>% stri_enc_tonative()
                }))

    if (length(case) > 0) {
      if (case %>% str_to_lower() %>% str_detect("upper")) {
        all_data <-
          all_data %>%
          mutate_at(text_cols,
                    list(function(x) {
                      x %>%  str_to_upper()
                    }))
      } else {
        all_data <-
          all_data %>%
          mutate_at(text_cols,
                    list(function(x) {
                      x %>%  str_to_lower()
                    }))
      }
    }

    df_text <-
      all_data %>%
      select(url, one_of(text_cols)) %>%
      gather(column, text, -url, na.rm = T) %>%
      group_by(url) %>%
      summarise(descriptionSiteText = str_c(text, collapse = " ") %>% str_squish() %>% stri_enc_tonative()) %>%
      ungroup()

    all_data <-
      all_data %>%
      left_join(df_text, by = "url") %>%
      select(url, descriptionSiteText, everything())

    if (length(url_column) > 0) {
      all_data <- all_data %>%
        rename(!!sym(url_column) := url)
    }

    actual_names <- names(all_data)

    actual_names <- case_when(
      actual_names %>% str_detect("url$") ~
        str_c("url_", actual_names %>% str_remove_all("_url")),
      actual_names %>% str_detect("name$") ~
        str_c("name_", actual_names %>% str_remove_all("_name")),
      actual_names == "reading_time" ~  "count_reading_time",
      TRUE ~ actual_names
    )

    all_data <-
      all_data %>%
      setNames(actual_names)

    if (all_data %>% hasName("url_favicon") && all_data %>% has_name("url_favicon") && length(unknown_icon_url) > 0) {
      all_data <- all_data %>%
        mutate(
          url_photo = case_when(
            is.na(url_image) & !is.na(url_favicon) ~ url_favicon,
            !is.na(url_image) & !is.na(url_favicon) ~ url_favicon,
            is.na(url_favicon) & !is.na(url_image) ~ url_image,
            !is.na(url_image) &
              !is.na(url_favicon) &
              url_favicon %>% str_detect(".ico") ~ url_image,
            TRUE ~ unknown_icon_url
          )
        )
    }

    if (all_data %>% hasName("url_favicon") && length(unknown_icon_url) > 0) {
      all_data <-
        all_data %>%
        mutate_at("url_favicon", list(function(x) {
          case_when(is.na(x) ~ unknown_icon_url,
                    TRUE  ~ x)
        }))
    }



    if (snake_names) {
      all_data <-
        all_data %>%
        clean_names()
    }
    all_data
  }


#' Crux URLs from a tibble
#'
#' @param urls vector of company urls
#' @param return_message if \code{TRUE} returns a message
#' @param url_column name of column output
#' @param case if not \code{NULL} `upper` coverts to upper or `lower` converts to lower
#' @param snake_names if \code{TRUE} returns snake names
#' @param unknown_icon_url if not `NULL` a link to override missing icon's
#' @param ... other parameters
#'
#' @return \code{tibble()}
#' @export
#'
#' @examples
tbl_crux <-
  function(data,
           url_column ="url_company",
           snake_names = F,
           unknown_icon_url = "https://static.thenounproject.com/png/739239-200.png",
           case = NULL,
           return_message = T) {
    urls <- data %>%
      filter(!is.na(!!(sym(url_column)))) %>%
      select(one_of(url_column)) %>%
      pull()

    df_crux <-
      crux_urls(
        urls = urls,
        url_column = url_column,
        return_message = return_message,
        case = case,
        snake_names = snake_names,
        unknown_icon_url = unknown_icon_url
      )

    data <-
      data %>%
      left_join(df_crux, by = url_column)

    data
  }
abresler/govtrackR documentation built on July 11, 2020, 12:30 a.m.