.remove_na <-
function(data) {
janitor::remove_empty(data, which = "cols")
}
.parse_crux_url <-
function(url = "http://www.innovatorhealth.com",
return_message = T,
...) {
df_summary <- crux::classify_url(x = url, ...)
df_info <- as_tibble(crux::summarise_url(x = url))
data <-
df_info %>%
left_join(df_summary, by = "url") %>%
.remove_na()
if (return_message) {
glue::glue("Parsed {url}") %>% message()
}
data
}
#' CRUX vector of urls
#'
#' @param urls vector of company urls
#' @param return_message if \code{TRUE} returns a message
#' @param url_column name of column output
#' @param case if not \code{NULL} `upper` coverts to upper or `lower` converts to lower
#' @param snake_names if \code{TRUE} returns snake names
#' @param unknown_icon_url if not `NULL` a link to override missing icon's
#' @param ... other parameters
#'
#' @return \code{tibble()}
#' @export
#'
#' @examples
#' crux_urls("http://www.pwcommunications.com/")
crux_urls <-
function(urls = c(
"http://www.fgcplasma.com",
"http://www.dynepic.com",
"http://re3d.org",
"http://www.nearspacelaunch.com",
"http://www.botfactory.co"
)
,
url_column = "urlCompany",
case = NULL,
snake_names = F,
unknown_icon_url = "https://static.thenounproject.com/png/739239-200.png",
return_message = T,
...) {
if (length(urls) == 0) {
stop("Enter URLs")
}
.parse_crux_url_safe <-
purrr::possibly(.parse_crux_url, tibble())
all_data <-
urls %>%
map_dfr(function(url) {
.parse_crux_url_safe(url = url, return_message)
})
text_cols <-
all_data %>% select_if(is.character) %>%
select(-matches("url|theme|site_name")) %>% names()
all_data <-
all_data %>%
mutate_at(text_cols,
list(function(x) {
x %>% str_squish() %>% stri_enc_tonative()
}))
if (length(case) > 0) {
if (case %>% str_to_lower() %>% str_detect("upper")) {
all_data <-
all_data %>%
mutate_at(text_cols,
list(function(x) {
x %>% str_to_upper()
}))
} else {
all_data <-
all_data %>%
mutate_at(text_cols,
list(function(x) {
x %>% str_to_lower()
}))
}
}
df_text <-
all_data %>%
select(url, one_of(text_cols)) %>%
gather(column, text, -url, na.rm = T) %>%
group_by(url) %>%
summarise(descriptionSiteText = str_c(text, collapse = " ") %>% str_squish() %>% stri_enc_tonative()) %>%
ungroup()
all_data <-
all_data %>%
left_join(df_text, by = "url") %>%
select(url, descriptionSiteText, everything())
if (length(url_column) > 0) {
all_data <- all_data %>%
rename(!!sym(url_column) := url)
}
actual_names <- names(all_data)
actual_names <- case_when(
actual_names %>% str_detect("url$") ~
str_c("url_", actual_names %>% str_remove_all("_url")),
actual_names %>% str_detect("name$") ~
str_c("name_", actual_names %>% str_remove_all("_name")),
actual_names == "reading_time" ~ "count_reading_time",
TRUE ~ actual_names
)
all_data <-
all_data %>%
setNames(actual_names)
if (all_data %>% hasName("url_favicon") && all_data %>% has_name("url_favicon") && length(unknown_icon_url) > 0) {
all_data <- all_data %>%
mutate(
url_photo = case_when(
is.na(url_image) & !is.na(url_favicon) ~ url_favicon,
!is.na(url_image) & !is.na(url_favicon) ~ url_favicon,
is.na(url_favicon) & !is.na(url_image) ~ url_image,
!is.na(url_image) &
!is.na(url_favicon) &
url_favicon %>% str_detect(".ico") ~ url_image,
TRUE ~ unknown_icon_url
)
)
}
if (all_data %>% hasName("url_favicon") && length(unknown_icon_url) > 0) {
all_data <-
all_data %>%
mutate_at("url_favicon", list(function(x) {
case_when(is.na(x) ~ unknown_icon_url,
TRUE ~ x)
}))
}
if (snake_names) {
all_data <-
all_data %>%
clean_names()
}
all_data
}
#' Crux URLs from a tibble
#'
#' @param urls vector of company urls
#' @param return_message if \code{TRUE} returns a message
#' @param url_column name of column output
#' @param case if not \code{NULL} `upper` coverts to upper or `lower` converts to lower
#' @param snake_names if \code{TRUE} returns snake names
#' @param unknown_icon_url if not `NULL` a link to override missing icon's
#' @param ... other parameters
#'
#' @return \code{tibble()}
#' @export
#'
#' @examples
tbl_crux <-
function(data,
url_column ="url_company",
snake_names = F,
unknown_icon_url = "https://static.thenounproject.com/png/739239-200.png",
case = NULL,
return_message = T) {
urls <- data %>%
filter(!is.na(!!(sym(url_column)))) %>%
select(one_of(url_column)) %>%
pull()
df_crux <-
crux_urls(
urls = urls,
url_column = url_column,
return_message = return_message,
case = case,
snake_names = snake_names,
unknown_icon_url = unknown_icon_url
)
data <-
data %>%
left_join(df_crux, by = url_column)
data
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.