#' Linked Sites
#'
#' Crawl a website, building a site map and reporting all internal and external
#' links found.
#'
#' The \code{max_depth} argument controls how many links are followed. The root
#' url is level 0 and all the hrefs found on that page are level 1. Each href
#' found on a level 1 page is labeled level 2. This labeling and processing of
#' pages continues through level \code{max_depth}. You can think of
#' \code{max_depth} as the number of mouse clicks a human would need in a
#' graphical web browser to navigate from the root url to the noted url or file.
#'
#' @param x The root url as a character string, or an html session.
#' @param delay Number of seconds to delay between http requests.
#' @param max_depth Starting with the root url (level 0), follow links up to \code{max_depth} "clicks".
#' @param excludesites A character vector of regular expressions identifying sites to exclude from the crawl (default is "none").
#' @param time_out Time, in seconds, to wait for a site to respond before timing out.
#' @param keep_internal A boolean indicating whether to keep all internal pages of the site.
#' @param keep_subpages A boolean indicating whether to keep all subpages of the site.
#' @param ... additional arguments (not yet used)
#'
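#' @return A list with elements \code{nodes} and \code{edges} (data frames
#' describing the site map), \code{depth} (the depth at which the crawl
#' stopped), and \code{build_history} (the full crawl record). If the root url
#' cannot be reached, a minimal \code{sna_linked_urls} object containing only
#' the root node is returned.
#'
#' @examples
#' \dontrun{
#' # A minimal sketch of a crawl; the url below is illustrative only and the
#' # result depends on the site being reachable at run time.
#' crawl <- linked_urls("https://www.example.org", max_depth = 2, delay = 0.5)
#' crawl$nodes
#' crawl$edges
#' }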
#' @export
linked_urls <- function(x, delay = 0.2, max_depth = 5, excludesites="none", time_out=10, keep_internal=FALSE, keep_subpages=FALSE, ...) {
UseMethod("linked_urls")
}
#' @export
linked_urls.character <- function(x, delay = 0.2, max_depth = 5, excludesites="none", time_out=10, keep_internal=FALSE, keep_subpages=FALSE, ...) {
library("dplyr"); library("magrittr")
# if the url has no "www" prefix and fewer than three dot-separated parts,
# prepend "www." after the protocol
if( grepl("\\.",x) && !grepl("www",x) && length(unlist(strsplit(x,"\\.")))<3 ){
x <- gsub("://","://www.",x)
}
cl <- as.list(match.call())[-1]
cl$x <- try( rvest::html_session(x,httr::timeout(time_out)), silent = TRUE )
if( inherits(cl$x,"try-error") ) {
out <-
list(nodes = data.frame(url=x,
rooturl=gsub("http://|https://","",x),
name= gsub("http://|https://","",x),
is_root="TRUE",
specification=paste0('{"is_root":"TRUE","url":"',x,'","name":"',x,'"}')
),
edges = NULL,
message = "Success", # Hold over form older version...
is_blocked = FALSE) # Hold over form older version...
class(out) <- c("sna_linked_urls", "sna_urls")
return(out)
}else{
do.call(linked_urls, cl)
}
}
#' @export
linked_urls.session <- function(x, delay = 0.2, max_depth = 5, excludesites="none", time_out=10, keep_internal=FALSE, keep_subpages=FALSE, ...) {
# check that max_depth is integer valued and at least 1
library("dplyr"); library("magrittr")
max_depth <- floor(max_depth)
if (max_depth < 1L) {
warning("max_depth is being set to 1.", call. = FALSE, immediate. = TRUE)
max_depth <- 1L
}
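# derive the root url (without a trailing slash) and its domain, and build the
# regex of sites and file types to exclude from the crawl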
root_url <- gsub("/$","",x$url)
root_domain <- urltools::domain(root_url)
omit_regex <- paste0(paste(excludesites,collapse="|"),"|^mailto|pdf$|jpg$|png$|ppt$|pptx$|xls$|xlsx$|doc$|docx$|mp4$|mov$|avi$|flv$|wmv$")
# omit_regex <- paste0(paste(excludesites,collapse="|"),"|",gsub("www.","",root_domain),"|^mailto|pdf$|jpg$|png$|ppt$|pptx$|xls$|xlsx$|doc$|docx$")
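# visit the root url (depth 0), flag internal links and subpages, and harvest
# the hrefs found on the root page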
tictoc::tic()
all_urls <- visit_url(root_url,time_out=time_out)
all_urls$internal <- grepl(root_domain, urltools::domain(all_urls$url))
all_urls$subpage <- try(grepl(root_url, all_urls$url) & (root_url != all_urls$url))
#if(inherits(all_urls$subpage,"try-error")) browser()
# all_urls$internal <- grepl(root_url, urltools::domain(all_urls$url))
all_urls <- all_urls %>%
dplyr::mutate(hrefs = list(get_hrefs(url,omit_regex=omit_regex ))) %>%
dplyr::mutate(n_returns = unlist(lapply(hrefs,nrow)) )
scrape_time <- tictoc::toc()
all_urls$scrape_time <- scrape_time$toc-scrape_time$tic
all_urls$depth <- 0L
current_depth <- 1L
cat("max_depth:", max_depth," root_url: ",root_url," root_url:" ,root_url,"\n")
while (current_depth <= max_depth) {
tictoc::tic()
message(sprintf("current_depth: %d", current_depth))
links_to_visit <-
all_urls %>%
# dplyr::filter((root_url==.data$url)) %>%
dplyr::filter(grepl("text",.data$type)) %>%
dplyr::filter(.data$depth == current_depth - 1L) %>%
magrittr::extract2("hrefs") %>%
dplyr::bind_rows(.) %>%
dplyr::distinct(.) %>%
dplyr::filter(!((root_domain == gsub("http://|https://","",.data$url) )) )
# dplyr::filter(!parent) %>%
# dplyr::filter(!sibling)
message(sprintf("%d urls to visit", nrow(links_to_visit)))
if (nrow(links_to_visit) > 0) {
links_to_visit %<>% dplyr::anti_join(., all_urls, by = "url")
} else {
break
}
if (nrow(links_to_visit) == 0) {
break
}
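# visit every url in the frontier, pausing `delay` seconds between requests;
# child pages that return status 200 have their hrefs harvested for the next
# depth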
v_urls <-
pbapply::pblapply(seq_along(links_to_visit$url),
function(i) { # i = 1
tictoc::tic()
u <- links_to_visit$url[i]
r <- links_to_visit$relative[i]
c <- links_to_visit$child[i]
cat("\n ---",u)
v <- try(visit_url(u, time_out=time_out), silent = TRUE)
if (inherits(v, "try-error")) {
# unreachable url: record a minimal placeholder row so the crawl can continue
v <- dplyr::tibble(url = u)
}
if (c && isTRUE(v$status == 200)) {
v$hrefs <- list(snaWeb::get_hrefs(attr(v, "session"), omit_regex = omit_regex))
scrape_time <- tictoc::toc()
v$scrape_time <- scrape_time$toc-scrape_time$tic
v$depth <- current_depth
v$n_returns <- unlist(lapply(v$hrefs,nrow))
cat(" success")
}else{
tictoc::toc(quiet = TRUE) # keep the tic/toc stack balanced
v$scrape_time <- NA
v$depth <- current_depth
v$n_returns <- 0
}
cat(" \n")
# if( inherits(v, "try-error") ){
# if (r && v$status == 200) {
# v$hrefs <- list(snaWeb::get_hrefs(attr(v, "session"), omit_regex = omit_regex))
# cat(" success\n")
# }else{
# cat(" fail\n")
# }
# }else{
# cat(" fail\n")
# }
Sys.sleep(delay)
v
}) %>%
dplyr::bind_rows()
if (nrow(v_urls)) {
all_urls %<>%
dplyr::bind_rows(.,
dplyr::mutate(v_urls,
internal = grepl(root_domain, urltools::domain(.data$url)),
subpage = grepl(root_url, .data$url) & (root_url != .data$url)
# depth = .data$depth,
# scrape_time = .data$
# n_returns = length(.data$hrefs)
)
# n_returns = ifelse(is.na(nrow(v_urls)),0,nrow(v_urls)))
)
}
current_depth <- current_depth + 1L
}
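# assemble the final site table: keep the root page, drop internal pages and
# subpages unless requested, and remove 404s, duplicated urls, non-text
# responses, and excluded sites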
linked_sites <- all_urls %>%
dplyr::mutate(id=seq_along(.data$url)+(.data$depth*10)) %>%
dplyr::mutate(rooturl=urltools::domain(url)) %>%
# dplyr::mutate(tictoc_log=unlist(tictoc_log)) %>%
dplyr::filter((!internal | keep_internal | depth==0) &
(!subpage | (keep_subpages & keep_internal)) &
(status != "404" ) &
# (status == "200" | (!is.na(title) & title!=rooturl)) &
!duplicated(url) &
# !duplicated(rooturl) &
grepl("text",type) &
!grepl(paste0(excludesites,collapse="|"),rooturl))
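# node table for the site-map graph: one row per retained page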
nodes <-
dplyr::tibble(
id = linked_sites$id,
url = linked_sites$url,
rooturl = linked_sites$rooturl,
name = linked_sites$title,
is_root = linked_sites$depth==0
)
# nodes <-
# dplyr::data_frame(
# id = c(1,linked_sites$id),
# url = c(root_url, linked_sites$url),
# # url = c(x$url, linked_sites$url),
# rooturl = c(root_domain,linked_sites$rooturl),
# name = c(snaWeb::get_page_title(root_url,timeout=time_out),linked_sites$title),
# # name = c(snaWeb::get_page_title(x$url,timeout=time_out),linked_sites$title),
# is_root = c("TRUE", rep("FALSE", nrow(linked_sites)))
# )
# %>% dplyr::mutate(id = seq_along(.data$url))
#if(inherits(try(nchar(nodes$name)),"try-error")) browser()
# note: non-ASCII characters in names could be stripped here, e.g. with gsub("[^ -~]", "", nodes$name)
nodes$name[is.na(nodes$name)] <- nodes$rooturl[is.na(nodes$name)]
if( !inherits(try(nchar(nodes$name)),"try-error") ){
nodes$name[nchar(nodes$name)==0] <- nodes$rooturl[nchar(nodes$name)==0]
}
nodes$name <- gsub("http://|https://","",nodes$name)
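# serialise the node attributes as a small JSON string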
nodes$specification <-
with(nodes, sprintf('{"is_root":"%s","url":"%s","name":"%s"}', is_root, url, name))
# build the data frame to define the edges of the social network graph.
# edges <-
# dplyr::data_frame(name_from = x$url,
# name_to = linked_sites$url,
# id = seq_along(.data$name_to)
# ) %>%
# dplyr::left_join(.,
# dplyr::select(nodes, .data$url, node_to = .data$id),
# by = c("name_to" = "url")) %>%
# dplyr::left_join(.,
# dplyr::select(nodes, .data$url, node_from = .data$id),
# by = c("name_from" = "url")) %>%
# dplyr::mutate(rank = .data$node_to - .data$node_from)
#
# edges$predicate <-
# with(edges, sprintf('{"predicate":"is related","rank":"%d"}', rank))
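# edge table: a star graph linking the root node to every other retained node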
if( length(linked_sites$rooturl) > 0 & nrow(nodes) > 1 ) {
edges <-
dplyr::tibble(name_from = rep(nodes$url[nodes$is_root],nrow(linked_sites)-1),
name_to = nodes$url[!nodes$is_root],
node_from = rep(nodes$id[nodes$is_root],nrow(linked_sites)-1),
node_to = nodes$id[!nodes$is_root]
)
# dplyr::data_frame(name_from = root_url,
# name_to = linked_sites$url)
# dplyr::data_frame(name_from = root_domain,
# name_to = linked_sites$rooturl)
edges$predicate <-
with(edges, sprintf('{"predicate":"is linked","rank":"NA"}'))
}else{
edges <- NULL
}
# return object
out <-
list(nodes = nodes,
edges = edges,
# message = "Success", # Hold over form older version...
# is_blocked = FALSE, # Hold over form older version...
depth = current_depth,
build_history = all_urls
)
# class(out) <- c("sna_linked_urls", "sna_urls")
return(out)
}