Nothing
#' Prepare cache for given tv trope urls
#'
#' @param urls url of tv trope pages
#' @param depth a size of tree-depth of trope urls. If you set it as 1, then
#' it will only fetch data of \code{urls}, and for 2, it will fetch both
#' \code{urls}, and their children urls.
#' @param trope_cache_dir a directory for trope data caching
#' @param redirect_to_cache_dir a directory for redirect urls caching
#' @param sleep wait time between queries
#' @param filter_pattern a pattern to filter tv trope urls
#' @param verbose verbosity option
#' @return \code{data.frame} which contains statistics how it constructs cache
#' @export
#' @examples
#' library(tropr)
#'
#' .urls <- c("http://tvtropes.org/pmwiki/pmwiki.php/Main/SenseiChan")
#' \dontrun{
#' res <- trope_cache(.urls)
#' }
trope_cache <- function(urls,
depth = 1,
trope_cache_dir = tempdir(),
redirect_to_cache_dir = tempdir(),
sleep = .5,
filter_pattern = NULL,
verbose = T) {
stopifnot(length(urls) > 0)
stopifnot(depth > 0)
stopifnot(sleep > 0)
stopifnot(dir.exists(trope_cache_dir))
stopifnot(dir.exists(redirect_to_cache_dir))
urls <- sort(unique(urls))
if (!is.null(filter_pattern)) {
indices <- grepl(filter_pattern, urls)
urls <- urls[indices]
}
ret <- data.frame(depth = as.numeric(),
number_of_urls = as.numeric(),
number_of_links = as.numeric(),
stringsAsFactors = F)
while(depth > 0) {
if (verbose) {
cat("depth:", depth, "..............................................\n")
}
if (verbose) {
cat("* check redirects...\n")
}
res <- trope_redirect_to(urls,
cache_dir = redirect_to_cache_dir,
sleep = sleep,
verbose = verbose)
urls_to_process <- sort(unique(res$redirect_to))
if (!is.null(filter_pattern)) {
indices <- grepl(filter_pattern, urls_to_process)
urls_to_process <- urls_to_process[indices]
}
if (verbose) {
cat("* download tropes...\n")
}
res <- trope_data(urls_to_process,
cache_dir = trope_cache_dir,
sleep = sleep,
verbose = verbose)
urls <- sort(unique(res$link))
if (!is.null(filter_pattern)) {
indices <- grepl(filter_pattern, urls)
urls <- urls[indices]
}
if (verbose) {
cat("* processed tropes:", length(urls_to_process),"\n")
cat("* tropes to be processed:", length(urls), "\n")
}
ret <- rbind(ret, data.frame(depth = depth,
number_of_urls = length(urls_to_process),
number_of_links = length(urls)))
depth <- depth - 1
}
if (verbose) {
cat("Successfully prepared cache\n")
}
ret
}
#' Fetch trope data
#'
#' @param trope_urls url of tv trope pages
#' @param cache_dir a directory for data caching
#' @param sleep wait time between queries
#' @param verbose verbosity option
#' @return \code{data.frame} which contains trope data
#' @importFrom digest digest
#' @importFrom magrittr %>%
#' @importFrom utils write.csv2
#' @export
#' @examples
#' library(tropr)
#'
#' .urls <- c("http://tvtropes.org/pmwiki/pmwiki.php/Main/SenseiChan",
#' "http://tvtropes.org/pmwiki/pmwiki.php/Main/YouAreBetterThanYouThinkYouAre")
#' \dontrun{
#' res <- trope_data(.urls)
#' }
trope_data <- function(trope_urls,
cache_dir = tempdir(),
sleep = .5,
verbose = F) {
stopifnot(dir.exists(cache_dir))
# Save trope urls into cache folder first
for (i in 1:length(trope_urls)) {
trope_url <- trope_urls[i]
if (verbose) {
cat(toString(Sys.time()),
" | ", i, "/", length(trope_urls), " | ",
trope_url, "... ", sep = "")
}
tryCatch({
key <- digest::digest(trope_url)
file_path <- file.path(cache_dir, paste0(key, ".csv"))
if (file.exists(file_path)) {
if (verbose) {
cat("pass\n-", trope_url, "exists\n")
}
next
}
content <- trope_content(trope_url)
res <- as.data.frame(content)
if (!is.null(res)) {
write.csv2(res,
file = file_path,
row.names = F)
}
}, error = function(x) {
if (verbose) {
cat("fail\n-")
}
print(x)
})
system(paste("sleep", sleep))
if (verbose) {
cat("done\n")
}
}
# Save trope urls into cache folder first
csv_files <- file.path(cache_dir,
paste0(lapply(trope_urls, digest::digest) %>% unlist,
".csv"))
do.call(rbind,
lapply(csv_files,
function(csv_file) {
target_data <- read.csv2(csv_file, stringsAsFactors = F)
if (nrow(target_data) > 0) {
target_data
} else {
NULL
}}))
}
#' Get the redirected urls of given trope urls
#'
#' @param trope_urls url of tv trope pages
#' @param cache_dir a directory for data caching
#' @param sleep wait time between queries
#' @param verbose verbosity option
#' @return \code{data.frame} which contains the redirected urls of trope urls
#' @importFrom digest digest
#' @importFrom httr GET
#' @importFrom magrittr %>%
#' @importFrom utils write.csv2 read.csv2
#' @export
#' @examples
#' library(tropr)
#'
#' .urls <- c("http://tvtropes.org/pmwiki/pmwiki.php/Main/SenseiChan",
#' "http://tvtropes.org/pmwiki/pmwiki.php/Main/YouAreBetterThanYouThinkYouAre")
#' \dontrun{
#' res <- trope_redirect_to(.urls)
#' }
trope_redirect_to <- function(trope_urls,
cache_dir = tempdir(),
sleep = .5,
verbose = F) {
for (i in 1:length(trope_urls)) {
trope_url <- trope_urls[i]
if (verbose) {
cat(toString(Sys.time()),
" | ", i, "/", length(trope_urls), " | ",
trope_url, "... ", sep = "")
}
tryCatch({
key <- digest::digest(trope_url)
file_path <- file.path(cache_dir, paste0(key, "_redirect_to.csv"))
if (file.exists(file_path)) {
if (verbose) {
cat("pass\n-", trope_url, "exists\n")
}
next
}
resp <- httr::GET(trope_url)
redirect_to <- gsub("\\?from..*","", resp$url)
res <- data.frame(link = trope_url,
redirect_to = redirect_to,
redirected = ifelse(trope_url == redirect_to, F, T))
if (!is.null(res)) {
write.csv2(res,
file = file_path,
row.names = F)
}
}, error = function(x) {
if (verbose) {
cat("fail\n-")
}
print(x)
})
system(paste("sleep", sleep))
if (verbose) {
cat("done\n")
}
}
# Save trope urls into cache folder first
csv_files <- file.path(cache_dir,
paste0(lapply(trope_urls, digest::digest) %>% unlist,
"_redirect_to.csv"))
do.call(rbind,
lapply(csv_files,
function(csv_file) {
target_data <- read.csv2(csv_file, stringsAsFactors = F)
if (nrow(target_data) > 0) {
target_data
} else {
NULL
}}))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.