#' Extract list of urls, titles, n_push from a board
#'
#' @param board_name String of PTT board name.
#' @param max_post Maximun number of the lastest posts. Default 1000.
#' If set to \code{-1}, will get all of listpage_urls. [Caucious!].
#' @param parallel If \code{TRUE}, the default, will use parallel threads.
#' @param mc.cores Number of parallel cores to use. Default \code{NULL} to auto detect.
#'
#' @examples
#' get_posts_list("Gossiping", max_post = 100)
#' get_post_urls("Gossiping", max_post = 100)
#' get_titles("Gossiping", max_post = 100)
#'
#' @export
get_posts_list <- function(board_name, max_post = 1000L,
parallel = TRUE, mc.cores = NULL, ...) {
## Setting # of parallel cores
if (parallel) {
if (is.null(mc.cores)) {
mc.cores <- ceiling(parallel::detectCores() *3/4)
} else if (!is.numeric(mc.cores) || mc.cores < 1) {
mc.cores <- 1L
} else {
mc.cores <- as.integer(mc.cores)
}
} else {
mc.cores <- 1L
}
message(sprintf("Getting urls using %s thread(s)...", mc.cores), appendLF = FALSE)
listpage_urls <- get_url_listpage(board_name)
out <- get_post_url(listpage_urls, max_post=max_post, mc.cores=mc.cores)
message(sprintf("(Got %s)", length(out[[1]])))
tryCatch(
setDT(out),
error = function(e) {
sapply(out, length, USE.NAMES = TRUE) %>% print
stop(e)
}
)
out
}
#' @export
#' @describeIn get_posts_list Get urls of posts in a board.
get_post_urls <- function(board_name, max_post = 1000L, parallel = TRUE, mc.cores = NULL, ...) {
get_posts_list(board_name,
max_post=max_post,
parallel=parallel,
mc.cores=mc.cores)[["post_urls"]]
}
#' @export
#' @describeIn get_posts_list Get post titles of posts in a board.
get_titles <- function(board_name, max_post = 1000L, parallel = TRUE, mc.cores = NULL, ...) {
get_posts_list(board_name,
max_post=max_post,
parallel=parallel,
mc.cores=mc.cores)[["title"]]
}
get_post_url = function(listpage_urls, board_name, max_post = 1000L, mc.cores, ...) {
# board_name = "Gossiping"
# listpage_urls = get_url_listpage("Gossiping")[1:5]
if (!is.numeric(max_post)) {
stop("'max_post' must be integer")
} else {
max_post <- as.integer(max_post)
}
if (max_post < 0) {
urls <- listpage_urls
} else {
url_front <- stringr::str_match(listpage_urls[[1]], "^(.*/)[^/]+.html$")[,2]
page_num <- stringr::str_match(listpage_urls, "index(\\d+)\\.html$")[,2]
urls <- page_num %>%
as.integer %>%
sort(decreasing = TRUE, na.last = F) %>%
`[<-`(which(is.na(.)), "") %>%
head(max_post/20 + 1) %>% # 20 posts per page
paste0(url_front, "index", ., ".html")
}
## Setting arallel cores
cl <- parallel::makeCluster(mc.cores)
on.exit(parallel::stopCluster(cl))
parallel::clusterExport(cl, c("urls"), envir = environment())
parallel::clusterExport(cl, "get_post_url_", envir = parent.env(environment()))
# parallel::clusterEvalQ(cl, library(data.table))
res <- parallel::parLapply(cl, urls, get_post_url_)
post_urls <- res %>%
lapply(function(x) x[["post_urls"]]) %>%
unlist(use.names = FALSE)
n_push <- res %>%
lapply(function(x) x[["n_push"]]) %>%
unlist(use.names = FALSE)
title <- res %>%
lapply(function(x) x[["title"]]) %>%
unlist(use.names = FALSE)
author <- res %>%
lapply(function(x) x[["author"]]) %>%
unlist(use.names = FALSE)
post_id <- res %>%
lapply(function(x) x[["post_id"]]) %>%
unlist(use.names = FALSE)
if (max_post > 0) {
post_urls <- post_urls %>% head(max_post)
n_push <- n_push %>% head(max_post)
title <- title %>% head(max_post)
author <- author %>% head(max_post)
post_id <- post_id %>% head(max_post)
}
# function output: post_urls
list(post_urls = post_urls, n_push = n_push, title = title, author = author,
post_id = post_id)
}
# single url
get_post_url_ <- function(listpage_url) {
# listpage_url = "https://www.ptt.cc/bbs/Gossiping/index18285.html"
# listpage_url = "https://www.ptt.cc/bbs/Gossiping/index.html"
res <- httr::GET(listpage_url, httr::set_cookies(over18 = 1))
node <- httr::content(res, encoding = "utf8")
post_urls <- node %>%
rvest::html_nodes(".title a") %>%
rvest::html_attr("href") %>%
sprintf("https://www.ptt.cc%s", .) %>%
rev
n_push <- node %>%
rvest::html_nodes(".nrec") %>%
rvest::html_text() %>%
rev
title <- node %>%
rvest::html_nodes(".title a") %>%
rvest::html_text() %>%
stringr::str_trim() %>%
rev
author <- node %>%
rvest::html_nodes(".author") %>%
rvest::html_text() %>%
rev
post_id <- str_match(post_urls, "([^/]+)\\.html")[,2] %>% str_trim()
keep <- which(author != "-")
author <- author[keep]
n_push <- n_push[keep]
if (!identical(length(post_urls), length(n_push))) {
warning(sprintf("length of post_urls and n_push are not the same in %s", listpage_url))
}
list(post_urls = post_urls, n_push = n_push, title = title, author = author,
post_id = post_id)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.