#' Extract content of a post from a post url
#'
#' @param post_url URL string of a post.
#' @examples
#' get_post_content("https://www.ptt.cc/bbs/Gossiping/M.1467117389.A.62D.html")
#'
#' @import httr rvest data.table stringr
#' @export
get_post_content = function(post_url, max_error_time = 3, verbose = TRUE) {
# post_url <- "https://www.ptt.cc/bbs/Gossiping/M.1467117389.A.62D.html"
# post_url <- "https://www.ptt.cc/bbs/give/M.1493725282.A.75C.html"
post_url <- unlist(post_url, use.names = FALSE)
if (! length(post_url) == 1) {
post_url <- post_url[1]
warning("only use first url\n[", post_url[[1]], "]")
}
res <- GET(post_url, set_cookies(over18 = 1))
error_time = 1
while (error_time <= max_error_time && http_error(res)) {
Sys.sleep(0.5)
if (verbose) {
message(sprintf("Connection fail (try %s) [%s]: %s",
error_time ,
status_code(res),
post_url))
}
res <- GET(post_url, set_cookies(over18 = 1))
error_time = error_time + 1
if (error_time > max_error_time) {
return(NULL)
}
}
node <- content(res, encoding = "UTF-8")
post_data <- list()
# post_data$board = node %>%
# rvest::html_nodes(".article-metaline-right > .article-meta-value") %>%
# rvest::html_text()
post_data$board <- post_url %>%
str_match("https?://www.ptt.cc/bbs/([^/]+)") %>%
.[,2]
metaTemp <- node %>%
rvest::html_nodes(".article-metaline > .article-meta-value") %>%
rvest::html_text()
post_data$author <- metaTemp[1] %>% str_trim()
post_data$author_ip = node %>%
html_text() %>%
str_match_all('(?:From|來自|編輯):\\s(?:\\w+\\s)?[(]?([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3})') %>%
.[[1]] %>% .[,2] %>%
tail(1) # select last IP in case of editing
post_data$title <- metaTemp[2] %>% str_trim()
post_time <- metaTemp[3] %>% str_trim()
post_data$post_time <- post_time %>% strptime("%c", tz = "ROC") %>% as.POSIXct()
# if (!is.na(post_time_parsed)) {
# post_data$post_time <- post_time_parsed
# } else {
# post_data$post_time <- NA
# }
post_data$post_url <- post_url
post_data$post_id <- str_match(post_url, "([^/]+)\\.html")[,2] %>% str_trim()
post_data$post_text <- node %>%
rvest::html_nodes("*#main-content") %>%
as.character() %>%
str_replace_all('(?s)\\<div class="push"\\>.*\\<\\/div\\>', "") %>% # clear push
str_replace_all('(?s).*\\<span class="article-meta-value"\\>.*\\d{2}:\\d{2}:\\d{2}[ ]\\d{4}\\<\\/span\\>', "") %>%
xml2::read_html() %>%
html_text() %>%
str_replace('(?s)--\\n\\u203b \\u767c\\u4fe1\\u7ad9: \\u6279\\u8e22\\u8e22\\u5be6\\u696d\\u574a.*$', "") %>%
str_trim()
## replace NULL with NA for making data.frame
null_to_na <- function(x) {
x[sapply(x, is.null)] <- NA
x[sapply(x, function(x) !as.logical(length(x)))] <- NA_character_
return(x)
}
post_data <- null_to_na(post_data)
post_data %>% setDT
## push data
push_df <- data.table(push_tag = character(),
push_uid = character(),
push_text = character(),
push_ip = character(),
push_time = character())
push_nodes <- node %>% rvest::html_nodes("div.push")
if (length(push_nodes)) {
push_rows <- lapply(push_nodes,
function (x) {
push_text <- html_text(html_nodes(x, "span"))
push <- data.table(
push_tag = str_trim(push_text[1]),
push_uid = str_trim(push_text[2]),
push_text = str_replace(push_text[3], "^:", "") %>% str_trim(),
push_ip = str_extract(
str_trim(push_text[4]),
"^(?:\\d{2,3}\\.){3}\\d{2,3}"),
push_time = str_extract(
str_trim(push_text[4]),
"\\d{2}\\/\\d{2}\\s\\d{2}:\\d{2}$"))
}) %>% data.table::rbindlist()
push_df <- data.table::rbindlist(list(push_df, push_rows))
push_df$post_id <- post_data$post_id
push_df$post_url <- post_url
}
# function output:
structure(list(post_main = post_data,
push = push_df),
class = "ptt_post")
}
#' @export
print.ptt_post <- function(x, ..., max.lines = 10, width = getOption("width")) {
cat("[", x$post$post_url, "]\n\n", sep = "")
cat("$", names(x)[[1]], "\n", sep = "")
cat(" Board: ", x$post$board, "\n", sep = "")
cat(" Title: ", x$post$title, "\n", sep = "")
cat(" Author: ", x$post$author, "\n", sep = "")
cat(" Post-Time: ", format(x$post$post_time, usetz = TRUE), "\n\n", sep = "")
## print
# code modified from:
# https://github.com/hadley/httr/blob/25557c0327ef2a4cae13db017813fbd2ed86d3c0/R/response.r
size <- length(x$post_main$post_text)
if (size == 0) {
cat("<EMPTY POST TEXT>\n")
} else {
cat("<POST TEXT>\n")
breaks <- gregexpr("\n", x$post_main$post_text, fixed = TRUE)[[1]]
last_line <- breaks[min(length(breaks), max.lines)]
lines <- strsplit(substr(x$post_main$post_text, 1, last_line), "\n")[[1]]
too_wide <- nchar(lines) > width
lines[too_wide] <-
paste0(substr(lines[too_wide], 1, width - 3), "...")
cat(lines, sep = "\n")
if (max.lines < length(breaks))
cat("...\n")
}
cat("\n$", names(x)[[2]], "\n", sep = "")
size_push <- nrow(x$push)
if (size_push == 0) {
cat("<EMPTY PUSH>\n")
} else {
cat("<PUSH CONTENT>\n")
head_push <- head(x$push)[, c("push_tag", "push_uid", "push_text")]
apply(head_push, MARGIN = 1, FUN = function(x){
cat(" ", x, "\n")
invisible(NULL)
})
if (max.lines < size_push)
cat(" ...\n")
}
invisible(x)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.